diff --git a/src/public/app/widgets/llm_chat_panel.ts b/src/public/app/widgets/llm_chat_panel.ts index b88abe1ab..7ff0f8470 100644 --- a/src/public/app/widgets/llm_chat_panel.ts +++ b/src/public/app/widgets/llm_chat_panel.ts @@ -805,7 +805,7 @@ export default class LlmChatPanel extends BasicWidget { const allPrecedenceEnabled = precedenceList.every((p: string) => enabledProviders.includes(p)); // Get embedding queue status - const embeddingStats = await server.get('embeddings/stats') as { + const embeddingStats = await server.get('llm/embeddings/stats') as { success: boolean, stats: { totalNotesCount: number; diff --git a/src/services/llm/constants/formatter_constants.ts b/src/services/llm/constants/formatter_constants.ts new file mode 100644 index 000000000..8046ca24b --- /dev/null +++ b/src/services/llm/constants/formatter_constants.ts @@ -0,0 +1,159 @@ +/** + * Formatter Constants + * + * Constants related to message formatters for different LLM providers. + * This centralizes string formatting patterns, HTML cleaning options, + * and other formatter-specific constants that were previously hardcoded. 
+ */ + +/** + * HTML tag allowlists for different formatter strictness levels + */ +export const HTML_ALLOWED_TAGS = { + // Standard set used by most formatters + STANDARD: ['b', 'i', 'em', 'strong', 'a', 'p', 'br', 'ul', 'ol', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'code', 'pre'], + + // Minimal set for providers with limited HTML support + MINIMAL: ['b', 'i', 'p', 'br', 'a'], + + // Empty set for plain text only (Ollama) + NONE: [] +}; + +/** + * HTML attribute allowlists + */ +export const HTML_ALLOWED_ATTRIBUTES = { + // Standard set of allowed attributes + STANDARD: { + 'a': ['href'] + }, + + // Empty set for plain text only + NONE: {} +}; + +/** + * HTML tag transformations + */ +export const HTML_TRANSFORMS = { + // Standard transformations + STANDARD: { + 'h1': 'h2', + 'h2': 'h3', + 'div': 'p', + 'span': 'span' + } +}; + +/** + * RegEx patterns for HTML to Markdown conversion + */ +export const HTML_TO_MARKDOWN_PATTERNS = { + // Headings + HEADING_1: { pattern: /
]*>(.*?)<\/p>/gi, replacement: '$1\n\n' },
+ BREAK: { pattern: / ]*>(.*?)<\/p>/gi, '$1\n\n')
- .replace(/
]*>/gi, replacement: '\n' },
+
+ // Links and formatting
+ LINK: { pattern: /]*href=["'](.*?)["'][^>]*>(.*?)<\/a>/gi, replacement: '[$2]($1)' },
+ STRONG: { pattern: /]*>(.*?)<\/strong>/gi, replacement: '**$1**' },
+ BOLD: { pattern: /]*>(.*?)<\/b>/gi, replacement: '**$1**' },
+ EMPHASIS: { pattern: /]*>(.*?)<\/em>/gi, replacement: '*$1*' },
+ ITALIC: { pattern: /]*>(.*?)<\/i>/gi, replacement: '*$1*' },
+
+ // Code
+ INLINE_CODE: { pattern: /]*>(.*?)<\/code>/gi, replacement: '`$1`' },
+ CODE_BLOCK: { pattern: /
]*>(.*?)<\/pre>/gi, replacement: '```\n$1\n```' },
+
+ // Clean up
+ ANY_REMAINING_TAG: { pattern: /<[^>]*>/g, replacement: '' },
+ EXCESSIVE_NEWLINES: { pattern: /\n{3,}/g, replacement: '\n\n' }
+};
+
+/**
+ * HTML entity replacements: each entry pairs a global regex matching one HTML entity with the plain-text character it stands for.
+ */
+export const HTML_ENTITY_REPLACEMENTS = {
+    // Common HTML entities; patterns match the entity text itself (e.g. "&lt;"), never the already-decoded character
+    NBSP: { pattern: /&nbsp;/g, replacement: ' ' },
+    LT: { pattern: /&lt;/g, replacement: '<' },
+    GT: { pattern: /&gt;/g, replacement: '>' },
+    QUOT: { pattern: /&quot;/g, replacement: '"' },
+    APOS: { pattern: /&(?:apos|#39);/g, replacement: "'" }, // accepts both the named and the numeric spelling
+    LDQUO: { pattern: /&ldquo;/g, replacement: '"' },
+    RDQUO: { pattern: /&rdquo;/g, replacement: '"' },
+    LSQUO: { pattern: /&lsquo;/g, replacement: "'" },
+    RSQUO: { pattern: /&rsquo;/g, replacement: "'" },
+    MDASH: { pattern: /&mdash;/g, replacement: '—' },
+    NDASH: { pattern: /&ndash;/g, replacement: '–' },
+    HELLIP: { pattern: /&hellip;/g, replacement: '…' },
+    AMP: { pattern: /&amp;/g, replacement: '&' } // declared last: applied in declaration order, "&amp;quot;" stays "&quot;" instead of being decoded twice
+};
+
+/**
+ * Encoding issue fixes: one regex-based repair (BROKEN_QUOTES) plus a lookup table mapping Unicode punctuation to ASCII stand-ins.
+ */
+export const ENCODING_FIXES = {
+ // Common encoding issues: U+0393 (Γ) + U+00C2 + (U+00A3 or U+00A5) becomes '"' — presumably mojibake for curly quotes, TODO confirm
+ BROKEN_QUOTES: { pattern: /Γ\u00c2[\u00a3\u00a5]/g, replacement: '"' },
+
+ // Character replacements for Unicode: single-character keys mapped to ASCII text (the em dash and ellipsis expand to several characters)
+ UNICODE_REPLACEMENTS: {
+ '\u00A0': ' ', // Non-breaking space
+ '\u2018': "'", // Left single quote
+ '\u2019': "'", // Right single quote
+ '\u201C': '"', // Left double quote
+ '\u201D': '"', // Right double quote
+ '\u2013': '-', // En dash
+ '\u2014': '--', // Em dash (becomes two hyphens)
+ '\u2022': '*', // Bullet
+ '\u2026': '...' // Ellipsis (becomes three dots)
+ }
+};
+
+/**
+ * Ollama-specific cleaning patterns: regex/replacement pairs that turn typographic characters into ASCII, drop other non-ASCII, and tidy whitespace.
+ */
+export const OLLAMA_CLEANING = {
+    // Replace fancy quotes (U+201C/U+201D and U+2018/U+2019), written as \u escapes so the classes survive re-encoding of this file
+    QUOTES: { pattern: /[\u201C\u201D]/g, replacement: '"' },
+    APOSTROPHES: { pattern: /[\u2018\u2019]/g, replacement: "'" },
+
+    // Replace other Unicode characters: en/em dash (U+2013/U+2014), bullet (U+2022), ellipsis (U+2026)
+    DASHES: { pattern: /[\u2013\u2014]/g, replacement: '-' },
+    BULLETS: { pattern: /[\u2022]/g, replacement: '*' },
+    ELLIPSES: { pattern: /[\u2026]/g, replacement: '...' },
+
+    // Remove non-ASCII characters: deletes everything outside U+0000-U+007F, so it only makes sense after the replacements above
+    NON_ASCII: { pattern: /[^\x00-\x7F]/g, replacement: '' },
+
+    // Normalize whitespace. NOTE(review): WHITESPACE also matches newlines, so NEWLINE_WHITESPACE cannot match if applied after it — confirm order at the call site
+    WHITESPACE: { pattern: /\s+/g, replacement: ' ' },
+    NEWLINE_WHITESPACE: { pattern: /\n\s+/g, replacement: '\n' }
+};
+
+/**
+ * Console log messages for formatters: per-provider builders reporting message counts before/after formatting, plus shared error prefixes.
+ */
+export const FORMATTER_LOGS = {
+    ANTHROPIC: {
+        PROCESSED(before: number, after: number): string { return `Anthropic formatter: ${before} messages → ${after} messages`; }
+    },
+    OPENAI: {
+        PROCESSED(before: number, after: number): string { return `OpenAI formatter: ${before} messages → ${after} messages`; }
+    },
+    OLLAMA: {
+        PROCESSED(before: number, after: number): string { return `Ollama formatter processed ${before} messages into ${after} messages`; }
+    },
+    ERROR: {
+        CONTEXT_CLEANING(provider: string): string { return `Error cleaning content for ${provider}:`; }, // prefix; the caller supplies the error details
+        ENCODING: 'Error fixing encoding issues:'
+    }
+};
\ No newline at end of file
diff --git a/src/services/llm/formatters/anthropic_formatter.ts b/src/services/llm/formatters/anthropic_formatter.ts
index 39302d4bc..9a17ac135 100644
--- a/src/services/llm/formatters/anthropic_formatter.ts
+++ b/src/services/llm/formatters/anthropic_formatter.ts
@@ -3,6 +3,13 @@ import type { Message } from '../ai_interface.js';
import { BaseMessageFormatter } from './base_formatter.js';
import { PROVIDER_PROMPTS } from '../constants/llm_prompt_constants.js';
import { LLM_CONSTANTS } from '../constants/provider_constants.js';
+import {
+ HTML_ALLOWED_TAGS,
+ HTML_ALLOWED_ATTRIBUTES,
+ FORMATTER_LOGS,
+ HTML_TO_MARKDOWN_PATTERNS,
+ HTML_ENTITY_REPLACEMENTS
+} from '../constants/formatter_constants.js';
/**
* Anthropic-specific message formatter
@@ -144,7 +151,7 @@ export class AnthropicMessageFormatter extends BaseMessageFormatter {
}
}
- console.log(`Anthropic formatter: ${messages.length} messages → ${formattedMessages.length} messages`);
+ console.log(FORMATTER_LOGS.ANTHROPIC.PROCESSED(messages.length, formattedMessages.length));
return formattedMessages;
}
@@ -158,52 +165,31 @@ export class AnthropicMessageFormatter extends BaseMessageFormatter {
try {
// Convert HTML to a Claude-friendly format
const cleaned = sanitizeHtml(content, {
- allowedTags: ['b', 'i', 'em', 'strong', 'a', 'p', 'br', 'ul', 'ol', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'code', 'pre'],
- allowedAttributes: {
- 'a': ['href']
- }
+ allowedTags: HTML_ALLOWED_TAGS.STANDARD,
+ allowedAttributes: HTML_ALLOWED_ATTRIBUTES.STANDARD
});
// Convert to markdown but preserve some structure
- let markdown = cleaned
- .replace(/
]*>(.*?)<\/h1>/gi, '# $1\n')
- .replace(/
]*>(.*?)<\/h2>/gi, '## $1\n')
- .replace(/
]*>(.*?)<\/h3>/gi, '### $1\n')
- .replace(/
]*>(.*?)<\/h4>/gi, '#### $1\n')
- .replace(/
]*>(.*?)<\/h5>/gi, '##### $1\n')
- .replace(/
]*>/gi, '\n')
- .replace(/]*href=["'](.*?)["'][^>]*>(.*?)<\/a>/gi, '[$2]($1)')
- .replace(/]*>(.*?)<\/strong>/gi, '**$1**')
- .replace(/]*>(.*?)<\/b>/gi, '**$1**')
- .replace(/]*>(.*?)<\/em>/gi, '*$1*')
- .replace(/]*>(.*?)<\/i>/gi, '*$1*')
- .replace(/]*>(.*?)<\/code>/gi, '`$1`')
- .replace(/
]*>(.*?)<\/pre>/gi, '```\n$1\n```')
- // Process lists
- .replace(/
]*>(.*?)<\/ul>/gs, (match, content) => {
- return content.replace(/
]*>(.*?)<\/ol>/gs, (match, content) => {
- let index = 1;
- return content.replace(/
]*>(.*?)<\/h1>/gi, '# $1\n')
- .replace(/
]*>(.*?)<\/h2>/gi, '## $1\n')
- .replace(/
]*>(.*?)<\/h3>/gi, '### $1\n')
- .replace(/
]*>(.*?)<\/h4>/gi, '#### $1\n')
- .replace(/
]*>(.*?)<\/h5>/gi, '##### $1\n')
- .replace(/