centralize constants for message formatting

This commit is contained in:
perf3ct 2025-04-01 19:33:53 +00:00
parent 154d2905fa
commit 9719859a39
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
6 changed files with 256 additions and 172 deletions

View File

@ -805,7 +805,7 @@ export default class LlmChatPanel extends BasicWidget {
const allPrecedenceEnabled = precedenceList.every((p: string) => enabledProviders.includes(p));
// Get embedding queue status
const embeddingStats = await server.get('embeddings/stats') as {
const embeddingStats = await server.get('llm/embeddings/stats') as {
success: boolean,
stats: {
totalNotesCount: number;

View File

@ -0,0 +1,159 @@
/**
* Formatter Constants
*
* Constants related to message formatters for different LLM providers.
* This centralizes string formatting patterns, HTML cleaning options,
* and other formatter-specific constants that were previously hardcoded.
*/
/**
 * Allowlists of HTML tag names, grouped by how strict a formatter wants to be.
 */
export const HTML_ALLOWED_TAGS = {
    // Default allowlist shared by most formatters
    STANDARD: [
        'b', 'i', 'em', 'strong',
        'a', 'p', 'br',
        'ul', 'ol', 'li',
        'h1', 'h2', 'h3', 'h4', 'h5',
        'code', 'pre'
    ],

    // Reduced allowlist for providers with limited HTML support
    MINIMAL: ['b', 'i', 'p', 'br', 'a'],

    // No tags at all: plain text only (Ollama)
    NONE: []
};
/**
 * Allowlists of HTML attributes, keyed by tag name.
 */
export const HTML_ALLOWED_ATTRIBUTES = {
    // Default: only the link target on anchors is kept
    STANDARD: {
        a: ['href']
    },

    // No attributes at all: plain text only
    NONE: {}
};
/**
 * Tag-to-tag rename maps (source tag name -> replacement tag name).
 */
export const HTML_TRANSFORMS = {
    // Default mapping: headings h1/h2 are demoted one level, div becomes p,
    // and span maps to itself
    STANDARD: {
        h1: 'h2',
        h2: 'h3',
        div: 'p',
        span: 'span'
    }
};
/**
 * RegEx patterns for HTML to Markdown conversion.
 *
 * Each entry is a `{ pattern, replacement }` pair meant to be passed to
 * `String.prototype.replace`. Entries are intended to be applied in
 * declaration order (e.g. by iterating `Object.values`), so ORDER MATTERS:
 * the clean-up entries at the bottom strip whatever the earlier entries
 * did not convert.
 *
 * Pattern conventions:
 * - `\b` after the tag name stops a short tag name from matching a longer
 *   one (`<p` must not match `<pre>`, `<b` must not match `<br>`).
 * - `[\s\S]*?` is used for element content so that it may span line breaks
 *   (`.` does not match newlines).
 */
export const HTML_TO_MARKDOWN_PATTERNS = {
    // Headings
    HEADING_1: { pattern: /<h1\b[^>]*>([\s\S]*?)<\/h1>/gi, replacement: '# $1\n' },
    HEADING_2: { pattern: /<h2\b[^>]*>([\s\S]*?)<\/h2>/gi, replacement: '## $1\n' },
    HEADING_3: { pattern: /<h3\b[^>]*>([\s\S]*?)<\/h3>/gi, replacement: '### $1\n' },
    HEADING_4: { pattern: /<h4\b[^>]*>([\s\S]*?)<\/h4>/gi, replacement: '#### $1\n' },
    HEADING_5: { pattern: /<h5\b[^>]*>([\s\S]*?)<\/h5>/gi, replacement: '##### $1\n' },

    // Paragraph and line breaks
    PARAGRAPH: { pattern: /<p\b[^>]*>([\s\S]*?)<\/p>/gi, replacement: '$1\n\n' },
    BREAK: { pattern: /<br\b[^>]*>/gi, replacement: '\n' },

    // Links and formatting
    LINK: { pattern: /<a\b[^>]*href=["'](.*?)["'][^>]*>([\s\S]*?)<\/a>/gi, replacement: '[$2]($1)' },
    STRONG: { pattern: /<strong\b[^>]*>([\s\S]*?)<\/strong>/gi, replacement: '**$1**' },
    BOLD: { pattern: /<b\b[^>]*>([\s\S]*?)<\/b>/gi, replacement: '**$1**' },
    EMPHASIS: { pattern: /<em\b[^>]*>([\s\S]*?)<\/em>/gi, replacement: '*$1*' },
    ITALIC: { pattern: /<i\b[^>]*>([\s\S]*?)<\/i>/gi, replacement: '*$1*' },

    // Code.
    // CODE_BLOCK is declared before INLINE_CODE so that the common
    // `<pre><code>…</code></pre>` wrapper is consumed as one fenced block
    // instead of leaving stray backticks inside the fence. The closing fence
    // gets its own line so following text is not glued onto it.
    CODE_BLOCK: {
        pattern: /<pre\b[^>]*>(?:\s*<code\b[^>]*>)?([\s\S]*?)(?:<\/code>\s*)?<\/pre>/gi,
        replacement: '```\n$1\n```\n'
    },
    INLINE_CODE: { pattern: /<code\b[^>]*>([\s\S]*?)<\/code>/gi, replacement: '`$1`' },

    // Clean up: drop any tag not converted above, then collapse blank-line runs
    ANY_REMAINING_TAG: { pattern: /<[^>]*>/g, replacement: '' },
    EXCESSIVE_NEWLINES: { pattern: /\n{3,}/g, replacement: '\n\n' }
};
/**
 * HTML entity replacements.
 *
 * Each entry is a `{ pattern, replacement }` pair meant to be passed to
 * `String.prototype.replace`, applied in declaration order (e.g. by
 * iterating `Object.values`).
 *
 * `AMP` is deliberately declared LAST: decoding `&amp;` earlier would turn an
 * escaped entity such as `&amp;quot;` into `&quot;`, which a later entry
 * would then decode a second time.
 *
 * Non-ASCII replacement characters are written as `\u` escapes so they
 * survive any re-encoding of this source file.
 */
export const HTML_ENTITY_REPLACEMENTS = {
    // Common HTML entities
    NBSP: { pattern: /&nbsp;/g, replacement: ' ' },
    LT: { pattern: /&lt;/g, replacement: '<' },
    GT: { pattern: /&gt;/g, replacement: '>' },
    QUOT: { pattern: /&quot;/g, replacement: '"' },
    APOS: { pattern: /&#39;/g, replacement: "'" },

    // Typographic quotes are flattened to their ASCII equivalents
    LDQUO: { pattern: /&ldquo;/g, replacement: '"' },
    RDQUO: { pattern: /&rdquo;/g, replacement: '"' },
    LSQUO: { pattern: /&lsquo;/g, replacement: "'" },
    RSQUO: { pattern: /&rsquo;/g, replacement: "'" },

    // Dashes and ellipsis decode to the matching Unicode character
    MDASH: { pattern: /&mdash;/g, replacement: '\u2014' },  // em dash
    NDASH: { pattern: /&ndash;/g, replacement: '\u2013' },  // en dash
    HELLIP: { pattern: /&hellip;/g, replacement: '\u2026' }, // ellipsis

    // Must stay last to avoid double-decoding (see above)
    AMP: { pattern: /&amp;/g, replacement: '&' }
};
/**
 * Repairs for text that arrived with encoding damage or typographic
 * characters that should be flattened to plain equivalents.
 */
export const ENCODING_FIXES = {
    // Mis-decoded quote sequence, replaced by a plain double quote
    BROKEN_QUOTES: {
        pattern: /Γ\u00c2[\u00a3\u00a5]/g,
        replacement: '"'
    },

    // Lookup table: single Unicode character -> plain replacement text
    UNICODE_REPLACEMENTS: {
        // Spacing
        '\u00A0': ' ',   // Non-breaking space

        // Quotation marks
        '\u2018': "'",   // Left single quote
        '\u2019': "'",   // Right single quote
        '\u201C': '"',   // Left double quote
        '\u201D': '"',   // Right double quote

        // Dashes
        '\u2013': '-',   // En dash
        '\u2014': '--',  // Em dash

        // Miscellaneous punctuation
        '\u2022': '*',   // Bullet
        '\u2026': '...'  // Ellipsis
    }
};
/**
 * Ollama-specific cleaning patterns.
 *
 * Each entry is a `{ pattern, replacement }` pair meant to be passed to
 * `String.prototype.replace`, applied in declaration order (e.g. by
 * iterating `Object.values`). The typographic replacements MUST come before
 * `NON_ASCII`, which deletes every remaining character above U+007F.
 *
 * Non-ASCII characters inside the patterns are written as `\u` escapes so
 * the character classes cannot be silently degraded to their ASCII
 * look-alikes if this file is re-encoded.
 */
export const OLLAMA_CLEANING = {
    // Replace fancy (curly) quotes with straight ASCII quotes
    QUOTES: { pattern: /[\u201C\u201D]/g, replacement: '"' },
    APOSTROPHES: { pattern: /[\u2018\u2019]/g, replacement: "'" },

    // Replace other Unicode characters with ASCII stand-ins
    DASHES: { pattern: /[\u2013\u2014]/g, replacement: '-' },   // en dash, em dash
    BULLETS: { pattern: /[\u2022]/g, replacement: '*' },        // bullet
    ELLIPSES: { pattern: /[\u2026]/g, replacement: '...' },     // ellipsis

    // Remove non-ASCII characters
    NON_ASCII: { pattern: /[^\x00-\x7F]/g, replacement: '' },

    // Normalize whitespace.
    // NOTE(review): `\s+` also matches newlines, so once WHITESPACE has run
    // there is no "\n" left for NEWLINE_WHITESPACE to match; the net effect
    // is single-line output. Kept as-is to preserve existing behavior.
    WHITESPACE: { pattern: /\s+/g, replacement: ' ' },
    NEWLINE_WHITESPACE: { pattern: /\n\s+/g, replacement: '\n' }
};
/**
 * Log message templates used by the message formatters.
 *
 * The `PROCESSED` builders take the message count before and after
 * formatting and return the finished log line. `ERROR` holds the prefixes
 * for error output.
 */
export const FORMATTER_LOGS = {
    ANTHROPIC: {
        PROCESSED(before: number, after: number): string {
            return `Anthropic formatter: ${before} messages → ${after} messages`;
        }
    },

    OPENAI: {
        PROCESSED(before: number, after: number): string {
            return `OpenAI formatter: ${before} messages → ${after} messages`;
        }
    },

    OLLAMA: {
        PROCESSED(before: number, after: number): string {
            return `Ollama formatter processed ${before} messages into ${after} messages`;
        }
    },

    ERROR: {
        // Prefix naming the provider whose content cleaning failed
        CONTEXT_CLEANING(provider: string): string {
            return `Error cleaning content for ${provider}:`;
        },
        ENCODING: 'Error fixing encoding issues:'
    }
};

View File

@ -3,6 +3,13 @@ import type { Message } from '../ai_interface.js';
import { BaseMessageFormatter } from './base_formatter.js';
import { PROVIDER_PROMPTS } from '../constants/llm_prompt_constants.js';
import { LLM_CONSTANTS } from '../constants/provider_constants.js';
import {
HTML_ALLOWED_TAGS,
HTML_ALLOWED_ATTRIBUTES,
FORMATTER_LOGS,
HTML_TO_MARKDOWN_PATTERNS,
HTML_ENTITY_REPLACEMENTS
} from '../constants/formatter_constants.js';
/**
* Anthropic-specific message formatter
@ -144,7 +151,7 @@ export class AnthropicMessageFormatter extends BaseMessageFormatter {
}
}
console.log(`Anthropic formatter: ${messages.length} messages → ${formattedMessages.length} messages`);
console.log(FORMATTER_LOGS.ANTHROPIC.PROCESSED(messages.length, formattedMessages.length));
return formattedMessages;
}
@ -158,52 +165,31 @@ export class AnthropicMessageFormatter extends BaseMessageFormatter {
try {
// Convert HTML to a Claude-friendly format
const cleaned = sanitizeHtml(content, {
allowedTags: ['b', 'i', 'em', 'strong', 'a', 'p', 'br', 'ul', 'ol', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'code', 'pre'],
allowedAttributes: {
'a': ['href']
}
allowedTags: HTML_ALLOWED_TAGS.STANDARD,
allowedAttributes: HTML_ALLOWED_ATTRIBUTES.STANDARD
});
// Convert to markdown but preserve some structure
let markdown = cleaned
.replace(/<h1[^>]*>(.*?)<\/h1>/gi, '# $1\n')
.replace(/<h2[^>]*>(.*?)<\/h2>/gi, '## $1\n')
.replace(/<h3[^>]*>(.*?)<\/h3>/gi, '### $1\n')
.replace(/<h4[^>]*>(.*?)<\/h4>/gi, '#### $1\n')
.replace(/<h5[^>]*>(.*?)<\/h5>/gi, '##### $1\n')
.replace(/<p[^>]*>(.*?)<\/p>/gi, '$1\n\n')
.replace(/<br[^>]*>/gi, '\n')
.replace(/<a[^>]*href=["'](.*?)["'][^>]*>(.*?)<\/a>/gi, '[$2]($1)')
.replace(/<strong[^>]*>(.*?)<\/strong>/gi, '**$1**')
.replace(/<b[^>]*>(.*?)<\/b>/gi, '**$1**')
.replace(/<em[^>]*>(.*?)<\/em>/gi, '*$1*')
.replace(/<i[^>]*>(.*?)<\/i>/gi, '*$1*')
.replace(/<code[^>]*>(.*?)<\/code>/gi, '`$1`')
.replace(/<pre[^>]*>(.*?)<\/pre>/gi, '```\n$1\n```')
// Process lists
.replace(/<ul[^>]*>(.*?)<\/ul>/gs, (match, content) => {
return content.replace(/<li[^>]*>(.*?)<\/li>/gi, '- $1\n');
})
.replace(/<ol[^>]*>(.*?)<\/ol>/gs, (match, content) => {
let index = 1;
return content.replace(/<li[^>]*>(.*?)<\/li>/gi, (m: string, item: string) => {
return `${index++}. ${item}\n`;
});
})
// Clean up any remaining HTML tags
.replace(/<[^>]*>/g, '')
// Clean up excessive newlines
.replace(/\n{3,}/g, '\n\n')
// Fix common HTML entities
.replace(/&nbsp;/g, ' ')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"');
let markdown = cleaned;
// Apply all standard HTML to Markdown patterns
const patterns = HTML_TO_MARKDOWN_PATTERNS;
for (const pattern of Object.values(patterns)) {
markdown = markdown.replace(pattern.pattern, pattern.replacement);
}
// Process lists - use the parent class method
markdown = this.processListItems(markdown);
// Fix common HTML entities
const entityPatterns = HTML_ENTITY_REPLACEMENTS;
for (const pattern of Object.values(entityPatterns)) {
markdown = markdown.replace(pattern.pattern, pattern.replacement);
}
return markdown.trim();
} catch (error) {
console.error("Error cleaning content for Anthropic:", error);
console.error(FORMATTER_LOGS.ERROR.CONTEXT_CLEANING("Anthropic"), error);
return content; // Return original if cleaning fails
}
}

View File

@ -2,6 +2,15 @@ import sanitizeHtml from 'sanitize-html';
import type { Message } from '../ai_interface.js';
import type { MessageFormatter } from '../interfaces/message_formatter.js';
import { DEFAULT_SYSTEM_PROMPT, PROVIDER_PROMPTS } from '../constants/llm_prompt_constants.js';
import {
HTML_ALLOWED_TAGS,
HTML_ALLOWED_ATTRIBUTES,
HTML_TRANSFORMS,
HTML_TO_MARKDOWN_PATTERNS,
HTML_ENTITY_REPLACEMENTS,
ENCODING_FIXES,
FORMATTER_LOGS
} from '../constants/formatter_constants.js';
/**
* Base formatter with common functionality for all providers
@ -41,61 +50,32 @@ export abstract class BaseMessageFormatter implements MessageFormatter {
// Convert HTML to markdown for better readability
const cleaned = sanitizeHtml(fixedContent, {
allowedTags: ['b', 'i', 'em', 'strong', 'a', 'p', 'br', 'ul', 'ol', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'code', 'pre'],
allowedAttributes: {
'a': ['href']
},
transformTags: {
'h1': 'h2',
'h2': 'h3',
'div': 'p',
'span': 'span'
}
allowedTags: HTML_ALLOWED_TAGS.STANDARD,
allowedAttributes: HTML_ALLOWED_ATTRIBUTES.STANDARD,
transformTags: HTML_TRANSFORMS.STANDARD
});
// Process inline elements to markdown
let markdown = cleaned
.replace(/<h1[^>]*>(.*?)<\/h1>/gi, '# $1\n')
.replace(/<h2[^>]*>(.*?)<\/h2>/gi, '## $1\n')
.replace(/<h3[^>]*>(.*?)<\/h3>/gi, '### $1\n')
.replace(/<h4[^>]*>(.*?)<\/h4>/gi, '#### $1\n')
.replace(/<h5[^>]*>(.*?)<\/h5>/gi, '##### $1\n')
.replace(/<p[^>]*>(.*?)<\/p>/gi, '$1\n\n')
.replace(/<br[^>]*>/gi, '\n')
.replace(/<a[^>]*href=["'](.*?)["'][^>]*>(.*?)<\/a>/gi, '[$2]($1)')
.replace(/<strong[^>]*>(.*?)<\/strong>/gi, '**$1**')
.replace(/<b[^>]*>(.*?)<\/b>/gi, '**$1**')
.replace(/<em[^>]*>(.*?)<\/em>/gi, '*$1*')
.replace(/<i[^>]*>(.*?)<\/i>/gi, '*$1*')
.replace(/<code[^>]*>(.*?)<\/code>/gi, '`$1`')
.replace(/<pre[^>]*>(.*?)<\/pre>/gi, '```\n$1\n```')
// Clean up any remaining HTML tags
.replace(/<[^>]*>/g, '')
// Clean up excessive newlines
.replace(/\n{3,}/g, '\n\n');
let markdown = cleaned;
// Apply all HTML to Markdown patterns
const patterns = HTML_TO_MARKDOWN_PATTERNS;
for (const pattern of Object.values(patterns)) {
markdown = markdown.replace(pattern.pattern, pattern.replacement);
}
// Process list items
markdown = this.processListItems(markdown);
// Fix common HTML entities
markdown = markdown
.replace(/&nbsp;/g, ' ')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'")
.replace(/&ldquo;/g, '"')
.replace(/&rdquo;/g, '"')
.replace(/&lsquo;/g, "'")
.replace(/&rsquo;/g, "'")
.replace(/&mdash;/g, '—')
.replace(/&ndash;/g, '')
.replace(/&hellip;/g, '…');
const entityPatterns = HTML_ENTITY_REPLACEMENTS;
for (const pattern of Object.values(entityPatterns)) {
markdown = markdown.replace(pattern.pattern, pattern.replacement);
}
return markdown.trim();
} catch (error) {
console.error("Error cleaning context content:", error);
console.error(FORMATTER_LOGS.ERROR.CONTEXT_CLEANING("Base"), error);
return content; // Return original if cleaning fails
}
}
@ -133,28 +113,18 @@ export abstract class BaseMessageFormatter implements MessageFormatter {
try {
// Fix common encoding issues
return content
// Fix broken quote characters
.replace(/Γ\u00c2[\u00a3\u00a5]/g, '"')
// Fix other common broken unicode
.replace(/[\u{0080}-\u{FFFF}]/gu, (match) => {
// Some common replacements
const replacements: Record<string, string> = {
'\u00A0': ' ', // Non-breaking space
'\u2018': "'", // Left single quote
'\u2019': "'", // Right single quote
'\u201C': '"', // Left double quote
'\u201D': '"', // Right double quote
'\u2013': '-', // En dash
'\u2014': '--', // Em dash
'\u2022': '*', // Bullet
'\u2026': '...' // Ellipsis
};
let fixed = content.replace(ENCODING_FIXES.BROKEN_QUOTES.pattern, ENCODING_FIXES.BROKEN_QUOTES.replacement);
return replacements[match] || match;
});
// Fix other common broken unicode
fixed = fixed.replace(/[\u{0080}-\u{FFFF}]/gu, (match) => {
// Use replacements from constants
const replacements = ENCODING_FIXES.UNICODE_REPLACEMENTS;
return replacements[match as keyof typeof replacements] || match;
});
return fixed;
} catch (error) {
console.error('Error fixing encoding issues:', error);
console.error(FORMATTER_LOGS.ERROR.ENCODING, error);
return content; // Return original if fixing fails
}
}

View File

@ -3,6 +3,12 @@ import { BaseMessageFormatter } from './base_formatter.js';
import sanitizeHtml from 'sanitize-html';
import { PROVIDER_PROMPTS, FORMATTING_PROMPTS } from '../constants/llm_prompt_constants.js';
import { LLM_CONSTANTS } from '../constants/provider_constants.js';
import {
HTML_ALLOWED_TAGS,
HTML_ALLOWED_ATTRIBUTES,
OLLAMA_CLEANING,
FORMATTER_LOGS
} from '../constants/formatter_constants.js';
/**
* Ollama-specific message formatter
@ -66,7 +72,7 @@ export class OllamaMessageFormatter extends BaseMessageFormatter {
}
}
console.log(`Ollama formatter processed ${messages.length} messages into ${formattedMessages.length} messages`);
console.log(FORMATTER_LOGS.OLLAMA.PROCESSED(messages.length, formattedMessages.length));
return formattedMessages;
}
@ -85,30 +91,20 @@ export class OllamaMessageFormatter extends BaseMessageFormatter {
// Then apply Ollama-specific aggressive cleaning
// Remove any remaining HTML using sanitizeHtml
let plaintext = sanitizeHtml(sanitized, {
allowedTags: [],
allowedAttributes: {},
allowedTags: HTML_ALLOWED_TAGS.NONE,
allowedAttributes: HTML_ALLOWED_ATTRIBUTES.NONE,
textFilter: (text) => text
});
// Then aggressively sanitize to plain ASCII and simple formatting
plaintext = plaintext
// Replace common problematic quotes with simple ASCII quotes
.replace(/[""]/g, '"')
.replace(/['']/g, "'")
// Replace other common Unicode characters
.replace(/[–—]/g, '-')
.replace(/[•]/g, '*')
.replace(/[…]/g, '...')
// Strip all non-ASCII characters
.replace(/[^\x00-\x7F]/g, '')
// Normalize whitespace
.replace(/\s+/g, ' ')
.replace(/\n\s+/g, '\n')
.trim();
// Apply all Ollama-specific cleaning patterns
const ollamaPatterns = OLLAMA_CLEANING;
for (const pattern of Object.values(ollamaPatterns)) {
plaintext = plaintext.replace(pattern.pattern, pattern.replacement);
}
return plaintext;
return plaintext.trim();
} catch (error) {
console.error("Error cleaning context content for Ollama:", error);
console.error(FORMATTER_LOGS.ERROR.CONTEXT_CLEANING("Ollama"), error);
return content; // Return original if cleaning fails
}
}

View File

@ -3,6 +3,13 @@ import type { Message } from '../ai_interface.js';
import { BaseMessageFormatter } from './base_formatter.js';
import { PROVIDER_PROMPTS, FORMATTING_PROMPTS } from '../constants/llm_prompt_constants.js';
import { LLM_CONSTANTS } from '../constants/provider_constants.js';
import {
HTML_ALLOWED_TAGS,
HTML_ALLOWED_ATTRIBUTES,
HTML_TO_MARKDOWN_PATTERNS,
HTML_ENTITY_REPLACEMENTS,
FORMATTER_LOGS
} from '../constants/formatter_constants.js';
/**
* OpenAI-specific message formatter
@ -72,7 +79,7 @@ export class OpenAIMessageFormatter extends BaseMessageFormatter {
});
}
console.log(`OpenAI formatter: ${messages.length} messages → ${formattedMessages.length} messages`);
console.log(FORMATTER_LOGS.OPENAI.PROCESSED(messages.length, formattedMessages.length));
return formattedMessages;
}
@ -86,58 +93,24 @@ export class OpenAIMessageFormatter extends BaseMessageFormatter {
try {
// Convert HTML to Markdown for better readability
const cleaned = sanitizeHtml(content, {
allowedTags: FORMATTING_PROMPTS.HTML_ALLOWED_TAGS,
allowedAttributes: {
'a': ['href']
},
transformTags: {
'h1': 'h2',
'h2': 'h3',
'div': 'p',
'span': 'span'
}
allowedTags: HTML_ALLOWED_TAGS.STANDARD,
allowedAttributes: HTML_ALLOWED_ATTRIBUTES.STANDARD
});
// Process inline elements to markdown with simpler approach
let markdown = cleaned
.replace(/<h1[^>]*>(.*?)<\/h1>/gi, '# $1\n')
.replace(/<h2[^>]*>(.*?)<\/h2>/gi, '## $1\n')
.replace(/<h3[^>]*>(.*?)<\/h3>/gi, '### $1\n')
.replace(/<h4[^>]*>(.*?)<\/h4>/gi, '#### $1\n')
.replace(/<h5[^>]*>(.*?)<\/h5>/gi, '##### $1\n')
.replace(/<p[^>]*>(.*?)<\/p>/gi, '$1\n\n')
.replace(/<br[^>]*>/gi, '\n')
.replace(/<a[^>]*href=["'](.*?)["'][^>]*>(.*?)<\/a>/gi, '[$2]($1)')
.replace(/<strong[^>]*>(.*?)<\/strong>/gi, '**$1**')
.replace(/<b[^>]*>(.*?)<\/b>/gi, '**$1**')
.replace(/<em[^>]*>(.*?)<\/em>/gi, '*$1*')
.replace(/<i[^>]*>(.*?)<\/i>/gi, '*$1*')
.replace(/<code[^>]*>(.*?)<\/code>/gi, '`$1`')
.replace(/<pre[^>]*>(.*?)<\/pre>/gi, '```\n$1\n```')
// Clean up any remaining HTML tags
.replace(/<[^>]*>/g, '')
// Clean up excessive newlines
.replace(/\n{3,}/g, '\n\n');
// Apply all HTML to Markdown patterns
let markdown = cleaned;
for (const pattern of Object.values(HTML_TO_MARKDOWN_PATTERNS)) {
markdown = markdown.replace(pattern.pattern, pattern.replacement);
}
// Fix common HTML entities
markdown = markdown
.replace(/&nbsp;/g, ' ')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'")
.replace(/&ldquo;/g, '"')
.replace(/&rdquo;/g, '"')
.replace(/&lsquo;/g, "'")
.replace(/&rsquo;/g, "'")
.replace(/&mdash;/g, '—')
.replace(/&ndash;/g, '')
.replace(/&hellip;/g, '…');
for (const pattern of Object.values(HTML_ENTITY_REPLACEMENTS)) {
markdown = markdown.replace(pattern.pattern, pattern.replacement);
}
return markdown.trim();
} catch (error) {
console.error("Error cleaning content for OpenAI:", error);
console.error(FORMATTER_LOGS.ERROR.CONTEXT_CLEANING("OpenAI"), error);
return content; // Return original if cleaning fails
}
}