mirror of
https://github.com/TriliumNext/Notes.git
synced 2025-10-18 12:41:54 +08:00
centralize constants for message formatting
This commit is contained in:
parent
154d2905fa
commit
9719859a39
@ -805,7 +805,7 @@ export default class LlmChatPanel extends BasicWidget {
|
||||
const allPrecedenceEnabled = precedenceList.every((p: string) => enabledProviders.includes(p));
|
||||
|
||||
// Get embedding queue status
|
||||
const embeddingStats = await server.get('embeddings/stats') as {
|
||||
const embeddingStats = await server.get('llm/embeddings/stats') as {
|
||||
success: boolean,
|
||||
stats: {
|
||||
totalNotesCount: number;
|
||||
|
159
src/services/llm/constants/formatter_constants.ts
Normal file
159
src/services/llm/constants/formatter_constants.ts
Normal file
@ -0,0 +1,159 @@
|
||||
/**
|
||||
* Formatter Constants
|
||||
*
|
||||
* Constants related to message formatters for different LLM providers.
|
||||
* This centralizes string formatting patterns, HTML cleaning options,
|
||||
* and other formatter-specific constants that were previously hardcoded.
|
||||
*/
|
||||
|
||||
/**
 * HTML tag allowlists for different formatter strictness levels.
 *
 * Each entry is a list of lower-case tag names intended to be passed as the
 * `allowedTags` option of `sanitizeHtml`. The object is typed explicitly so
 * that the empty `NONE` list is a `string[]` (an untyped `[]` literal would
 * be inferred as `never[]`).
 */
export const HTML_ALLOWED_TAGS: Record<'STANDARD' | 'MINIMAL' | 'NONE', string[]> = {
    // Standard set used by most formatters
    STANDARD: [
        'b', 'i', 'em', 'strong', 'a', 'p', 'br',
        'ul', 'ol', 'li',
        'h1', 'h2', 'h3', 'h4', 'h5',
        'code', 'pre'
    ],

    // Minimal set for providers with limited HTML support
    MINIMAL: ['b', 'i', 'p', 'br', 'a'],

    // Empty set for plain text only (Ollama)
    NONE: []
};
|
||||
|
||||
/**
 * HTML attribute allowlists.
 *
 * Each entry maps a tag name to the attribute names that may be kept on that
 * tag; intended to be passed as the `allowedAttributes` option of
 * `sanitizeHtml`. Both entries share one explicit shape so that the empty
 * `NONE` map is not inferred as the overly permissive `{}` type.
 */
export const HTML_ALLOWED_ATTRIBUTES: Record<'STANDARD' | 'NONE', Record<string, string[]>> = {
    // Standard set of allowed attributes: only link targets survive
    STANDARD: {
        'a': ['href']
    },

    // Empty set for plain text only
    NONE: {}
};
|
||||
|
||||
/**
 * HTML tag transformations.
 *
 * Each entry maps a source tag name to the tag name it should be rewritten
 * to; intended to be passed as the `transformTags` option of `sanitizeHtml`.
 */
export const HTML_TRANSFORMS = {
    // Standard transformations
    STANDARD: {
        // Headings are demoted by one level
        'h1': 'h2',
        'h2': 'h3',
        // Generic block containers become paragraphs
        'div': 'p',
        // Identity mapping, kept as-is from the original hardcoded options
        'span': 'span'
    }
};
|
||||
|
||||
/**
 * RegEx patterns for HTML to Markdown conversion.
 *
 * Each entry is a `{ pattern, replacement }` pair meant to be applied with
 * `String.prototype.replace`. Consumers iterate the entries with
 * `Object.values`, so the KEY ORDER BELOW IS THE APPLICATION ORDER and is
 * significant:
 *   - CODE_BLOCK must run before INLINE_CODE so `<pre><code>…</code></pre>`
 *     becomes one fenced block instead of a fence wrapping backticks.
 *   - ANY_REMAINING_TAG must run after every tag-specific rule.
 *
 * Pattern conventions:
 *   - `\b` after the tag name stops `<p` from matching `<pre>` and `<b` from
 *     matching `<br>`.
 *   - `[\s\S]*?` (instead of `.*?`) lets element content span line breaks.
 *
 * All patterns carry the `g` flag. That is safe for shared constants only
 * because `replace` resets `lastIndex`; do not use them with `test`/`exec`.
 */
export const HTML_TO_MARKDOWN_PATTERNS = {
    // Headings
    HEADING_1: { pattern: /<h1\b[^>]*>([\s\S]*?)<\/h1>/gi, replacement: '# $1\n' },
    HEADING_2: { pattern: /<h2\b[^>]*>([\s\S]*?)<\/h2>/gi, replacement: '## $1\n' },
    HEADING_3: { pattern: /<h3\b[^>]*>([\s\S]*?)<\/h3>/gi, replacement: '### $1\n' },
    HEADING_4: { pattern: /<h4\b[^>]*>([\s\S]*?)<\/h4>/gi, replacement: '#### $1\n' },
    HEADING_5: { pattern: /<h5\b[^>]*>([\s\S]*?)<\/h5>/gi, replacement: '##### $1\n' },

    // Paragraph and line breaks
    PARAGRAPH: { pattern: /<p\b[^>]*>([\s\S]*?)<\/p>/gi, replacement: '$1\n\n' },
    BREAK: { pattern: /<br\b[^>]*>/gi, replacement: '\n' },

    // Links and formatting
    LINK: { pattern: /<a\b[^>]*href=["'](.*?)["'][^>]*>([\s\S]*?)<\/a>/gi, replacement: '[$2]($1)' },
    STRONG: { pattern: /<strong\b[^>]*>([\s\S]*?)<\/strong>/gi, replacement: '**$1**' },
    BOLD: { pattern: /<b\b[^>]*>([\s\S]*?)<\/b>/gi, replacement: '**$1**' },
    EMPHASIS: { pattern: /<em\b[^>]*>([\s\S]*?)<\/em>/gi, replacement: '*$1*' },
    ITALIC: { pattern: /<i\b[^>]*>([\s\S]*?)<\/i>/gi, replacement: '*$1*' },

    // Code. The block rule unwraps an optional inner <code> element and puts
    // the fences on their own lines; surplus newlines are collapsed below.
    CODE_BLOCK: {
        pattern: /<pre\b[^>]*>(?:\s*<code\b[^>]*>)?([\s\S]*?)(?:<\/code>\s*)?<\/pre>/gi,
        replacement: '\n```\n$1\n```\n'
    },
    INLINE_CODE: { pattern: /<code\b[^>]*>([\s\S]*?)<\/code>/gi, replacement: '`$1`' },

    // Clean up
    ANY_REMAINING_TAG: { pattern: /<[^>]*>/g, replacement: '' },
    EXCESSIVE_NEWLINES: { pattern: /\n{3,}/g, replacement: '\n\n' }
};
|
||||
|
||||
/**
 * HTML entity replacements.
 *
 * Each entry is a `{ pattern, replacement }` pair that decodes one named or
 * numeric HTML entity reference into its literal character. Consumers iterate
 * the entries with `Object.values`, so the key order below is the application
 * order.
 *
 * AMP is deliberately LAST: decoding `&amp;` earlier would turn an escaped
 * reference such as `&amp;quot;` into `&quot;`, which a later rule would then
 * decode a second time.
 *
 * All patterns carry the `g` flag; use them with `replace` only.
 */
export const HTML_ENTITY_REPLACEMENTS = {
    // Common HTML entities
    NBSP: { pattern: /&nbsp;/g, replacement: ' ' },
    LT: { pattern: /&lt;/g, replacement: '<' },
    GT: { pattern: /&gt;/g, replacement: '>' },
    QUOT: { pattern: /&quot;/g, replacement: '"' },
    // Apostrophe: numeric form plus the named form
    APOS: { pattern: /&(?:#39|apos);/g, replacement: "'" },

    // Typographic quotes are flattened to their ASCII counterparts
    LDQUO: { pattern: /&ldquo;/g, replacement: '"' },
    RDQUO: { pattern: /&rdquo;/g, replacement: '"' },
    LSQUO: { pattern: /&lsquo;/g, replacement: "'" },
    RSQUO: { pattern: /&rsquo;/g, replacement: "'" },

    // Dashes and ellipsis decode to the real Unicode character
    MDASH: { pattern: /&mdash;/g, replacement: '\u2014' },
    NDASH: { pattern: /&ndash;/g, replacement: '\u2013' },
    HELLIP: { pattern: /&hellip;/g, replacement: '\u2026' },

    // Must stay last, see the note above
    AMP: { pattern: /&amp;/g, replacement: '&' }
};
|
||||
|
||||
/**
 * Encoding issue fixes.
 *
 * `BROKEN_QUOTES` is a `{ pattern, replacement }` pair for `replace`;
 * `UNICODE_REPLACEMENTS` is a lookup table keyed by a single character.
 */
export const ENCODING_FIXES = {
    // Common encoding issues: the three-character sequence 'Γ', U+00C2, then
    // U+00A3 or U+00A5 is replaced by a straight double quote.
    // NOTE(review): presumably a mis-decoded curly quote; confirm the source
    // encoding that produces this sequence.
    BROKEN_QUOTES: { pattern: /Γ\u00c2[\u00a3\u00a5]/g, replacement: '"' },

    // Character replacements for Unicode (typographic character -> ASCII)
    UNICODE_REPLACEMENTS: {
        '\u00A0': ' ',   // Non-breaking space
        '\u2018': "'",   // Left single quote
        '\u2019': "'",   // Right single quote
        '\u201C': '"',   // Left double quote
        '\u201D': '"',   // Right double quote
        '\u2013': '-',   // En dash
        '\u2014': '--',  // Em dash
        '\u2022': '*',   // Bullet
        '\u2026': '...'  // Ellipsis
    }
};
|
||||
|
||||
/**
 * Ollama-specific cleaning patterns.
 *
 * Each entry is a `{ pattern, replacement }` pair for `replace`. Consumers
 * iterate the entries with `Object.values`, so the key order below is the
 * application order: typographic characters are mapped to ASCII first, and
 * only then is every remaining non-ASCII character removed.
 *
 * Typographic characters are written as `\u` escapes so the character
 * classes cannot silently degrade to their ASCII look-alikes.
 */
export const OLLAMA_CLEANING = {
    // Replace fancy quotes (curly double / single quotes) with ASCII quotes
    QUOTES: { pattern: /[\u201C\u201D]/g, replacement: '"' },
    APOSTROPHES: { pattern: /[\u2018\u2019]/g, replacement: "'" },

    // Replace other Unicode characters
    DASHES: { pattern: /[\u2013\u2014]/g, replacement: '-' },   // en dash, em dash
    BULLETS: { pattern: /[\u2022]/g, replacement: '*' },        // bullet
    ELLIPSES: { pattern: /[\u2026]/g, replacement: '...' },     // horizontal ellipsis

    // Remove non-ASCII characters
    NON_ASCII: { pattern: /[^\x00-\x7F]/g, replacement: '' },

    // Normalize whitespace.
    // NOTE(review): WHITESPACE already collapses every whitespace run,
    // newlines included, into one space, so NEWLINE_WHITESPACE can never
    // match afterwards. Kept unchanged to preserve the current single-line
    // output; confirm whether line breaks were meant to survive.
    WHITESPACE: { pattern: /\s+/g, replacement: ' ' },
    NEWLINE_WHITESPACE: { pattern: /\n\s+/g, replacement: '\n' }
};
|
||||
|
||||
/**
 * Console log messages for formatters.
 *
 * The `PROCESSED` builders take the message count before and after
 * formatting and return the line to log. `ERROR.CONTEXT_CLEANING` takes a
 * provider display name and returns the prefix to log ahead of the caught
 * error; `ERROR.ENCODING` is a fixed prefix.
 */
export const FORMATTER_LOGS = {
    ANTHROPIC: {
        PROCESSED: (before: number, after: number): string =>
            `Anthropic formatter: ${before} messages → ${after} messages`
    },
    OPENAI: {
        PROCESSED: (before: number, after: number): string =>
            `OpenAI formatter: ${before} messages → ${after} messages`
    },
    OLLAMA: {
        PROCESSED: (before: number, after: number): string =>
            `Ollama formatter processed ${before} messages into ${after} messages`
    },
    ERROR: {
        CONTEXT_CLEANING: (provider: string): string => `Error cleaning content for ${provider}:`,
        ENCODING: 'Error fixing encoding issues:'
    }
};
|
@ -3,6 +3,13 @@ import type { Message } from '../ai_interface.js';
|
||||
import { BaseMessageFormatter } from './base_formatter.js';
|
||||
import { PROVIDER_PROMPTS } from '../constants/llm_prompt_constants.js';
|
||||
import { LLM_CONSTANTS } from '../constants/provider_constants.js';
|
||||
import {
|
||||
HTML_ALLOWED_TAGS,
|
||||
HTML_ALLOWED_ATTRIBUTES,
|
||||
FORMATTER_LOGS,
|
||||
HTML_TO_MARKDOWN_PATTERNS,
|
||||
HTML_ENTITY_REPLACEMENTS
|
||||
} from '../constants/formatter_constants.js';
|
||||
|
||||
/**
|
||||
* Anthropic-specific message formatter
|
||||
@ -144,7 +151,7 @@ export class AnthropicMessageFormatter extends BaseMessageFormatter {
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`Anthropic formatter: ${messages.length} messages → ${formattedMessages.length} messages`);
|
||||
console.log(FORMATTER_LOGS.ANTHROPIC.PROCESSED(messages.length, formattedMessages.length));
|
||||
return formattedMessages;
|
||||
}
|
||||
|
||||
@ -158,52 +165,31 @@ export class AnthropicMessageFormatter extends BaseMessageFormatter {
|
||||
try {
|
||||
// Convert HTML to a Claude-friendly format
|
||||
const cleaned = sanitizeHtml(content, {
|
||||
allowedTags: ['b', 'i', 'em', 'strong', 'a', 'p', 'br', 'ul', 'ol', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'code', 'pre'],
|
||||
allowedAttributes: {
|
||||
'a': ['href']
|
||||
}
|
||||
allowedTags: HTML_ALLOWED_TAGS.STANDARD,
|
||||
allowedAttributes: HTML_ALLOWED_ATTRIBUTES.STANDARD
|
||||
});
|
||||
|
||||
// Convert to markdown but preserve some structure
|
||||
let markdown = cleaned
|
||||
.replace(/<h1[^>]*>(.*?)<\/h1>/gi, '# $1\n')
|
||||
.replace(/<h2[^>]*>(.*?)<\/h2>/gi, '## $1\n')
|
||||
.replace(/<h3[^>]*>(.*?)<\/h3>/gi, '### $1\n')
|
||||
.replace(/<h4[^>]*>(.*?)<\/h4>/gi, '#### $1\n')
|
||||
.replace(/<h5[^>]*>(.*?)<\/h5>/gi, '##### $1\n')
|
||||
.replace(/<p[^>]*>(.*?)<\/p>/gi, '$1\n\n')
|
||||
.replace(/<br[^>]*>/gi, '\n')
|
||||
.replace(/<a[^>]*href=["'](.*?)["'][^>]*>(.*?)<\/a>/gi, '[$2]($1)')
|
||||
.replace(/<strong[^>]*>(.*?)<\/strong>/gi, '**$1**')
|
||||
.replace(/<b[^>]*>(.*?)<\/b>/gi, '**$1**')
|
||||
.replace(/<em[^>]*>(.*?)<\/em>/gi, '*$1*')
|
||||
.replace(/<i[^>]*>(.*?)<\/i>/gi, '*$1*')
|
||||
.replace(/<code[^>]*>(.*?)<\/code>/gi, '`$1`')
|
||||
.replace(/<pre[^>]*>(.*?)<\/pre>/gi, '```\n$1\n```')
|
||||
// Process lists
|
||||
.replace(/<ul[^>]*>(.*?)<\/ul>/gs, (match, content) => {
|
||||
return content.replace(/<li[^>]*>(.*?)<\/li>/gi, '- $1\n');
|
||||
})
|
||||
.replace(/<ol[^>]*>(.*?)<\/ol>/gs, (match, content) => {
|
||||
let index = 1;
|
||||
return content.replace(/<li[^>]*>(.*?)<\/li>/gi, (m: string, item: string) => {
|
||||
return `${index++}. ${item}\n`;
|
||||
});
|
||||
})
|
||||
// Clean up any remaining HTML tags
|
||||
.replace(/<[^>]*>/g, '')
|
||||
// Clean up excessive newlines
|
||||
.replace(/\n{3,}/g, '\n\n')
|
||||
// Fix common HTML entities
|
||||
.replace(/ /g, ' ')
|
||||
.replace(/</g, '<')
|
||||
.replace(/>/g, '>')
|
||||
.replace(/&/g, '&')
|
||||
.replace(/"/g, '"');
|
||||
let markdown = cleaned;
|
||||
|
||||
// Apply all standard HTML to Markdown patterns
|
||||
const patterns = HTML_TO_MARKDOWN_PATTERNS;
|
||||
for (const pattern of Object.values(patterns)) {
|
||||
markdown = markdown.replace(pattern.pattern, pattern.replacement);
|
||||
}
|
||||
|
||||
// Process lists - use the parent class method
|
||||
markdown = this.processListItems(markdown);
|
||||
|
||||
// Fix common HTML entities
|
||||
const entityPatterns = HTML_ENTITY_REPLACEMENTS;
|
||||
for (const pattern of Object.values(entityPatterns)) {
|
||||
markdown = markdown.replace(pattern.pattern, pattern.replacement);
|
||||
}
|
||||
|
||||
return markdown.trim();
|
||||
} catch (error) {
|
||||
console.error("Error cleaning content for Anthropic:", error);
|
||||
console.error(FORMATTER_LOGS.ERROR.CONTEXT_CLEANING("Anthropic"), error);
|
||||
return content; // Return original if cleaning fails
|
||||
}
|
||||
}
|
||||
|
@ -2,6 +2,15 @@ import sanitizeHtml from 'sanitize-html';
|
||||
import type { Message } from '../ai_interface.js';
|
||||
import type { MessageFormatter } from '../interfaces/message_formatter.js';
|
||||
import { DEFAULT_SYSTEM_PROMPT, PROVIDER_PROMPTS } from '../constants/llm_prompt_constants.js';
|
||||
import {
|
||||
HTML_ALLOWED_TAGS,
|
||||
HTML_ALLOWED_ATTRIBUTES,
|
||||
HTML_TRANSFORMS,
|
||||
HTML_TO_MARKDOWN_PATTERNS,
|
||||
HTML_ENTITY_REPLACEMENTS,
|
||||
ENCODING_FIXES,
|
||||
FORMATTER_LOGS
|
||||
} from '../constants/formatter_constants.js';
|
||||
|
||||
/**
|
||||
* Base formatter with common functionality for all providers
|
||||
@ -41,61 +50,32 @@ export abstract class BaseMessageFormatter implements MessageFormatter {
|
||||
|
||||
// Convert HTML to markdown for better readability
|
||||
const cleaned = sanitizeHtml(fixedContent, {
|
||||
allowedTags: ['b', 'i', 'em', 'strong', 'a', 'p', 'br', 'ul', 'ol', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'code', 'pre'],
|
||||
allowedAttributes: {
|
||||
'a': ['href']
|
||||
},
|
||||
transformTags: {
|
||||
'h1': 'h2',
|
||||
'h2': 'h3',
|
||||
'div': 'p',
|
||||
'span': 'span'
|
||||
}
|
||||
allowedTags: HTML_ALLOWED_TAGS.STANDARD,
|
||||
allowedAttributes: HTML_ALLOWED_ATTRIBUTES.STANDARD,
|
||||
transformTags: HTML_TRANSFORMS.STANDARD
|
||||
});
|
||||
|
||||
// Process inline elements to markdown
|
||||
let markdown = cleaned
|
||||
.replace(/<h1[^>]*>(.*?)<\/h1>/gi, '# $1\n')
|
||||
.replace(/<h2[^>]*>(.*?)<\/h2>/gi, '## $1\n')
|
||||
.replace(/<h3[^>]*>(.*?)<\/h3>/gi, '### $1\n')
|
||||
.replace(/<h4[^>]*>(.*?)<\/h4>/gi, '#### $1\n')
|
||||
.replace(/<h5[^>]*>(.*?)<\/h5>/gi, '##### $1\n')
|
||||
.replace(/<p[^>]*>(.*?)<\/p>/gi, '$1\n\n')
|
||||
.replace(/<br[^>]*>/gi, '\n')
|
||||
.replace(/<a[^>]*href=["'](.*?)["'][^>]*>(.*?)<\/a>/gi, '[$2]($1)')
|
||||
.replace(/<strong[^>]*>(.*?)<\/strong>/gi, '**$1**')
|
||||
.replace(/<b[^>]*>(.*?)<\/b>/gi, '**$1**')
|
||||
.replace(/<em[^>]*>(.*?)<\/em>/gi, '*$1*')
|
||||
.replace(/<i[^>]*>(.*?)<\/i>/gi, '*$1*')
|
||||
.replace(/<code[^>]*>(.*?)<\/code>/gi, '`$1`')
|
||||
.replace(/<pre[^>]*>(.*?)<\/pre>/gi, '```\n$1\n```')
|
||||
// Clean up any remaining HTML tags
|
||||
.replace(/<[^>]*>/g, '')
|
||||
// Clean up excessive newlines
|
||||
.replace(/\n{3,}/g, '\n\n');
|
||||
let markdown = cleaned;
|
||||
|
||||
// Apply all HTML to Markdown patterns
|
||||
const patterns = HTML_TO_MARKDOWN_PATTERNS;
|
||||
for (const pattern of Object.values(patterns)) {
|
||||
markdown = markdown.replace(pattern.pattern, pattern.replacement);
|
||||
}
|
||||
|
||||
// Process list items
|
||||
markdown = this.processListItems(markdown);
|
||||
|
||||
// Fix common HTML entities
|
||||
markdown = markdown
|
||||
.replace(/ /g, ' ')
|
||||
.replace(/</g, '<')
|
||||
.replace(/>/g, '>')
|
||||
.replace(/&/g, '&')
|
||||
.replace(/"/g, '"')
|
||||
.replace(/'/g, "'")
|
||||
.replace(/“/g, '"')
|
||||
.replace(/”/g, '"')
|
||||
.replace(/‘/g, "'")
|
||||
.replace(/’/g, "'")
|
||||
.replace(/—/g, '—')
|
||||
.replace(/–/g, '–')
|
||||
.replace(/…/g, '…');
|
||||
const entityPatterns = HTML_ENTITY_REPLACEMENTS;
|
||||
for (const pattern of Object.values(entityPatterns)) {
|
||||
markdown = markdown.replace(pattern.pattern, pattern.replacement);
|
||||
}
|
||||
|
||||
return markdown.trim();
|
||||
} catch (error) {
|
||||
console.error("Error cleaning context content:", error);
|
||||
console.error(FORMATTER_LOGS.ERROR.CONTEXT_CLEANING("Base"), error);
|
||||
return content; // Return original if cleaning fails
|
||||
}
|
||||
}
|
||||
@ -133,28 +113,18 @@ export abstract class BaseMessageFormatter implements MessageFormatter {
|
||||
|
||||
try {
|
||||
// Fix common encoding issues
|
||||
return content
|
||||
// Fix broken quote characters
|
||||
.replace(/Γ\u00c2[\u00a3\u00a5]/g, '"')
|
||||
// Fix other common broken unicode
|
||||
.replace(/[\u{0080}-\u{FFFF}]/gu, (match) => {
|
||||
// Some common replacements
|
||||
const replacements: Record<string, string> = {
|
||||
'\u00A0': ' ', // Non-breaking space
|
||||
'\u2018': "'", // Left single quote
|
||||
'\u2019': "'", // Right single quote
|
||||
'\u201C': '"', // Left double quote
|
||||
'\u201D': '"', // Right double quote
|
||||
'\u2013': '-', // En dash
|
||||
'\u2014': '--', // Em dash
|
||||
'\u2022': '*', // Bullet
|
||||
'\u2026': '...' // Ellipsis
|
||||
};
|
||||
let fixed = content.replace(ENCODING_FIXES.BROKEN_QUOTES.pattern, ENCODING_FIXES.BROKEN_QUOTES.replacement);
|
||||
|
||||
return replacements[match] || match;
|
||||
});
|
||||
// Fix other common broken unicode
|
||||
fixed = fixed.replace(/[\u{0080}-\u{FFFF}]/gu, (match) => {
|
||||
// Use replacements from constants
|
||||
const replacements = ENCODING_FIXES.UNICODE_REPLACEMENTS;
|
||||
return replacements[match as keyof typeof replacements] || match;
|
||||
});
|
||||
|
||||
return fixed;
|
||||
} catch (error) {
|
||||
console.error('Error fixing encoding issues:', error);
|
||||
console.error(FORMATTER_LOGS.ERROR.ENCODING, error);
|
||||
return content; // Return original if fixing fails
|
||||
}
|
||||
}
|
||||
|
@ -3,6 +3,12 @@ import { BaseMessageFormatter } from './base_formatter.js';
|
||||
import sanitizeHtml from 'sanitize-html';
|
||||
import { PROVIDER_PROMPTS, FORMATTING_PROMPTS } from '../constants/llm_prompt_constants.js';
|
||||
import { LLM_CONSTANTS } from '../constants/provider_constants.js';
|
||||
import {
|
||||
HTML_ALLOWED_TAGS,
|
||||
HTML_ALLOWED_ATTRIBUTES,
|
||||
OLLAMA_CLEANING,
|
||||
FORMATTER_LOGS
|
||||
} from '../constants/formatter_constants.js';
|
||||
|
||||
/**
|
||||
* Ollama-specific message formatter
|
||||
@ -66,7 +72,7 @@ export class OllamaMessageFormatter extends BaseMessageFormatter {
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`Ollama formatter processed ${messages.length} messages into ${formattedMessages.length} messages`);
|
||||
console.log(FORMATTER_LOGS.OLLAMA.PROCESSED(messages.length, formattedMessages.length));
|
||||
|
||||
return formattedMessages;
|
||||
}
|
||||
@ -85,30 +91,20 @@ export class OllamaMessageFormatter extends BaseMessageFormatter {
|
||||
// Then apply Ollama-specific aggressive cleaning
|
||||
// Remove any remaining HTML using sanitizeHtml
|
||||
let plaintext = sanitizeHtml(sanitized, {
|
||||
allowedTags: [],
|
||||
allowedAttributes: {},
|
||||
allowedTags: HTML_ALLOWED_TAGS.NONE,
|
||||
allowedAttributes: HTML_ALLOWED_ATTRIBUTES.NONE,
|
||||
textFilter: (text) => text
|
||||
});
|
||||
|
||||
// Then aggressively sanitize to plain ASCII and simple formatting
|
||||
plaintext = plaintext
|
||||
// Replace common problematic quotes with simple ASCII quotes
|
||||
.replace(/[""]/g, '"')
|
||||
.replace(/['']/g, "'")
|
||||
// Replace other common Unicode characters
|
||||
.replace(/[–—]/g, '-')
|
||||
.replace(/[•]/g, '*')
|
||||
.replace(/[…]/g, '...')
|
||||
// Strip all non-ASCII characters
|
||||
.replace(/[^\x00-\x7F]/g, '')
|
||||
// Normalize whitespace
|
||||
.replace(/\s+/g, ' ')
|
||||
.replace(/\n\s+/g, '\n')
|
||||
.trim();
|
||||
// Apply all Ollama-specific cleaning patterns
|
||||
const ollamaPatterns = OLLAMA_CLEANING;
|
||||
for (const pattern of Object.values(ollamaPatterns)) {
|
||||
plaintext = plaintext.replace(pattern.pattern, pattern.replacement);
|
||||
}
|
||||
|
||||
return plaintext;
|
||||
return plaintext.trim();
|
||||
} catch (error) {
|
||||
console.error("Error cleaning context content for Ollama:", error);
|
||||
console.error(FORMATTER_LOGS.ERROR.CONTEXT_CLEANING("Ollama"), error);
|
||||
return content; // Return original if cleaning fails
|
||||
}
|
||||
}
|
||||
|
@ -3,6 +3,13 @@ import type { Message } from '../ai_interface.js';
|
||||
import { BaseMessageFormatter } from './base_formatter.js';
|
||||
import { PROVIDER_PROMPTS, FORMATTING_PROMPTS } from '../constants/llm_prompt_constants.js';
|
||||
import { LLM_CONSTANTS } from '../constants/provider_constants.js';
|
||||
import {
|
||||
HTML_ALLOWED_TAGS,
|
||||
HTML_ALLOWED_ATTRIBUTES,
|
||||
HTML_TO_MARKDOWN_PATTERNS,
|
||||
HTML_ENTITY_REPLACEMENTS,
|
||||
FORMATTER_LOGS
|
||||
} from '../constants/formatter_constants.js';
|
||||
|
||||
/**
|
||||
* OpenAI-specific message formatter
|
||||
@ -72,7 +79,7 @@ export class OpenAIMessageFormatter extends BaseMessageFormatter {
|
||||
});
|
||||
}
|
||||
|
||||
console.log(`OpenAI formatter: ${messages.length} messages → ${formattedMessages.length} messages`);
|
||||
console.log(FORMATTER_LOGS.OPENAI.PROCESSED(messages.length, formattedMessages.length));
|
||||
return formattedMessages;
|
||||
}
|
||||
|
||||
@ -86,58 +93,24 @@ export class OpenAIMessageFormatter extends BaseMessageFormatter {
|
||||
try {
|
||||
// Convert HTML to Markdown for better readability
|
||||
const cleaned = sanitizeHtml(content, {
|
||||
allowedTags: FORMATTING_PROMPTS.HTML_ALLOWED_TAGS,
|
||||
allowedAttributes: {
|
||||
'a': ['href']
|
||||
},
|
||||
transformTags: {
|
||||
'h1': 'h2',
|
||||
'h2': 'h3',
|
||||
'div': 'p',
|
||||
'span': 'span'
|
||||
}
|
||||
allowedTags: HTML_ALLOWED_TAGS.STANDARD,
|
||||
allowedAttributes: HTML_ALLOWED_ATTRIBUTES.STANDARD
|
||||
});
|
||||
|
||||
// Process inline elements to markdown with simpler approach
|
||||
let markdown = cleaned
|
||||
.replace(/<h1[^>]*>(.*?)<\/h1>/gi, '# $1\n')
|
||||
.replace(/<h2[^>]*>(.*?)<\/h2>/gi, '## $1\n')
|
||||
.replace(/<h3[^>]*>(.*?)<\/h3>/gi, '### $1\n')
|
||||
.replace(/<h4[^>]*>(.*?)<\/h4>/gi, '#### $1\n')
|
||||
.replace(/<h5[^>]*>(.*?)<\/h5>/gi, '##### $1\n')
|
||||
.replace(/<p[^>]*>(.*?)<\/p>/gi, '$1\n\n')
|
||||
.replace(/<br[^>]*>/gi, '\n')
|
||||
.replace(/<a[^>]*href=["'](.*?)["'][^>]*>(.*?)<\/a>/gi, '[$2]($1)')
|
||||
.replace(/<strong[^>]*>(.*?)<\/strong>/gi, '**$1**')
|
||||
.replace(/<b[^>]*>(.*?)<\/b>/gi, '**$1**')
|
||||
.replace(/<em[^>]*>(.*?)<\/em>/gi, '*$1*')
|
||||
.replace(/<i[^>]*>(.*?)<\/i>/gi, '*$1*')
|
||||
.replace(/<code[^>]*>(.*?)<\/code>/gi, '`$1`')
|
||||
.replace(/<pre[^>]*>(.*?)<\/pre>/gi, '```\n$1\n```')
|
||||
// Clean up any remaining HTML tags
|
||||
.replace(/<[^>]*>/g, '')
|
||||
// Clean up excessive newlines
|
||||
.replace(/\n{3,}/g, '\n\n');
|
||||
// Apply all HTML to Markdown patterns
|
||||
let markdown = cleaned;
|
||||
for (const pattern of Object.values(HTML_TO_MARKDOWN_PATTERNS)) {
|
||||
markdown = markdown.replace(pattern.pattern, pattern.replacement);
|
||||
}
|
||||
|
||||
// Fix common HTML entities
|
||||
markdown = markdown
|
||||
.replace(/ /g, ' ')
|
||||
.replace(/</g, '<')
|
||||
.replace(/>/g, '>')
|
||||
.replace(/&/g, '&')
|
||||
.replace(/"/g, '"')
|
||||
.replace(/'/g, "'")
|
||||
.replace(/“/g, '"')
|
||||
.replace(/”/g, '"')
|
||||
.replace(/‘/g, "'")
|
||||
.replace(/’/g, "'")
|
||||
.replace(/—/g, '—')
|
||||
.replace(/–/g, '–')
|
||||
.replace(/…/g, '…');
|
||||
for (const pattern of Object.values(HTML_ENTITY_REPLACEMENTS)) {
|
||||
markdown = markdown.replace(pattern.pattern, pattern.replacement);
|
||||
}
|
||||
|
||||
return markdown.trim();
|
||||
} catch (error) {
|
||||
console.error("Error cleaning content for OpenAI:", error);
|
||||
console.error(FORMATTER_LOGS.ERROR.CONTEXT_CLEANING("OpenAI"), error);
|
||||
return content; // Return original if cleaning fails
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user