diff --git a/src/services/llm/context_extractor.ts b/src/services/llm/context_extractor.ts index 875b4307c..d12fcd97c 100644 --- a/src/services/llm/context_extractor.ts +++ b/src/services/llm/context_extractor.ts @@ -102,6 +102,107 @@ export class ContextExtractor { // Format code notes with code blocks formattedContent += '```\n' + content + '\n```'; break; + case 'canvas': + case 'mindMap': + case 'relationMap': + case 'geoMap': + if (mime === 'application/json') { + try { + // Parse JSON content + const jsonContent = JSON.parse(content); + + if (type === 'canvas') { + // Extract text elements from canvas + if (jsonContent.elements && Array.isArray(jsonContent.elements)) { + const texts = jsonContent.elements + .filter((element: any) => element.type === 'text' && element.text) + .map((element: any) => element.text); + + formattedContent += 'Canvas content:\n' + texts.join('\n'); + break; + } + } + else if (type === 'mindMap') { + // Extract node text from mind map + const extractMindMapNodes = (node: any): string[] => { + let texts: string[] = []; + if (node.text) { + texts.push(node.text); + } + if (node.children && Array.isArray(node.children)) { + for (const child of node.children) { + texts = texts.concat(extractMindMapNodes(child)); + } + } + return texts; + }; + + if (jsonContent.root) { + formattedContent += 'Mind map content:\n' + extractMindMapNodes(jsonContent.root).join('\n'); + break; + } + } + else if (type === 'relationMap') { + // Extract relation map entities and connections + let result = 'Relation map content:\n'; + + if (jsonContent.notes && Array.isArray(jsonContent.notes)) { + result += 'Notes: ' + jsonContent.notes + .map((note: any) => note.title || note.name) + .filter(Boolean) + .join(', ') + '\n'; + } + + if (jsonContent.relations && Array.isArray(jsonContent.relations)) { + result += 'Relations: ' + jsonContent.relations + .map((rel: any) => { + const sourceNote = jsonContent.notes.find((n: any) => n.noteId === rel.sourceNoteId); + 
const targetNote = jsonContent.notes.find((n: any) => n.noteId === rel.targetNoteId); + const source = sourceNote ? (sourceNote.title || sourceNote.name) : 'unknown'; + const target = targetNote ? (targetNote.title || targetNote.name) : 'unknown'; + return `${source} → ${rel.name || ''} → ${target}`; + }) + .join('; '); + } + + formattedContent += result; + break; + } + else if (type === 'geoMap') { + let result = 'Geographic map content:\n'; + + if (jsonContent.markers && Array.isArray(jsonContent.markers)) { + result += jsonContent.markers + .map((marker: any) => { + return `Location: ${marker.title || ''} (${marker.lat}, ${marker.lng})${marker.description ? ' - ' + marker.description : ''}`; + }) + .join('\n'); + } + + formattedContent += result || 'Empty geographic map'; + break; + } + } + catch (e: any) { + formattedContent += `[Error parsing ${type} content: ${e.message}]`; + break; + } + } + + // If JSON parsing or specific handling failed, use default handling + formattedContent += `[${type} content]`; + break; + + case 'mermaid': + // Format mermaid diagrams as code blocks + formattedContent += '```mermaid\n' + content + '\n```'; + break; + + case 'image': + case 'file': + formattedContent += `[${type} attachment]`; + break; + default: // For other notes, just use the content as is formattedContent += this.sanitizeHtml(content); @@ -114,7 +215,10 @@ export class ContextExtractor { * Sanitize HTML content to plain text */ private sanitizeHtml(html: string): string { - return sanitizeHtml(html, { + if (!html) return ''; + + // Use sanitizeHtml to remove all HTML tags + let content = sanitizeHtml(html, { allowedTags: [], allowedAttributes: {}, textFilter: (text) => { @@ -122,6 +226,17 @@ export class ContextExtractor { return text.replace(/\n\s*\n/g, '\n\n'); } }); + + // Additional cleanup for any remaining HTML entities + content = content + .replace(/ /g, ' ') + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/&/g, '&') + .replace(/"/g, '"') + 
.replace(/'/g, "'"); + + return content; } /** diff --git a/src/services/llm/embeddings/vector_store.ts b/src/services/llm/embeddings/vector_store.ts index b3c2b2f6e..a384fc524 100644 --- a/src/services/llm/embeddings/vector_store.ts +++ b/src/services/llm/embeddings/vector_store.ts @@ -8,6 +8,7 @@ import type { NoteEmbeddingContext } from "./embeddings_interface.js"; import { getEmbeddingProviders, getEnabledEmbeddingProviders } from "./providers.js"; import eventService from "../../events.js"; import type BNote from "../../../becca/entities/bnote.js"; +import sanitizeHtml from "sanitize-html"; // Type definition for embedding result interface EmbeddingResult { @@ -183,19 +184,37 @@ export async function findSimilarNotes( * Clean note content by removing HTML tags and normalizing whitespace */ function cleanNoteContent(content: string, type: string, mime: string): string { + if (!content) return ''; + // If it's HTML content, remove HTML tags if ((type === 'text' && mime === 'text/html') || content.includes('
')) {
- // Simple tag removal - for more complex HTML parsing, consider using a proper HTML parser
- content = content.replace(/<[^>]*>/g, ' '); // Replace tags with a space
+ // Use sanitizeHtml to remove all HTML tags
+ content = sanitizeHtml(content, {
+ allowedTags: [],
+ allowedAttributes: {},
+ textFilter: (text) => {
+ // Normalize the text, removing excessive whitespace
+ return text.replace(/\s+/g, ' ');
+ }
+ });
}
+ // Additional cleanup for any remaining HTML entities
+ content = content
+ .replace(/ /g, ' ')
+ .replace(/</g, '<')
+ .replace(/>/g, '>')
+ .replace(/&/g, '&')
+ .replace(/"/g, '"')
+ .replace(/'/g, "'");
+
// Normalize whitespace (replace multiple spaces/newlines with single space)
content = content.replace(/\s+/g, ' ');
// Trim the content
content = content.trim();
- // Truncate if extremely long (optional, adjust limit as needed)
+ // Truncate if extremely long
const MAX_CONTENT_LENGTH = 10000;
if (content.length > MAX_CONTENT_LENGTH) {
content = content.substring(0, MAX_CONTENT_LENGTH) + ' [content truncated]';
@@ -204,6 +223,113 @@ function cleanNoteContent(content: string, type: string, mime: string): string {
return content;
}
+/**
+ * Extract embedding-friendly plain text from the raw content of a structured note.
+ *
+ * JSON-backed notes (canvas, mindMap, relationMap, geoMap) are parsed and reduced to
+ * their human-readable parts, mermaid source is returned as-is and file/image notes
+ * yield a placeholder. When nothing can be extracted (other MIME type, unexpected JSON
+ * shape, parse error) the original content is returned unchanged. It is deliberately
+ * not passed through JSON.stringify: content is already a string, so stringifying it
+ * would only wrap it in quotes and escape it.
+ *
+ * @param content raw note content
+ * @param type note type, e.g. 'canvas', 'mindMap', 'relationMap', 'geoMap', 'mermaid'
+ * @param mime note MIME type; JSON extraction only runs for 'application/json'
+ * @returns the extracted text, or content itself as the fallback ('' for empty content)
+ */
+function extractStructuredContent(content: string, type: string, mime: string): string {
+    if (!content) return '';
+
+    // Attachments carry no embeddable text, so a placeholder stands in for them.
+    if (type === 'file' || type === 'image') return `[${type} attachment]`;
+
+    // Everything that is not a JSON-backed structured note (mermaid, text, code, ...)
+    // is already readable and passes through untouched.
+    const isStructured = ['canvas', 'mindMap', 'relationMap', 'geoMap'].includes(type);
+    if (!isStructured || mime !== 'application/json') return content;
+
+    try {
+        const jsonContent = JSON.parse(content);
+        // Valid JSON that is not an object (null, a number, ...) has nothing to extract.
+        if (!jsonContent || typeof jsonContent !== 'object') return content;
+
+        switch (type) {
+            case 'canvas':
+                // Keep the text of every text element, one per line.
+                if (Array.isArray(jsonContent.elements)) {
+                    return jsonContent.elements
+                        .filter((el: any) => el && el.type === 'text' && el.text)
+                        .map((el: any) => el.text)
+                        .join('\n');
+                }
+                break;
+
+            case 'mindMap': {
+                // Depth-first walk collecting the text of the root and all descendants.
+                const collectTexts = (node: any): string[] => {
+                    if (!node) return [];
+                    let texts: string[] = node.text ? [node.text] : [];
+                    if (Array.isArray(node.children)) {
+                        for (const child of node.children) {
+                            texts = texts.concat(collectTexts(child));
+                        }
+                    }
+                    return texts;
+                };
+                if (jsonContent.root) return collectTexts(jsonContent.root).join('\n');
+                break;
+            }
+
+            case 'relationMap': {
+                // Relations may be present without a notes list; fall back to an empty
+                // list so name lookups yield 'unknown' instead of throwing.
+                const notes: any[] = Array.isArray(jsonContent.notes) ? jsonContent.notes : [];
+                const labelOf = (note: any) => note && (note.title || note.name);
+                const labelById = (noteId: unknown): string =>
+                    labelOf(notes.find((n: any) => n && n.noteId === noteId)) || 'unknown';
+
+                let result = '';
+                if (Array.isArray(jsonContent.notes)) {
+                    result += 'Notes: ' + notes.map(labelOf).filter(Boolean).join(', ') + '\n';
+                }
+                if (Array.isArray(jsonContent.relations)) {
+                    result += 'Relations: ' + jsonContent.relations
+                        .filter(Boolean)
+                        .map((rel: any) => {
+                            const source = labelById(rel.sourceNoteId);
+                            const target = labelById(rel.targetNoteId);
+                            return `${source} → ${rel.name || ''} → ${target}`;
+                        })
+                        .join('; ');
+                }
+                return result;
+            }
+
+            case 'geoMap':
+                // One line per marker: title, coordinates and optional description.
+                if (Array.isArray(jsonContent.markers)) {
+                    const lines = jsonContent.markers
+                        .filter(Boolean)
+                        .map((marker: any) => {
+                            const description = marker.description ? ' - ' + marker.description : '';
+                            return `Location: ${marker.title || ''} (${marker.lat}, ${marker.lng})${description}`;
+                        })
+                        .join('\n');
+                    if (lines) return lines;
+                }
+                break;
+        }
+
+        // Unexpected JSON shape for this note type: hand the raw content back.
+        return content;
+    }
+    catch (error) {
+        console.error(`Error extracting content from ${type} note:`, error);
+        return content;
+    }
+}
+
/**
* Gets context for a note to be embedded
*/
@@ -282,12 +408,23 @@ export async function getNoteEmbeddingContext(noteId: string): Promise