diff --git a/src/services/llm/context_extractor.ts b/src/services/llm/context_extractor.ts index 875b4307c..d12fcd97c 100644 --- a/src/services/llm/context_extractor.ts +++ b/src/services/llm/context_extractor.ts @@ -102,6 +102,107 @@ export class ContextExtractor { // Format code notes with code blocks formattedContent += '```\n' + content + '\n```'; break; + case 'canvas': + case 'mindMap': + case 'relationMap': + case 'geoMap': + if (mime === 'application/json') { + try { + // Parse JSON content + const jsonContent = JSON.parse(content); + + if (type === 'canvas') { + // Extract text elements from canvas + if (jsonContent.elements && Array.isArray(jsonContent.elements)) { + const texts = jsonContent.elements + .filter((element: any) => element.type === 'text' && element.text) + .map((element: any) => element.text); + + formattedContent += 'Canvas content:\n' + texts.join('\n'); + break; + } + } + else if (type === 'mindMap') { + // Extract node text from mind map + const extractMindMapNodes = (node: any): string[] => { + let texts: string[] = []; + if (node.text) { + texts.push(node.text); + } + if (node.children && Array.isArray(node.children)) { + for (const child of node.children) { + texts = texts.concat(extractMindMapNodes(child)); + } + } + return texts; + }; + + if (jsonContent.root) { + formattedContent += 'Mind map content:\n' + extractMindMapNodes(jsonContent.root).join('\n'); + break; + } + } + else if (type === 'relationMap') { + // Extract relation map entities and connections + let result = 'Relation map content:\n'; + + if (jsonContent.notes && Array.isArray(jsonContent.notes)) { + result += 'Notes: ' + jsonContent.notes + .map((note: any) => note.title || note.name) + .filter(Boolean) + .join(', ') + '\n'; + } + + if (jsonContent.relations && Array.isArray(jsonContent.relations)) { + result += 'Relations: ' + jsonContent.relations + .map((rel: any) => { + const sourceNote = jsonContent.notes.find((n: any) => n.noteId === rel.sourceNoteId); + 
const targetNote = jsonContent.notes.find((n: any) => n.noteId === rel.targetNoteId); + const source = sourceNote ? (sourceNote.title || sourceNote.name) : 'unknown'; + const target = targetNote ? (targetNote.title || targetNote.name) : 'unknown'; + return `${source} → ${rel.name || ''} → ${target}`; + }) + .join('; '); + } + + formattedContent += result; + break; + } + else if (type === 'geoMap') { + let result = 'Geographic map content:\n'; + + if (jsonContent.markers && Array.isArray(jsonContent.markers)) { + result += jsonContent.markers + .map((marker: any) => { + return `Location: ${marker.title || ''} (${marker.lat}, ${marker.lng})${marker.description ? ' - ' + marker.description : ''}`; + }) + .join('\n'); + } + + formattedContent += result || 'Empty geographic map'; + break; + } + } + catch (e: any) { + formattedContent += `[Error parsing ${type} content: ${e.message}]`; + break; + } + } + + // If JSON parsing or specific handling failed, use default handling + formattedContent += `[${type} content]`; + break; + + case 'mermaid': + // Format mermaid diagrams as code blocks + formattedContent += '```mermaid\n' + content + '\n```'; + break; + + case 'image': + case 'file': + formattedContent += `[${type} attachment]`; + break; + default: // For other notes, just use the content as is formattedContent += this.sanitizeHtml(content); @@ -114,7 +215,10 @@ export class ContextExtractor { * Sanitize HTML content to plain text */ private sanitizeHtml(html: string): string { - return sanitizeHtml(html, { + if (!html) return ''; + + // Use sanitizeHtml to remove all HTML tags + let content = sanitizeHtml(html, { allowedTags: [], allowedAttributes: {}, textFilter: (text) => { @@ -122,6 +226,17 @@ export class ContextExtractor { return text.replace(/\n\s*\n/g, '\n\n'); } }); + + // Additional cleanup for any remaining HTML entities + content = content + .replace(/ /g, ' ') + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/&/g, '&') + .replace(/"/g, '"') + 
.replace(/'/g, "'"); + + return content; } /** diff --git a/src/services/llm/embeddings/vector_store.ts b/src/services/llm/embeddings/vector_store.ts index b3c2b2f6e..a384fc524 100644 --- a/src/services/llm/embeddings/vector_store.ts +++ b/src/services/llm/embeddings/vector_store.ts @@ -8,6 +8,7 @@ import type { NoteEmbeddingContext } from "./embeddings_interface.js"; import { getEmbeddingProviders, getEnabledEmbeddingProviders } from "./providers.js"; import eventService from "../../events.js"; import type BNote from "../../../becca/entities/bnote.js"; +import sanitizeHtml from "sanitize-html"; // Type definition for embedding result interface EmbeddingResult { @@ -183,19 +184,37 @@ export async function findSimilarNotes( * Clean note content by removing HTML tags and normalizing whitespace */ function cleanNoteContent(content: string, type: string, mime: string): string { + if (!content) return ''; + // If it's HTML content, remove HTML tags if ((type === 'text' && mime === 'text/html') || content.includes('
') || content.includes('

/**
 * Type guard for a non-null object as produced by `JSON.parse`
 * (arrays also pass; callers only read named properties from the result).
 */
function isJsonObject(value: unknown): value is Record<string, unknown> {
    return typeof value === 'object' && value !== null;
}

/**
 * Joins the text of every canvas element whose `type` is `'text'`.
 * Returns `undefined` when the document has no `elements` array.
 */
function extractCanvasText(json: Record<string, unknown>): string | undefined {
    const elements = json.elements;
    if (!Array.isArray(elements)) {
        return undefined;
    }

    const texts: string[] = [];
    for (const element of elements) {
        if (isJsonObject(element) && element.type === 'text' && element.text) {
            texts.push(String(element.text));
        }
    }
    return texts.join('\n');
}

/**
 * Appends the text of `node` and of all its descendants to `out`, parent first
 * and children in document order. Non-object nodes are ignored.
 */
function collectMindMapText(node: unknown, out: string[]): void {
    if (!isJsonObject(node)) {
        return;
    }
    if (node.text) {
        out.push(String(node.text));
    }

    const children = node.children;
    if (Array.isArray(children)) {
        for (const child of children) {
            collectMindMapText(child, out);
        }
    }
}

/**
 * Describes a relation map as a "Notes: ..." line followed by a
 * "Relations: ..." line; each line is emitted only when its array is present.
 */
function extractRelationMapText(json: Record<string, unknown>): string {
    const labelOf = (note: Record<string, unknown>): unknown => note.title || note.name;

    const rawNotes = json.notes;
    const rawRelations = json.relations;
    const notes: Record<string, unknown>[] = Array.isArray(rawNotes) ? rawNotes.filter(isJsonObject) : [];

    let result = '';

    if (Array.isArray(rawNotes)) {
        result += 'Notes: ' + notes.map(labelOf).filter(Boolean).join(', ') + '\n';
    }

    if (Array.isArray(rawRelations)) {
        // Index notes once instead of scanning the list for every relation.
        // The first note with a given id wins, matching a linear search.
        const notesById = new Map<unknown, Record<string, unknown>>();
        for (const note of notes) {
            if (!notesById.has(note.noteId)) {
                notesById.set(note.noteId, note);
            }
        }

        // Missing notes (or a missing `notes` array altogether) resolve to 'unknown'
        // rather than aborting the whole extraction.
        const describe = (noteId: unknown): string => {
            const note = notesById.get(noteId);
            return String((note && labelOf(note)) || 'unknown');
        };

        result += 'Relations: ' + rawRelations
            .filter(isJsonObject)
            .map((rel) => `${describe(rel.sourceNoteId)} → ${String(rel.name || '')} → ${describe(rel.targetNoteId)}`)
            .join('; ');
    }

    return result;
}

/**
 * Renders one "Location: title (lat, lng) - description" line per marker.
 * Returns `undefined` when there are no markers to describe.
 */
function extractGeoMapText(json: Record<string, unknown>): string | undefined {
    const markers = json.markers;
    if (!Array.isArray(markers)) {
        return undefined;
    }

    const lines = markers
        .filter(isJsonObject)
        .map((marker) => {
            const description = marker.description ? ` - ${String(marker.description)}` : '';
            return `Location: ${String(marker.title || '')} (${String(marker.lat)}, ${String(marker.lng)})${description}`;
        });

    return lines.join('\n') || undefined;
}

/**
 * Extract content from different note types
 *
 * - `canvas`, `mindMap`, `relationMap`, `geoMap` with mime `application/json`:
 *   the JSON is parsed and its human-readable text is extracted.
 * - `mermaid`: returned as-is (the diagram source is already readable).
 * - `file`, `image`: replaced by a `[<type> attachment]` placeholder.
 * - anything else: returned unchanged.
 *
 * Whenever structured extraction is not possible (other mime type, unexpected
 * JSON shape, invalid JSON) the raw `content` is returned unchanged; parse
 * errors are additionally logged.
 *
 * @param content - raw note content
 * @param type - note type (e.g. `canvas`, `mindMap`, `text`)
 * @param mime - note mime type
 * @returns text extracted from the note, or `''` for empty content
 */
function extractStructuredContent(content: string, type: string, mime: string): string {
    if (!content) return '';

    switch (type) {
        case 'mermaid':
            return content;

        case 'file':
        case 'image':
            // For files and images, just return a placeholder
            return `[${type} attachment]`;

        case 'canvas':
        case 'mindMap':
        case 'relationMap':
        case 'geoMap':
            break;

        default:
            return content;
    }

    // `content` is already a string, so it is returned as-is; re-encoding it
    // with JSON.stringify would only wrap it in quotes and escape it.
    if (mime !== 'application/json') {
        return content;
    }

    try {
        const parsed: unknown = JSON.parse(content);
        if (!isJsonObject(parsed)) {
            return content;
        }

        let extracted: string | undefined;
        if (type === 'canvas') {
            extracted = extractCanvasText(parsed);
        } else if (type === 'mindMap') {
            if (parsed.root) {
                const texts: string[] = [];
                collectMindMapText(parsed.root, texts);
                extracted = texts.join('\n');
            }
        } else if (type === 'relationMap') {
            extracted = extractRelationMapText(parsed);
        } else {
            extracted = extractGeoMapText(parsed);
        }

        return extracted ?? content;
    }
    catch (error) {
        console.error(`Error extracting content from ${type} note:`, error);
        return content;
    }
}