From 0985cec8d6d9600df11060d9524b05544e90659b Mon Sep 17 00:00:00 2001
From: perf3ct
Date: Tue, 11 Mar 2025 18:17:41 +0000
Subject: [PATCH] implement chunking and use becca for some functionality

---
 src/services/llm/context_extractor.ts       | 149 ++++++++++----
 src/services/llm/embeddings/vector_store.ts | 203 +++++++++++++++++---
 2 files changed, 282 insertions(+), 70 deletions(-)

diff --git a/src/services/llm/context_extractor.ts b/src/services/llm/context_extractor.ts
index 3c9115cf7..e631e7ec1 100644
--- a/src/services/llm/context_extractor.ts
+++ b/src/services/llm/context_extractor.ts
@@ -1,5 +1,6 @@
 import sql from '../sql.js';
 import sanitizeHtml from 'sanitize-html';
+import becca from '../../becca/becca.js';
 
 /**
  * Utility class for extracting context from notes to provide to AI models
  */
@@ -10,19 +11,27 @@ export class ContextExtractor {
      * Get the content of a note
      */
     async getNoteContent(noteId: string): Promise<string | null> {
-        const note = await sql.getRow<{content: string, type: string, mime: string, title: string}>(
-            `SELECT note_contents.content, notes.type, notes.mime, notes.title
-             FROM notes
-             JOIN note_contents ON notes.noteId = note_contents.noteId
-             WHERE notes.noteId = ?`,
-            [noteId]
-        );
+        // Use Becca API to get note data
+        const note = becca.getNote(noteId);
 
         if (!note) {
             return null;
         }
 
-        return this.formatNoteContent(note.content, note.type, note.mime, note.title);
+        try {
+            // Get content using Becca API
+            const content = String(await note.getContent() || "");
+
+            return this.formatNoteContent(
+                content,
+                note.type,
+                note.mime,
+                note.title
+            );
+        } catch (error) {
+            console.error(`Error getting content for note ${noteId}:`, error);
+            return null;
+        }
     }
 
     /**
@@ -181,13 +190,27 @@ export class ContextExtractor {
      * Get a set of parent notes to provide hierarchical context
      */
     async getParentContext(noteId: string, maxDepth = 3): Promise<string> {
+        // Note: getParentNotes has already been updated to use Becca
         const parents = await this.getParentNotes(noteId, maxDepth);
 
         if (!parents.length) return '';
 
         let context = 'Here is the hierarchical context for the current note:\n\n';
 
-        for (const parent of parents) {
-            context += `- ${parent.title}\n`;
+        // Create a hierarchical view of the parents using indentation
+        // to show the proper parent-child relationship
+        let indentLevel = 0;
+        for (let i = 0; i < parents.length; i++) {
+            const parent = parents[i];
+            const indent = ' '.repeat(indentLevel);
+            context += `${indent}- ${parent.title}\n`;
+            indentLevel++;
+        }
+
+        // Now add the current note with proper indentation
+        const note = becca.getNote(noteId);
+        if (note) {
+            const indent = ' '.repeat(indentLevel);
+            context += `${indent}- ${note.title} (current note)\n`;
         }
 
         return context + '\n';
     }
 
     /**
      * Get child notes to provide additional context
      */
     async getChildContext(noteId: string, maxChildren = 5): Promise<string> {
-        const children = await sql.getRows<{noteId: string, title: string}>(
-            `SELECT noteId, title FROM notes
-             WHERE parentNoteId = ? AND isDeleted = 0
-             LIMIT ?`,
-            [noteId, maxChildren]
-        );
+        const note = becca.getNote(noteId);
 
-        if (!children.length) return '';
+        if (!note) {
+            return '';
+        }
+
+        // Use Becca API to get child notes
+        const childNotes = note.getChildNotes();
+
+        if (!childNotes || childNotes.length === 0) {
+            return '';
+        }
 
         let context = 'The current note has these child notes:\n\n';
 
-        for (const child of children) {
+        // Limit to maxChildren
+        const childrenToShow = childNotes.slice(0, maxChildren);
+
+        for (const child of childrenToShow) {
             context += `- ${child.title}\n`;
         }
 
+        // If there are more children than we're showing, indicate that
+        if (childNotes.length > maxChildren) {
+            context += `\n(+ ${childNotes.length - maxChildren} more child notes)\n`;
+        }
+
         return context + '\n';
     }
 
     /**
      * Get notes linked to this note
      */
     async getLinkedNotesContext(noteId: string, maxLinks = 5): Promise<string> {
-        const linkedNotes = await sql.getRows<{title: string}>(
-            `SELECT title FROM notes
-             WHERE noteId IN (
-                SELECT value FROM attributes
-                WHERE noteId = ? AND type = 'relation'
-                LIMIT ?
-             )`,
-            [noteId, maxLinks]
-        );
+        const note = becca.getNote(noteId);
 
-        if (!linkedNotes.length) return '';
+        if (!note) {
+            return '';
+        }
+
+        // Use Becca API to get relations
+        const relations = note.getRelations();
+
+        if (!relations || relations.length === 0) {
+            return '';
+        }
+
+        // Get the target notes from relations
+        const linkedNotes = relations
+            .map(relation => relation.targetNote)
+            .filter(note => note !== null && note !== undefined);
+
+        if (linkedNotes.length === 0) {
+            return '';
+        }
 
         let context = 'This note has relationships with these notes:\n\n';
 
-        for (const linked of linkedNotes) {
+        // Limit to maxLinks
+        const notesToShow = linkedNotes.slice(0, maxLinks);
+
+        for (const linked of notesToShow) {
             context += `- ${linked.title}\n`;
         }
 
+        // If there are more linked notes than we're showing, indicate that
+        if (linkedNotes.length > maxLinks) {
+            context += `\n(+ ${linkedNotes.length - maxLinks} more linked notes)\n`;
+        }
+
         return context + '\n';
     }
 
@@ -669,27 +722,41 @@ export class ContextExtractor {
      */
     private async getParentNotes(noteId: string, maxDepth: number): Promise<{noteId: string, title: string}[]> {
         const parentNotes: {noteId: string, title: string}[] = [];
-        let currentNoteId = noteId;
+        const startNote = becca.getNote(noteId);
+
+        if (!startNote) {
+            return parentNotes;
+        }
+
+        // Use non-null assertion as we checked above
+        let currentNote: any = startNote;
 
         for (let i = 0; i < maxDepth; i++) {
-            const parent = await sql.getRow<{parentNoteId: string, title: string}>(
-                `SELECT branches.parentNoteId, notes.title
-                 FROM branches
-                 JOIN notes ON branches.parentNoteId = notes.noteId
-                 WHERE branches.noteId = ? AND branches.isDeleted = 0 LIMIT 1`,
-                [currentNoteId]
-            );
+            // Get parent branches (should be just one in most cases)
+            if (!currentNote) break;
 
-            if (!parent || parent.parentNoteId === 'root') {
+            const parentBranches: any[] = currentNote.getParentBranches();
+
+            if (!parentBranches || parentBranches.length === 0) {
+                break;
+            }
+
+            // Use the first parent branch
+            const branch: any = parentBranches[0];
+            if (!branch) break;
+
+            const parentNote: any = branch.getParentNote();
+
+            if (!parentNote || parentNote.noteId === 'root') {
                 break;
             }
 
             parentNotes.unshift({
-                noteId: parent.parentNoteId,
-                title: parent.title
+                noteId: parentNote.noteId,
+                title: parentNote.title
             });
 
-            currentNoteId = parent.parentNoteId;
+            currentNote = parentNote;
         }
 
         return parentNotes;

diff --git a/src/services/llm/embeddings/vector_store.ts b/src/services/llm/embeddings/vector_store.ts
index a384fc524..9d9584aa8 100644
--- a/src/services/llm/embeddings/vector_store.ts
+++ b/src/services/llm/embeddings/vector_store.ts
@@ -410,25 +410,69 @@ export async function getNoteEmbeddingContext(noteId: string): Promise<NoteEmbeddingContext> {
+            if (content.length > 10000) {
+                // Large content handling options:
+
+                // Option 1: Use our summarization feature
+                const summary = await contextExtractor.getNoteSummary(noteId);
+                if (summary) {
+                    content = summary;
+                }
+
+                // Option 2: Alternative approach - use the first chunk if summarization fails
+                if (content.length > 10000) {
+                    const chunks = await contextExtractor.getChunkedNoteContent(noteId);
+                    if (chunks && chunks.length > 0) {
+                        // Use the first chunk (most relevant/beginning)
+                        content = chunks[0];
+                    }
+                }
+            }
+        }
+    } else {
+        // Fallback to original method if context extractor fails
+        const rawContent = String(await note.getContent() || "");
+
+        // Process the content based on note type to extract meaningful text
+        if (note.type === 'text' || note.type === 'code') {
+            content = rawContent;
+        } else if (['canvas', 'mindMap', 'relationMap', 'mermaid', 'geoMap'].includes(note.type)) {
+            // Process structured content types
+            content = extractStructuredContent(rawContent, note.type, note.mime);
+        } else if (note.type === 'image' || note.type === 'file') {
+            content = `[${note.type} attachment: ${note.mime}]`;
+        }
+
+        // Clean the content to remove HTML tags and normalize whitespace
+        content = cleanNoteContent(content, note.type, note.mime);
     }
 } catch (err) {
     console.error(`Error getting content for note ${noteId}:`, err);
     content = `[Error extracting content]`;
-}
 
-// Clean the content to remove HTML tags and normalize whitespace
-content = cleanNoteContent(content, note.type, note.mime);
+    // Try fallback to original method
+    try {
+        const rawContent = String(await note.getContent() || "");
+        if (note.type === 'text' || note.type === 'code') {
+            content = rawContent;
+        } else if (['canvas', 'mindMap', 'relationMap', 'mermaid', 'geoMap'].includes(note.type)) {
+            content = extractStructuredContent(rawContent, note.type, note.mime);
+        }
+        content = cleanNoteContent(content, note.type, note.mime);
+    } catch (fallbackErr) {
+        console.error(`Fallback content extraction also failed for note ${noteId}:`, fallbackErr);
+    }
+}
 
 // Get template/inheritance relationships
 // This is from FNote.getNotesToInheritAttributesFrom - recreating similar logic for BNote
@@ -490,19 +534,35 @@ export async function queueNoteForEmbedding(noteId: string, operation = 'UPDATE') {
 }
 
 /**
- * Deletes all embeddings for a note
+ * Delete embeddings for a note
+ *
+ * @param noteId - The ID of the note
+ * @param providerId - Optional provider ID to delete embeddings only for a specific provider
+ * @param modelId - Optional model ID to delete embeddings only for a specific model
  */
-export async function deleteNoteEmbeddings(noteId: string) {
-    await sql.execute(
-        "DELETE FROM note_embeddings WHERE noteId = ?",
-        [noteId]
-    );
+export async function deleteNoteEmbeddings(noteId: string, providerId?: string, modelId?: string) {
+    let query = "DELETE FROM note_embeddings WHERE noteId = ?";
+    const params: any[] = [noteId];
 
-    // Remove from queue if present
-    await sql.execute(
-        "DELETE FROM embedding_queue WHERE noteId = ?",
-        [noteId]
-    );
+    if (providerId) {
+        query += " AND providerId = ?";
+        params.push(providerId);
+
+        if (modelId) {
+            query += " AND modelId = ?";
+            params.push(modelId);
+        }
+    }
+
+    await sql.execute(query, params);
+
+    // Only remove from queue if deleting all embeddings for the note
+    if (!providerId) {
+        await sql.execute(
+            "DELETE FROM embedding_queue WHERE noteId = ?",
+            [noteId]
+        );
+    }
 }
 
 /**
@@ -559,15 +619,28 @@ export async function processEmbeddingQueue() {
             // Get note context for embedding
             const context = await getNoteEmbeddingContext(noteData.noteId);
 
+            // Check if we should use chunking for large content
+            const useChunking = context.content.length > 5000; // Use chunking for large notes by default
+
             // Process with each enabled provider
             for (const provider of enabledProviders) {
                 try {
-                    // Generate embedding
-                    const embedding = await provider.generateNoteEmbeddings(context);
+                    if (useChunking) {
+                        // Enhanced approach: Process large notes using chunking
+                        await processNoteWithChunking(noteData.noteId, provider, context);
+                    } else {
+                        // Standard approach: Generate a single embedding for the whole note
+                        const embedding = await provider.generateNoteEmbeddings(context);
 
-                    // Store embedding
-                    const config = provider.getConfig();
-                    await storeNoteEmbedding(noteData.noteId, provider.name, config.model, embedding);
+                        // Store embedding
+                        const config = provider.getConfig();
+                        await storeNoteEmbedding(
+                            noteData.noteId,
+                            provider.name,
+                            config.model,
+                            embedding
+                        );
+                    }
                 } catch (providerError: any) {
                     log.error(`Error generating embedding with provider ${provider.name} for note ${noteData.noteId}: ${providerError.message || 'Unknown error'}`);
                 }
@@ -748,6 +821,78 @@ export async function getEmbeddingStats() {
     };
 }
 
+/**
+ * Process a large note by breaking it into chunks and creating embeddings for each chunk
+ * This provides more detailed and focused embeddings for different parts of large notes
+ *
+ * @param noteId - The ID of the note to process
+ * @param provider - The embedding provider to use
+ * @param context - The note context data
+ */
+async function processNoteWithChunking(
+    noteId: string,
+    provider: any,
+    context: NoteEmbeddingContext
+): Promise<void> {
+    try {
+        // Get the context extractor dynamically to avoid circular dependencies
+        const { default: contextExtractor } = await import('../../llm/context_extractor.js');
+
+        // Get chunks of the note content
+        const chunks = await contextExtractor.getChunkedNoteContent(noteId);
+
+        if (!chunks || chunks.length === 0) {
+            // Fall back to single embedding if chunking fails
+            const embedding = await provider.generateNoteEmbeddings(context);
+            const config = provider.getConfig();
+            await storeNoteEmbedding(noteId, provider.name, config.model, embedding);
+            return;
+        }
+
+        // Generate and store embeddings for each chunk
+        const config = provider.getConfig();
+
+        // Delete existing embeddings first to avoid duplicates
+        await deleteNoteEmbeddings(noteId, provider.name, config.model);
+
+        // Process each chunk with a slight delay to avoid rate limits
+        for (let i = 0; i < chunks.length; i++) {
+            const chunk = chunks[i];
+
+            // Create a modified context object with just this chunk's content
+            const chunkContext: NoteEmbeddingContext = {
+                ...context,
+                content: chunk
+            };
+
+            // Generate embedding for this chunk
+            const embedding = await provider.generateNoteEmbeddings(chunkContext);
+
+            // Store with chunk information
+            await storeNoteEmbedding(
+                noteId,
+                provider.name,
+                config.model,
+                embedding
+            );
+
+            // Small delay between chunks to avoid rate limits
+            if (i < chunks.length - 1) {
+                await new Promise(resolve => setTimeout(resolve, 100));
+            }
+        }
+
+        log.info(`Generated ${chunks.length} chunk embeddings for note ${noteId}`);
+    } catch (error: any) {
+        log.error(`Error in chunked embedding process for note ${noteId}: ${error.message || 'Unknown error'}`);
+        throw error;
+    }
+}
+
+export function cleanupEmbeddings() {
+    // Cleanup function implementation
+}
+
 export default {
     cosineSimilarity,
     embeddingToBuffer,