Implement chunking and use Becca for some functionality

2025-10-29 11:44:21 +08:00 · 2025-03-11 18:17:41 +00:00 · 2025-03-11 18:17:41 +00:00 · 0985cec8d6
commit 0985cec8d6
parent 4160db9728
2 changed files with 282 additions and 70 deletions
--- a/src/services/llm/context_extractor.ts
+++ b/src/services/llm/context_extractor.ts
@ -1,5 +1,6 @@
 import sql from '../sql.js';
 import sanitizeHtml from 'sanitize-html';
 import becca from '../../becca/becca.js';
 /**
 * Utility class for extracting context from notes to provide to AI models
@ -10,19 +11,27 @@ export class ContextExtractor {
     * Get the content of a note
     */
    async getNoteContent(noteId: string): Promise<string | null> {
-        const note = await sql.getRow<{content: string, type: string, mime: string, title: string}>(
+        // Use Becca API to get note data
-            `SELECT note_contents.content, notes.type, notes.mime, notes.title
+        const note = becca.getNote(noteId);
             FROM notes
             JOIN note_contents ON notes.noteId = note_contents.noteId
             WHERE notes.noteId = ?`,
            [noteId]
        );
        if (!note) {
            return null;
        }
-        return this.formatNoteContent(note.content, note.type, note.mime, note.title);
+        try {
            // Get content using Becca API
            const content = String(await note.getContent() || "");
            return this.formatNoteContent(
                content,
                note.type,
                note.mime,
                note.title
            );
        } catch (error) {
            console.error(`Error getting content for note ${noteId}:`, error);
            return null;
        }
    }
    /**
@ -181,13 +190,27 @@ export class ContextExtractor {
     * Get a set of parent notes to provide hierarchical context
     */
    async getParentContext(noteId: string, maxDepth = 3): Promise<string> {
        // Note: getParentNotes has already been updated to use Becca
        const parents = await this.getParentNotes(noteId, maxDepth);
        if (!parents.length) return '';
        let context = 'Here is the hierarchical context for the current note:\n\n';
-        for (const parent of parents) {
+        // Create a hierarchical view of the parents using indentation
-            context += `- ${parent.title}\n`;
+        // to show the proper parent-child relationship
        let indentLevel = 0;
        for (let i = 0; i < parents.length; i++) {
            const parent = parents[i];
            const indent = '  '.repeat(indentLevel);
            context += `${indent}- ${parent.title}\n`;
            indentLevel++;
        }
        // Now add the current note with proper indentation
        const note = becca.getNote(noteId);
        if (note) {
            const indent = '  '.repeat(indentLevel);
            context += `${indent}- ${note.title} (current note)\n`;
        }
        return context + '\n';
@ -197,21 +220,33 @@ export class ContextExtractor {
     * Get child notes to provide additional context
     */
    async getChildContext(noteId: string, maxChildren = 5): Promise<string> {
-        const children = await sql.getRows<{noteId: string, title: string}>(
+        const note = becca.getNote(noteId);
            `SELECT noteId, title FROM notes
             WHERE parentNoteId = ? AND isDeleted = 0
             LIMIT ?`,
            [noteId, maxChildren]
        );
-        if (!children.length) return '';
+        if (!note) {
            return '';
        }
        // Use Becca API to get child notes
        const childNotes = note.getChildNotes();
        if (!childNotes || childNotes.length === 0) {
            return '';
        }
        let context = 'The current note has these child notes:\n\n';
-        for (const child of children) {
+        // Limit to maxChildren
        const childrenToShow = childNotes.slice(0, maxChildren);
        for (const child of childrenToShow) {
            context += `- ${child.title}\n`;
        }
        // If there are more children than we're showing, indicate that
        if (childNotes.length > maxChildren) {
            context += `\n(+ ${childNotes.length - maxChildren} more child notes)\n`;
        }
        return context + '\n';
    }
@ -219,24 +254,42 @@ export class ContextExtractor {
     * Get notes linked to this note
     */
    async getLinkedNotesContext(noteId: string, maxLinks = 5): Promise<string> {
-        const linkedNotes = await sql.getRows<{title: string}>(
+        const note = becca.getNote(noteId);
            `SELECT title FROM notes
             WHERE noteId IN (
                SELECT value FROM attributes
                WHERE noteId = ? AND type = 'relation'
                LIMIT ?
             )`,
            [noteId, maxLinks]
        );
-        if (!linkedNotes.length) return '';
+        if (!note) {
            return '';
        }
        // Use Becca API to get relations
        const relations = note.getRelations();
        if (!relations || relations.length === 0) {
            return '';
        }
        // Get the target notes from relations
        const linkedNotes = relations
            .map(relation => relation.targetNote)
            .filter(note => note !== null && note !== undefined);
        if (linkedNotes.length === 0) {
            return '';
        }
        let context = 'This note has relationships with these notes:\n\n';
-        for (const linked of linkedNotes) {
+        // Limit to maxLinks
        const notesToShow = linkedNotes.slice(0, maxLinks);
        for (const linked of notesToShow) {
            context += `- ${linked.title}\n`;
        }
        // If there are more linked notes than we're showing, indicate that
        if (linkedNotes.length > maxLinks) {
            context += `\n(+ ${linkedNotes.length - maxLinks} more linked notes)\n`;
        }
        return context + '\n';
    }
@ -669,27 +722,41 @@ export class ContextExtractor {
     */
    private async getParentNotes(noteId: string, maxDepth: number): Promise<{noteId: string, title: string}[]> {
        const parentNotes: {noteId: string, title: string}[] = [];
-        let currentNoteId = noteId;
+        const startNote = becca.getNote(noteId);
        if (!startNote) {
            return parentNotes;
        }
        // startNote was null-checked above; currentNote is reassigned to each parent as the loop walks upward
        let currentNote: any = startNote;
        for (let i = 0; i < maxDepth; i++) {
-            const parent = await sql.getRow<{parentNoteId: string, title: string}>(
+            // Get parent branches (should be just one in most cases)
-                `SELECT branches.parentNoteId, notes.title
+            if (!currentNote) break;
                 FROM branches
                 JOIN notes ON branches.parentNoteId = notes.noteId
                 WHERE branches.noteId = ? AND branches.isDeleted = 0 LIMIT 1`,
                [currentNoteId]
            );
-            if (!parent || parent.parentNoteId === 'root') {
+            const parentBranches: any[] = currentNote.getParentBranches();
            if (!parentBranches || parentBranches.length === 0) {
                break;
            }
            // Use the first parent branch
            const branch: any = parentBranches[0];
            if (!branch) break;
            const parentNote: any = branch.getParentNote();
            if (!parentNote || parentNote.noteId === 'root') {
                break;
            }
            parentNotes.unshift({
-                noteId: parent.parentNoteId,
+                noteId: parentNote.noteId,
-                title: parent.title
+                title: parentNote.title
            });
-            currentNoteId = parent.parentNoteId;
+            currentNote = parentNote;
        }
        return parentNotes;
--- a/src/services/llm/embeddings/vector_store.ts
+++ b/src/services/llm/embeddings/vector_store.ts
@ -410,25 +410,69 @@ export async function getNoteEmbeddingContext(noteId: string): Promise<NoteEmbed
    let content = "";
    try {
-        // Get raw content from the note
+        // Use the enhanced context extractor for improved content extraction
-        const rawContent = String(await note.getContent() || "");
+        // We're using a dynamic import to avoid circular dependencies
        const { default: contextExtractor } = await import('../../llm/context_extractor.js');
-        // Process the content based on note type to extract meaningful text
+        // Get the content using the enhanced formatNoteContent method in context extractor
-        if (note.type === 'text' || note.type === 'code') {
+        const noteContent = await contextExtractor.getNoteContent(noteId);
-            content = rawContent;
+
-        } else if (['canvas', 'mindMap', 'relationMap', 'mermaid', 'geoMap'].includes(note.type)) {
+        if (noteContent) {
-            // Process structured content types
+            content = noteContent;
-            content = extractStructuredContent(rawContent, note.type, note.mime);
+
-        } else if (note.type === 'image' || note.type === 'file') {
+            // For large content, consider chunking or summarization
-            content = `[${note.type} attachment: ${note.mime}]`;
+            if (content.length > 10000) {
                // Large content handling options:
                // Option 1: Use our summarization feature
                const summary = await contextExtractor.getNoteSummary(noteId);
                if (summary) {
                    content = summary;
                }
                // Option 2: Alternative approach - use the first chunk if summarization fails
                if (content.length > 10000) {
                    const chunks = await contextExtractor.getChunkedNoteContent(noteId);
                    if (chunks && chunks.length > 0) {
                        // Use the first chunk (most relevant/beginning)
                        content = chunks[0];
                    }
                }
            }
        } else {
            // Fallback to original method if context extractor fails
            const rawContent = String(await note.getContent() || "");
            // Process the content based on note type to extract meaningful text
            if (note.type === 'text' || note.type === 'code') {
                content = rawContent;
            } else if (['canvas', 'mindMap', 'relationMap', 'mermaid', 'geoMap'].includes(note.type)) {
                // Process structured content types
                content = extractStructuredContent(rawContent, note.type, note.mime);
            } else if (note.type === 'image' || note.type === 'file') {
                content = `[${note.type} attachment: ${note.mime}]`;
            }
            // Clean the content to remove HTML tags and normalize whitespace
            content = cleanNoteContent(content, note.type, note.mime);
        }
    } catch (err) {
        console.error(`Error getting content for note ${noteId}:`, err);
        content = `[Error extracting content]`;
    }
-    // Clean the content to remove HTML tags and normalize whitespace
+        // Try fallback to original method
-    content = cleanNoteContent(content, note.type, note.mime);
+        try {
            const rawContent = String(await note.getContent() || "");
            if (note.type === 'text' || note.type === 'code') {
                content = rawContent;
            } else if (['canvas', 'mindMap', 'relationMap', 'mermaid', 'geoMap'].includes(note.type)) {
                content = extractStructuredContent(rawContent, note.type, note.mime);
            }
            content = cleanNoteContent(content, note.type, note.mime);
        } catch (fallbackErr) {
            console.error(`Fallback content extraction also failed for note ${noteId}:`, fallbackErr);
        }
    }
    // Get template/inheritance relationships
    // This is from FNote.getNotesToInheritAttributesFrom - recreating similar logic for BNote
@ -490,19 +534,35 @@ export async function queueNoteForEmbedding(noteId: string, operation = 'UPDATE'
 }
 /**
- * Deletes all embeddings for a note
+ * Delete embeddings for a note
 *
 * @param noteId - The ID of the note
 * @param providerId - Optional provider ID to delete embeddings only for a specific provider
 * @param modelId - Optional model ID to delete embeddings only for a specific model
 */
-export async function deleteNoteEmbeddings(noteId: string) {
+export async function deleteNoteEmbeddings(noteId: string, providerId?: string, modelId?: string) {
-    await sql.execute(
+    let query = "DELETE FROM note_embeddings WHERE noteId = ?";
-        "DELETE FROM note_embeddings WHERE noteId = ?",
+    const params: any[] = [noteId];
        [noteId]
    );
-    // Remove from queue if present
+    if (providerId) {
-    await sql.execute(
+        query += " AND providerId = ?";
-        "DELETE FROM embedding_queue WHERE noteId = ?",
+        params.push(providerId);
-        [noteId]
+
-    );
+        if (modelId) {
            query += " AND modelId = ?";
            params.push(modelId);
        }
    }
    await sql.execute(query, params);
    // Only remove from queue if deleting all embeddings for the note
    if (!providerId) {
        await sql.execute(
            "DELETE FROM embedding_queue WHERE noteId = ?",
            [noteId]
        );
    }
 }
 /**
@ -559,15 +619,28 @@ export async function processEmbeddingQueue() {
            // Get note context for embedding
            const context = await getNoteEmbeddingContext(noteData.noteId);
            // Check if we should use chunking for large content
            const useChunking = context.content.length > 5000; // Use chunking for large notes by default
            // Process with each enabled provider
            for (const provider of enabledProviders) {
                try {
-                    // Generate embedding
+                    if (useChunking) {
-                    const embedding = await provider.generateNoteEmbeddings(context);
+                        // Enhanced approach: Process large notes using chunking
                        await processNoteWithChunking(noteData.noteId, provider, context);
                    } else {
                        // Standard approach: Generate a single embedding for the whole note
                        const embedding = await provider.generateNoteEmbeddings(context);
-                    // Store embedding
+                        // Store embedding
-                    const config = provider.getConfig();
+                        const config = provider.getConfig();
-                    await storeNoteEmbedding(noteData.noteId, provider.name, config.model, embedding);
+                        await storeNoteEmbedding(
                            noteData.noteId,
                            provider.name,
                            config.model,
                            embedding
                        );
                    }
                } catch (providerError: any) {
                    log.error(`Error generating embedding with provider ${provider.name} for note ${noteData.noteId}: ${providerError.message || 'Unknown error'}`);
                }
@ -748,6 +821,78 @@ export async function getEmbeddingStats() {
    };
 }
 /**
 * Embed a large note chunk by chunk for a single provider.
 *
 * The chunks are fetched by note ID through the context extractor's
 * getChunkedNoteContent(); they are not sliced from `context.content`.
 * When no chunks come back, a single embedding is generated from `context`
 * and stored instead. Otherwise the note's existing embeddings for this
 * provider/model are deleted and one embedding is generated and stored per
 * chunk, with a 100 ms pause between consecutive chunks.
 *
 * NOTE(review): every chunk is stored with the same noteId/provider/model
 * arguments and no chunk identifier. Whether storeNoteEmbedding keeps one row
 * per call or overwrites the previous one is not visible here - confirm.
 *
 * @param noteId - The ID of the note to process
 * @param provider - The embedding provider to use; this function reads `name` and calls
 *                   getConfig() and generateNoteEmbeddings() on it
 * @param context - The note context data; spread into each per-chunk context with `content` replaced
 * @throws Rethrows any error raised while chunking, generating or storing, after logging it
 */
 async function processNoteWithChunking(
    noteId: string,
    provider: any,
    context: NoteEmbeddingContext
 ): Promise<void> {
    try {
        // Get the context extractor dynamically to avoid circular dependencies
        const { default: contextExtractor } = await import('../../llm/context_extractor.js');
        // Get chunks of the note content (looked up by note ID)
        const chunks = await contextExtractor.getChunkedNoteContent(noteId);
        if (!chunks || chunks.length === 0) {
            // Fall back to a single embedding when no chunks are returned.
            // Existing embeddings are not deleted on this path.
            const embedding = await provider.generateNoteEmbeddings(context);
            const config = provider.getConfig();
            await storeNoteEmbedding(noteId, provider.name, config.model, embedding);
            return;
        }
        // Provider config supplies the model ID used for both deletion and storage
        const config = provider.getConfig();
        // Delete existing embeddings for this provider/model first to avoid duplicates.
        // This happens before any chunk embedding is generated, so a failure in the
        // loop below leaves the note without its previous embeddings for this provider/model.
        await deleteNoteEmbeddings(noteId, provider.name, config.model);
        // Process chunks one at a time, pausing briefly between them to avoid rate limits
        for (let i = 0; i < chunks.length; i++) {
            const chunk = chunks[i];
            // Copy the note context, replacing only the content with this chunk
            const chunkContext: NoteEmbeddingContext = {
                ...context,
                content: chunk
            };
            // Generate embedding for this chunk
            const embedding = await provider.generateNoteEmbeddings(chunkContext);
            // Store the chunk's embedding; only noteId/provider/model are passed,
            // so no chunk index accompanies it
            await storeNoteEmbedding(
                noteId,
                provider.name,
                config.model,
                embedding
            );
            // Small delay between chunks to avoid rate limits (skipped after the last chunk)
            if (i < chunks.length - 1) {
                await new Promise(resolve => setTimeout(resolve, 100));
            }
        }
        log.info(`Generated ${chunks.length} chunk embeddings for note ${noteId}`);
    } catch (error: any) {
        // Log with the note ID for context, then rethrow to the caller
        log.error(`Error in chunked embedding process for note ${noteId}: ${error.message || 'Unknown error'}`);
        throw error;
    }
 }
 /**
 * Placeholder for embedding cleanup.
 *
 * The body currently contains no statements, so calling this does nothing.
 */
 export function cleanupEmbeddings() {
    // TODO: not implemented yet - no cleanup is performed
 }
 export default {
    cosineSimilarity,
    embeddingToBuffer,