Mirror of https://github.com/TriliumNext/Notes.git (synced 2025-07-29 19:12:27 +08:00)

implement chunking and use becca for some functionality

parent 4160db9728 · commit 0985cec8d6
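The new chunking path below relies on contextExtractor.getChunkedNoteContent(noteId), whose implementation is not part of this diff. As a rough illustration of the idea only, here is a minimal sketch of a character-budget splitter that breaks on paragraph boundaries; the helper name splitIntoChunks and the 5000-character budget are assumptions (the budget mirrors the useChunking threshold in processEmbeddingQueue further down), not the actual TriliumNext implementation.

// Hypothetical sketch only: the real getChunkedNoteContent may differ.
function splitIntoChunks(content: string, maxChars = 5000): string[] {
    const chunks: string[] = [];
    let current = "";

    for (const paragraph of content.split(/\n{2,}/)) {
        // Close the current chunk once adding this paragraph would exceed the budget
        if (current && current.length + paragraph.length + 2 > maxChars) {
            chunks.push(current);
            current = "";
        }
        current = current ? current + "\n\n" + paragraph : paragraph;
    }

    if (current) {
        chunks.push(current);
    }

    return chunks;
}

A single paragraph longer than the budget would still come through as one oversized chunk; a real splitter would likely fall back to sentence- or character-level splits in that case.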
@@ -1,5 +1,6 @@
 import sql from '../sql.js';
 import sanitizeHtml from 'sanitize-html';
+import becca from '../../becca/becca.js';
 
 /**
  * Utility class for extracting context from notes to provide to AI models
@@ -10,19 +11,27 @@ export class ContextExtractor {
      * Get the content of a note
      */
     async getNoteContent(noteId: string): Promise<string | null> {
-        const note = await sql.getRow<{content: string, type: string, mime: string, title: string}>(
-            `SELECT note_contents.content, notes.type, notes.mime, notes.title
-             FROM notes
-             JOIN note_contents ON notes.noteId = note_contents.noteId
-             WHERE notes.noteId = ?`,
-            [noteId]
-        );
+        // Use Becca API to get note data
+        const note = becca.getNote(noteId);
 
         if (!note) {
             return null;
         }
 
-        return this.formatNoteContent(note.content, note.type, note.mime, note.title);
+        try {
+            // Get content using Becca API
+            const content = String(await note.getContent() || "");
+
+            return this.formatNoteContent(
+                content,
+                note.type,
+                note.mime,
+                note.title
+            );
+        } catch (error) {
+            console.error(`Error getting content for note ${noteId}:`, error);
+            return null;
+        }
     }
 
     /**
@@ -181,13 +190,27 @@ export class ContextExtractor {
      * Get a set of parent notes to provide hierarchical context
      */
     async getParentContext(noteId: string, maxDepth = 3): Promise<string> {
+        // Note: getParentNotes has already been updated to use Becca
         const parents = await this.getParentNotes(noteId, maxDepth);
         if (!parents.length) return '';
 
         let context = 'Here is the hierarchical context for the current note:\n\n';
 
-        for (const parent of parents) {
-            context += `- ${parent.title}\n`;
+        // Create a hierarchical view of the parents using indentation
+        // to show the proper parent-child relationship
+        let indentLevel = 0;
+        for (let i = 0; i < parents.length; i++) {
+            const parent = parents[i];
+            const indent = ' '.repeat(indentLevel);
+            context += `${indent}- ${parent.title}\n`;
+            indentLevel++;
         }
+
+        // Now add the current note with proper indentation
+        const note = becca.getNote(noteId);
+        if (note) {
+            const indent = ' '.repeat(indentLevel);
+            context += `${indent}- ${note.title} (current note)\n`;
+        }
 
         return context + '\n';
@@ -197,21 +220,33 @@ export class ContextExtractor {
      * Get child notes to provide additional context
      */
     async getChildContext(noteId: string, maxChildren = 5): Promise<string> {
-        const children = await sql.getRows<{noteId: string, title: string}>(
-            `SELECT noteId, title FROM notes
-             WHERE parentNoteId = ? AND isDeleted = 0
-             LIMIT ?`,
-            [noteId, maxChildren]
-        );
+        const note = becca.getNote(noteId);
 
-        if (!children.length) return '';
+        if (!note) {
+            return '';
+        }
+
+        // Use Becca API to get child notes
+        const childNotes = note.getChildNotes();
+
+        if (!childNotes || childNotes.length === 0) {
+            return '';
+        }
 
         let context = 'The current note has these child notes:\n\n';
 
-        for (const child of children) {
+        // Limit to maxChildren
+        const childrenToShow = childNotes.slice(0, maxChildren);
+
+        for (const child of childrenToShow) {
             context += `- ${child.title}\n`;
         }
 
+        // If there are more children than we're showing, indicate that
+        if (childNotes.length > maxChildren) {
+            context += `\n(+ ${childNotes.length - maxChildren} more child notes)\n`;
+        }
+
         return context + '\n';
     }
 
@@ -219,24 +254,42 @@ export class ContextExtractor {
      * Get notes linked to this note
      */
     async getLinkedNotesContext(noteId: string, maxLinks = 5): Promise<string> {
-        const linkedNotes = await sql.getRows<{title: string}>(
-            `SELECT title FROM notes
-             WHERE noteId IN (
-                 SELECT value FROM attributes
-                 WHERE noteId = ? AND type = 'relation'
-                 LIMIT ?
-             )`,
-            [noteId, maxLinks]
-        );
+        const note = becca.getNote(noteId);
 
-        if (!linkedNotes.length) return '';
+        if (!note) {
+            return '';
+        }
+
+        // Use Becca API to get relations
+        const relations = note.getRelations();
+
+        if (!relations || relations.length === 0) {
+            return '';
+        }
+
+        // Get the target notes from relations
+        const linkedNotes = relations
+            .map(relation => relation.targetNote)
+            .filter(note => note !== null && note !== undefined);
+
+        if (linkedNotes.length === 0) {
+            return '';
+        }
 
         let context = 'This note has relationships with these notes:\n\n';
 
-        for (const linked of linkedNotes) {
+        // Limit to maxLinks
+        const notesToShow = linkedNotes.slice(0, maxLinks);
+
+        for (const linked of notesToShow) {
             context += `- ${linked.title}\n`;
         }
 
+        // If there are more linked notes than we're showing, indicate that
+        if (linkedNotes.length > maxLinks) {
+            context += `\n(+ ${linkedNotes.length - maxLinks} more linked notes)\n`;
+        }
+
         return context + '\n';
     }
 
@@ -669,27 +722,41 @@ export class ContextExtractor {
      */
     private async getParentNotes(noteId: string, maxDepth: number): Promise<{noteId: string, title: string}[]> {
         const parentNotes: {noteId: string, title: string}[] = [];
-        let currentNoteId = noteId;
+        const startNote = becca.getNote(noteId);
+
+        if (!startNote) {
+            return parentNotes;
+        }
+
+        // Use non-null assertion as we checked above
+        let currentNote: any = startNote;
 
         for (let i = 0; i < maxDepth; i++) {
-            const parent = await sql.getRow<{parentNoteId: string, title: string}>(
-                `SELECT branches.parentNoteId, notes.title
-                 FROM branches
-                 JOIN notes ON branches.parentNoteId = notes.noteId
-                 WHERE branches.noteId = ? AND branches.isDeleted = 0 LIMIT 1`,
-                [currentNoteId]
-            );
+            // Get parent branches (should be just one in most cases)
+            if (!currentNote) break;
 
-            if (!parent || parent.parentNoteId === 'root') {
+            const parentBranches: any[] = currentNote.getParentBranches();
+
+            if (!parentBranches || parentBranches.length === 0) {
                 break;
             }
+
+            // Use the first parent branch
+            const branch: any = parentBranches[0];
+            if (!branch) break;
+
+            const parentNote: any = branch.getParentNote();
+
+            if (!parentNote || parentNote.noteId === 'root') {
+                break;
+            }
 
             parentNotes.unshift({
-                noteId: parent.parentNoteId,
-                title: parent.title
+                noteId: parentNote.noteId,
+                title: parentNote.title
             });
 
-            currentNoteId = parent.parentNoteId;
+            currentNote = parentNote;
         }
 
         return parentNotes;
@@ -410,25 +410,69 @@ export async function getNoteEmbeddingContext(noteId: string): Promise<NoteEmbed
     let content = "";
 
     try {
-        // Get raw content from the note
-        const rawContent = String(await note.getContent() || "");
+        // Use the enhanced context extractor for improved content extraction
+        // We're using a dynamic import to avoid circular dependencies
+        const { default: contextExtractor } = await import('../../llm/context_extractor.js');
 
-        // Process the content based on note type to extract meaningful text
-        if (note.type === 'text' || note.type === 'code') {
-            content = rawContent;
-        } else if (['canvas', 'mindMap', 'relationMap', 'mermaid', 'geoMap'].includes(note.type)) {
-            // Process structured content types
-            content = extractStructuredContent(rawContent, note.type, note.mime);
-        } else if (note.type === 'image' || note.type === 'file') {
-            content = `[${note.type} attachment: ${note.mime}]`;
+        // Get the content using the enhanced formatNoteContent method in context extractor
+        const noteContent = await contextExtractor.getNoteContent(noteId);
+
+        if (noteContent) {
+            content = noteContent;
+
+            // For large content, consider chunking or summarization
+            if (content.length > 10000) {
+                // Large content handling options:
+
+                // Option 1: Use our summarization feature
+                const summary = await contextExtractor.getNoteSummary(noteId);
+                if (summary) {
+                    content = summary;
+                }
+
+                // Option 2: Alternative approach - use the first chunk if summarization fails
+                if (content.length > 10000) {
+                    const chunks = await contextExtractor.getChunkedNoteContent(noteId);
+                    if (chunks && chunks.length > 0) {
+                        // Use the first chunk (most relevant/beginning)
+                        content = chunks[0];
+                    }
+                }
+            }
+        } else {
+            // Fallback to original method if context extractor fails
+            const rawContent = String(await note.getContent() || "");
+
+            // Process the content based on note type to extract meaningful text
+            if (note.type === 'text' || note.type === 'code') {
+                content = rawContent;
+            } else if (['canvas', 'mindMap', 'relationMap', 'mermaid', 'geoMap'].includes(note.type)) {
+                // Process structured content types
+                content = extractStructuredContent(rawContent, note.type, note.mime);
+            } else if (note.type === 'image' || note.type === 'file') {
+                content = `[${note.type} attachment: ${note.mime}]`;
+            }
+
+            // Clean the content to remove HTML tags and normalize whitespace
+            content = cleanNoteContent(content, note.type, note.mime);
         }
     } catch (err) {
         console.error(`Error getting content for note ${noteId}:`, err);
         content = `[Error extracting content]`;
-    }
 
-    // Clean the content to remove HTML tags and normalize whitespace
-    content = cleanNoteContent(content, note.type, note.mime);
+        // Try fallback to original method
+        try {
+            const rawContent = String(await note.getContent() || "");
+            if (note.type === 'text' || note.type === 'code') {
+                content = rawContent;
+            } else if (['canvas', 'mindMap', 'relationMap', 'mermaid', 'geoMap'].includes(note.type)) {
+                content = extractStructuredContent(rawContent, note.type, note.mime);
+            }
+            content = cleanNoteContent(content, note.type, note.mime);
+        } catch (fallbackErr) {
+            console.error(`Fallback content extraction also failed for note ${noteId}:`, fallbackErr);
        }
+    }
 
     // Get template/inheritance relationships
     // This is from FNote.getNotesToInheritAttributesFrom - recreating similar logic for BNote
@@ -490,19 +534,35 @@ export async function queueNoteForEmbedding(noteId: string, operation = 'UPDATE'
 }
 
 /**
- * Deletes all embeddings for a note
+ * Delete embeddings for a note
+ *
+ * @param noteId - The ID of the note
+ * @param providerId - Optional provider ID to delete embeddings only for a specific provider
+ * @param modelId - Optional model ID to delete embeddings only for a specific model
  */
-export async function deleteNoteEmbeddings(noteId: string) {
-    await sql.execute(
-        "DELETE FROM note_embeddings WHERE noteId = ?",
-        [noteId]
-    );
+export async function deleteNoteEmbeddings(noteId: string, providerId?: string, modelId?: string) {
+    let query = "DELETE FROM note_embeddings WHERE noteId = ?";
+    const params: any[] = [noteId];
 
-    // Remove from queue if present
-    await sql.execute(
-        "DELETE FROM embedding_queue WHERE noteId = ?",
-        [noteId]
-    );
+    if (providerId) {
+        query += " AND providerId = ?";
+        params.push(providerId);
+
+        if (modelId) {
+            query += " AND modelId = ?";
+            params.push(modelId);
+        }
+    }
+
+    await sql.execute(query, params);
+
+    // Only remove from queue if deleting all embeddings for the note
+    if (!providerId) {
+        await sql.execute(
+            "DELETE FROM embedding_queue WHERE noteId = ?",
+            [noteId]
+        );
+    }
 }
 
 /**
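The widened deleteNoteEmbeddings signature above is what lets the chunking path (processNoteWithChunking, further down) clear just one provider/model combination before re-storing per-chunk embeddings. Illustrative calls, assumed to run inside an async function and using placeholder note, provider, and model identifiers:

// Delete everything for the note; this also clears its embedding_queue entry.
await deleteNoteEmbeddings("noteIdAbc");

// Delete only the embeddings produced by one provider; the queue entry is kept.
await deleteNoteEmbeddings("noteIdAbc", "ollama");

// Delete only one provider/model combination, e.g. right before re-chunking.
await deleteNoteEmbeddings("noteIdAbc", "ollama", "nomic-embed-text");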
@@ -559,15 +619,28 @@ export async function processEmbeddingQueue() {
             // Get note context for embedding
             const context = await getNoteEmbeddingContext(noteData.noteId);
 
+            // Check if we should use chunking for large content
+            const useChunking = context.content.length > 5000; // Use chunking for large notes by default
+
             // Process with each enabled provider
             for (const provider of enabledProviders) {
                 try {
-                    // Generate embedding
-                    const embedding = await provider.generateNoteEmbeddings(context);
+                    if (useChunking) {
+                        // Enhanced approach: Process large notes using chunking
+                        await processNoteWithChunking(noteData.noteId, provider, context);
+                    } else {
+                        // Standard approach: Generate a single embedding for the whole note
+                        const embedding = await provider.generateNoteEmbeddings(context);
 
-                    // Store embedding
-                    const config = provider.getConfig();
-                    await storeNoteEmbedding(noteData.noteId, provider.name, config.model, embedding);
+                        // Store embedding
+                        const config = provider.getConfig();
+                        await storeNoteEmbedding(
+                            noteData.noteId,
+                            provider.name,
+                            config.model,
+                            embedding
+                        );
+                    }
                 } catch (providerError: any) {
                     log.error(`Error generating embedding with provider ${provider.name} for note ${noteData.noteId}: ${providerError.message || 'Unknown error'}`);
                 }
@@ -748,6 +821,78 @@ export async function getEmbeddingStats() {
     };
 }
 
+/**
+ * Process a large note by breaking it into chunks and creating embeddings for each chunk
+ * This provides more detailed and focused embeddings for different parts of large notes
+ *
+ * @param noteId - The ID of the note to process
+ * @param provider - The embedding provider to use
+ * @param context - The note context data
+ */
+async function processNoteWithChunking(
+    noteId: string,
+    provider: any,
+    context: NoteEmbeddingContext
+): Promise<void> {
+    try {
+        // Get the context extractor dynamically to avoid circular dependencies
+        const { default: contextExtractor } = await import('../../llm/context_extractor.js');
+
+        // Get chunks of the note content
+        const chunks = await contextExtractor.getChunkedNoteContent(noteId);
+
+        if (!chunks || chunks.length === 0) {
+            // Fall back to single embedding if chunking fails
+            const embedding = await provider.generateNoteEmbeddings(context);
+            const config = provider.getConfig();
+            await storeNoteEmbedding(noteId, provider.name, config.model, embedding);
+            return;
+        }
+
+        // Generate and store embeddings for each chunk
+        const config = provider.getConfig();
+
+        // Delete existing embeddings first to avoid duplicates
+        await deleteNoteEmbeddings(noteId, provider.name, config.model);
+
+        // Process each chunk with a slight delay to avoid rate limits
+        for (let i = 0; i < chunks.length; i++) {
+            const chunk = chunks[i];
+
+            // Create a modified context object with just this chunk's content
+            const chunkContext: NoteEmbeddingContext = {
+                ...context,
+                content: chunk
+            };
+
+            // Generate embedding for this chunk
+            const embedding = await provider.generateNoteEmbeddings(chunkContext);
+
+            // Store with chunk information
+            await storeNoteEmbedding(
+                noteId,
+                provider.name,
+                config.model,
+                embedding
+            );
+
+            // Small delay between chunks to avoid rate limits
+            if (i < chunks.length - 1) {
+                await new Promise(resolve => setTimeout(resolve, 100));
+            }
+        }
+
+        log.info(`Generated ${chunks.length} chunk embeddings for note ${noteId}`);
+    } catch (error: any) {
+        log.error(`Error in chunked embedding process for note ${noteId}: ${error.message || 'Unknown error'}`);
+        throw error;
+    }
+}
+
+export function cleanupEmbeddings() {
+    // Cleanup function implementation
+}
+
 export default {
     cosineSimilarity,
     embeddingToBuffer,