mirror of
				https://github.com/TriliumNext/Notes.git
				synced 2025-10-30 04:01:31 +08:00 
			
		
		
		
	implement chunking and use becca for some functionality
This commit is contained in:
		
							parent
							
								
									4160db9728
								
							
						
					
					
						commit
						0985cec8d6
					
				| @ -1,5 +1,6 @@ | ||||
| import sql from '../sql.js'; | ||||
| import sanitizeHtml from 'sanitize-html'; | ||||
| import becca from '../../becca/becca.js'; | ||||
| 
 | ||||
| /** | ||||
|  * Utility class for extracting context from notes to provide to AI models | ||||
| @ -10,19 +11,27 @@ export class ContextExtractor { | ||||
|      * Get the content of a note | ||||
|      */ | ||||
|     async getNoteContent(noteId: string): Promise<string | null> { | ||||
|         const note = await sql.getRow<{content: string, type: string, mime: string, title: string}>( | ||||
|             `SELECT note_contents.content, notes.type, notes.mime, notes.title
 | ||||
|              FROM notes | ||||
|              JOIN note_contents ON notes.noteId = note_contents.noteId | ||||
|              WHERE notes.noteId = ?`,
 | ||||
|             [noteId] | ||||
|         ); | ||||
|         // Use Becca API to get note data
 | ||||
|         const note = becca.getNote(noteId); | ||||
| 
 | ||||
|         if (!note) { | ||||
|             return null; | ||||
|         } | ||||
| 
 | ||||
|         return this.formatNoteContent(note.content, note.type, note.mime, note.title); | ||||
|         try { | ||||
|             // Get content using Becca API
 | ||||
|             const content = String(await note.getContent() || ""); | ||||
| 
 | ||||
|             return this.formatNoteContent( | ||||
|                 content, | ||||
|                 note.type, | ||||
|                 note.mime, | ||||
|                 note.title | ||||
|             ); | ||||
|         } catch (error) { | ||||
|             console.error(`Error getting content for note ${noteId}:`, error); | ||||
|             return null; | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     /** | ||||
| @ -181,13 +190,27 @@ export class ContextExtractor { | ||||
|      * Get a set of parent notes to provide hierarchical context | ||||
|      */ | ||||
|     async getParentContext(noteId: string, maxDepth = 3): Promise<string> { | ||||
|         // Note: getParentNotes has already been updated to use Becca
 | ||||
|         const parents = await this.getParentNotes(noteId, maxDepth); | ||||
|         if (!parents.length) return ''; | ||||
| 
 | ||||
|         let context = 'Here is the hierarchical context for the current note:\n\n'; | ||||
| 
 | ||||
|         for (const parent of parents) { | ||||
|             context += `- ${parent.title}\n`; | ||||
|         // Create a hierarchical view of the parents using indentation
 | ||||
|         // to show the proper parent-child relationship
 | ||||
|         let indentLevel = 0; | ||||
|         for (let i = 0; i < parents.length; i++) { | ||||
|             const parent = parents[i]; | ||||
|             const indent = '  '.repeat(indentLevel); | ||||
|             context += `${indent}- ${parent.title}\n`; | ||||
|             indentLevel++; | ||||
|         } | ||||
| 
 | ||||
|         // Now add the current note with proper indentation
 | ||||
|         const note = becca.getNote(noteId); | ||||
|         if (note) { | ||||
|             const indent = '  '.repeat(indentLevel); | ||||
|             context += `${indent}- ${note.title} (current note)\n`; | ||||
|         } | ||||
| 
 | ||||
|         return context + '\n'; | ||||
| @ -197,21 +220,33 @@ export class ContextExtractor { | ||||
|      * Get child notes to provide additional context | ||||
|      */ | ||||
|     async getChildContext(noteId: string, maxChildren = 5): Promise<string> { | ||||
|         const children = await sql.getRows<{noteId: string, title: string}>( | ||||
|             `SELECT noteId, title FROM notes
 | ||||
|              WHERE parentNoteId = ? AND isDeleted = 0 | ||||
|              LIMIT ?`,
 | ||||
|             [noteId, maxChildren] | ||||
|         ); | ||||
|         const note = becca.getNote(noteId); | ||||
| 
 | ||||
|         if (!children.length) return ''; | ||||
|         if (!note) { | ||||
|             return ''; | ||||
|         } | ||||
| 
 | ||||
|         // Use Becca API to get child notes
 | ||||
|         const childNotes = note.getChildNotes(); | ||||
| 
 | ||||
|         if (!childNotes || childNotes.length === 0) { | ||||
|             return ''; | ||||
|         } | ||||
| 
 | ||||
|         let context = 'The current note has these child notes:\n\n'; | ||||
| 
 | ||||
|         for (const child of children) { | ||||
|         // Limit to maxChildren
 | ||||
|         const childrenToShow = childNotes.slice(0, maxChildren); | ||||
| 
 | ||||
|         for (const child of childrenToShow) { | ||||
|             context += `- ${child.title}\n`; | ||||
|         } | ||||
| 
 | ||||
|         // If there are more children than we're showing, indicate that
 | ||||
|         if (childNotes.length > maxChildren) { | ||||
|             context += `\n(+ ${childNotes.length - maxChildren} more child notes)\n`; | ||||
|         } | ||||
| 
 | ||||
|         return context + '\n'; | ||||
|     } | ||||
| 
 | ||||
| @ -219,24 +254,42 @@ export class ContextExtractor { | ||||
|      * Get notes linked to this note | ||||
|      */ | ||||
|     async getLinkedNotesContext(noteId: string, maxLinks = 5): Promise<string> { | ||||
|         const linkedNotes = await sql.getRows<{title: string}>( | ||||
|             `SELECT title FROM notes
 | ||||
|              WHERE noteId IN ( | ||||
|                 SELECT value FROM attributes | ||||
|                 WHERE noteId = ? AND type = 'relation' | ||||
|                 LIMIT ? | ||||
|              )`,
 | ||||
|             [noteId, maxLinks] | ||||
|         ); | ||||
|         const note = becca.getNote(noteId); | ||||
| 
 | ||||
|         if (!linkedNotes.length) return ''; | ||||
|         if (!note) { | ||||
|             return ''; | ||||
|         } | ||||
| 
 | ||||
|         // Use Becca API to get relations
 | ||||
|         const relations = note.getRelations(); | ||||
| 
 | ||||
|         if (!relations || relations.length === 0) { | ||||
|             return ''; | ||||
|         } | ||||
| 
 | ||||
|         // Get the target notes from relations
 | ||||
|         const linkedNotes = relations | ||||
|             .map(relation => relation.targetNote) | ||||
|             .filter(note => note !== null && note !== undefined); | ||||
| 
 | ||||
|         if (linkedNotes.length === 0) { | ||||
|             return ''; | ||||
|         } | ||||
| 
 | ||||
|         let context = 'This note has relationships with these notes:\n\n'; | ||||
| 
 | ||||
|         for (const linked of linkedNotes) { | ||||
|         // Limit to maxLinks
 | ||||
|         const notesToShow = linkedNotes.slice(0, maxLinks); | ||||
| 
 | ||||
|         for (const linked of notesToShow) { | ||||
|             context += `- ${linked.title}\n`; | ||||
|         } | ||||
| 
 | ||||
|         // If there are more linked notes than we're showing, indicate that
 | ||||
|         if (linkedNotes.length > maxLinks) { | ||||
|             context += `\n(+ ${linkedNotes.length - maxLinks} more linked notes)\n`; | ||||
|         } | ||||
| 
 | ||||
|         return context + '\n'; | ||||
|     } | ||||
| 
 | ||||
| @ -669,27 +722,41 @@ export class ContextExtractor { | ||||
|      */ | ||||
|     private async getParentNotes(noteId: string, maxDepth: number): Promise<{noteId: string, title: string}[]> { | ||||
|         const parentNotes: {noteId: string, title: string}[] = []; | ||||
|         let currentNoteId = noteId; | ||||
|         const startNote = becca.getNote(noteId); | ||||
| 
 | ||||
|         if (!startNote) { | ||||
|             return parentNotes; | ||||
|         } | ||||
| 
 | ||||
|         // Use non-null assertion as we checked above
 | ||||
|         let currentNote: any = startNote; | ||||
| 
 | ||||
|         for (let i = 0; i < maxDepth; i++) { | ||||
|             const parent = await sql.getRow<{parentNoteId: string, title: string}>( | ||||
|                 `SELECT branches.parentNoteId, notes.title
 | ||||
|                  FROM branches | ||||
|                  JOIN notes ON branches.parentNoteId = notes.noteId | ||||
|                  WHERE branches.noteId = ? AND branches.isDeleted = 0 LIMIT 1`,
 | ||||
|                 [currentNoteId] | ||||
|             ); | ||||
|             // Get parent branches (should be just one in most cases)
 | ||||
|             if (!currentNote) break; | ||||
| 
 | ||||
|             if (!parent || parent.parentNoteId === 'root') { | ||||
|             const parentBranches: any[] = currentNote.getParentBranches(); | ||||
| 
 | ||||
|             if (!parentBranches || parentBranches.length === 0) { | ||||
|                 break; | ||||
|             } | ||||
| 
 | ||||
|             // Use the first parent branch
 | ||||
|             const branch: any = parentBranches[0]; | ||||
|             if (!branch) break; | ||||
| 
 | ||||
|             const parentNote: any = branch.getParentNote(); | ||||
| 
 | ||||
|             if (!parentNote || parentNote.noteId === 'root') { | ||||
|                 break; | ||||
|             } | ||||
| 
 | ||||
|             parentNotes.unshift({ | ||||
|                 noteId: parent.parentNoteId, | ||||
|                 title: parent.title | ||||
|                 noteId: parentNote.noteId, | ||||
|                 title: parentNote.title | ||||
|             }); | ||||
| 
 | ||||
|             currentNoteId = parent.parentNoteId; | ||||
|             currentNote = parentNote; | ||||
|         } | ||||
| 
 | ||||
|         return parentNotes; | ||||
|  | ||||
| @ -410,7 +410,37 @@ export async function getNoteEmbeddingContext(noteId: string): Promise<NoteEmbed | ||||
|     let content = ""; | ||||
| 
 | ||||
|     try { | ||||
|         // Get raw content from the note
 | ||||
|         // Use the enhanced context extractor for improved content extraction
 | ||||
|         // We're using a dynamic import to avoid circular dependencies
 | ||||
|         const { default: contextExtractor } = await import('../../llm/context_extractor.js'); | ||||
| 
 | ||||
|         // Get the content using the enhanced formatNoteContent method in context extractor
 | ||||
|         const noteContent = await contextExtractor.getNoteContent(noteId); | ||||
| 
 | ||||
|         if (noteContent) { | ||||
|             content = noteContent; | ||||
| 
 | ||||
|             // For large content, consider chunking or summarization
 | ||||
|             if (content.length > 10000) { | ||||
|                 // Large content handling options:
 | ||||
| 
 | ||||
|                 // Option 1: Use our summarization feature
 | ||||
|                 const summary = await contextExtractor.getNoteSummary(noteId); | ||||
|                 if (summary) { | ||||
|                     content = summary; | ||||
|                 } | ||||
| 
 | ||||
|                 // Option 2: Alternative approach - use the first chunk if summarization fails
 | ||||
|                 if (content.length > 10000) { | ||||
|                     const chunks = await contextExtractor.getChunkedNoteContent(noteId); | ||||
|                     if (chunks && chunks.length > 0) { | ||||
|                         // Use the first chunk (most relevant/beginning)
 | ||||
|                         content = chunks[0]; | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         } else { | ||||
|             // Fallback to original method if context extractor fails
 | ||||
|             const rawContent = String(await note.getContent() || ""); | ||||
| 
 | ||||
|             // Process the content based on note type to extract meaningful text
 | ||||
| @ -422,13 +452,27 @@ export async function getNoteEmbeddingContext(noteId: string): Promise<NoteEmbed | ||||
|             } else if (note.type === 'image' || note.type === 'file') { | ||||
|                 content = `[${note.type} attachment: ${note.mime}]`; | ||||
|             } | ||||
|     } catch (err) { | ||||
|         console.error(`Error getting content for note ${noteId}:`, err); | ||||
|         content = `[Error extracting content]`; | ||||
|     } | ||||
| 
 | ||||
|             // Clean the content to remove HTML tags and normalize whitespace
 | ||||
|             content = cleanNoteContent(content, note.type, note.mime); | ||||
|         } | ||||
|     } catch (err) { | ||||
|         console.error(`Error getting content for note ${noteId}:`, err); | ||||
|         content = `[Error extracting content]`; | ||||
| 
 | ||||
|         // Try fallback to original method
 | ||||
|         try { | ||||
|             const rawContent = String(await note.getContent() || ""); | ||||
|             if (note.type === 'text' || note.type === 'code') { | ||||
|                 content = rawContent; | ||||
|             } else if (['canvas', 'mindMap', 'relationMap', 'mermaid', 'geoMap'].includes(note.type)) { | ||||
|                 content = extractStructuredContent(rawContent, note.type, note.mime); | ||||
|             } | ||||
|             content = cleanNoteContent(content, note.type, note.mime); | ||||
|         } catch (fallbackErr) { | ||||
|             console.error(`Fallback content extraction also failed for note ${noteId}:`, fallbackErr); | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     // Get template/inheritance relationships
 | ||||
|     // This is from FNote.getNotesToInheritAttributesFrom - recreating similar logic for BNote
 | ||||
| @ -490,20 +534,36 @@ export async function queueNoteForEmbedding(noteId: string, operation = 'UPDATE' | ||||
| } | ||||
| 
 | ||||
| /** | ||||
|  * Deletes all embeddings for a note | ||||
|  * Delete embeddings for a note | ||||
|  * | ||||
|  * @param noteId - The ID of the note | ||||
|  * @param providerId - Optional provider ID to delete embeddings only for a specific provider | ||||
|  * @param modelId - Optional model ID to delete embeddings only for a specific model | ||||
|  */ | ||||
| export async function deleteNoteEmbeddings(noteId: string) { | ||||
|     await sql.execute( | ||||
|         "DELETE FROM note_embeddings WHERE noteId = ?", | ||||
|         [noteId] | ||||
|     ); | ||||
| export async function deleteNoteEmbeddings(noteId: string, providerId?: string, modelId?: string) { | ||||
|     let query = "DELETE FROM note_embeddings WHERE noteId = ?"; | ||||
|     const params: any[] = [noteId]; | ||||
| 
 | ||||
|     // Remove from queue if present
 | ||||
|     if (providerId) { | ||||
|         query += " AND providerId = ?"; | ||||
|         params.push(providerId); | ||||
| 
 | ||||
|         if (modelId) { | ||||
|             query += " AND modelId = ?"; | ||||
|             params.push(modelId); | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     await sql.execute(query, params); | ||||
| 
 | ||||
|     // Only remove from queue if deleting all embeddings for the note
 | ||||
|     if (!providerId) { | ||||
|         await sql.execute( | ||||
|             "DELETE FROM embedding_queue WHERE noteId = ?", | ||||
|             [noteId] | ||||
|         ); | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| /** | ||||
|  * Process the embedding queue | ||||
| @ -559,15 +619,28 @@ export async function processEmbeddingQueue() { | ||||
|             // Get note context for embedding
 | ||||
|             const context = await getNoteEmbeddingContext(noteData.noteId); | ||||
| 
 | ||||
|             // Check if we should use chunking for large content
 | ||||
|             const useChunking = context.content.length > 5000; // Use chunking for large notes by default
 | ||||
| 
 | ||||
|             // Process with each enabled provider
 | ||||
|             for (const provider of enabledProviders) { | ||||
|                 try { | ||||
|                     // Generate embedding
 | ||||
|                     if (useChunking) { | ||||
|                         // Enhanced approach: Process large notes using chunking
 | ||||
|                         await processNoteWithChunking(noteData.noteId, provider, context); | ||||
|                     } else { | ||||
|                         // Standard approach: Generate a single embedding for the whole note
 | ||||
|                         const embedding = await provider.generateNoteEmbeddings(context); | ||||
| 
 | ||||
|                         // Store embedding
 | ||||
|                         const config = provider.getConfig(); | ||||
|                     await storeNoteEmbedding(noteData.noteId, provider.name, config.model, embedding); | ||||
|                         await storeNoteEmbedding( | ||||
|                             noteData.noteId, | ||||
|                             provider.name, | ||||
|                             config.model, | ||||
|                             embedding | ||||
|                         ); | ||||
|                     } | ||||
|                 } catch (providerError: any) { | ||||
|                     log.error(`Error generating embedding with provider ${provider.name} for note ${noteData.noteId}: ${providerError.message || 'Unknown error'}`); | ||||
|                 } | ||||
| @ -748,6 +821,78 @@ export async function getEmbeddingStats() { | ||||
|     }; | ||||
| } | ||||
| 
 | ||||
| /** | ||||
|  * Process a large note by breaking it into chunks and creating embeddings for each chunk | ||||
|  * This provides more detailed and focused embeddings for different parts of large notes | ||||
|  * | ||||
|  * @param noteId - The ID of the note to process | ||||
|  * @param provider - The embedding provider to use | ||||
|  * @param context - The note context data | ||||
|  */ | ||||
| async function processNoteWithChunking( | ||||
|     noteId: string, | ||||
|     provider: any, | ||||
|     context: NoteEmbeddingContext | ||||
| ): Promise<void> { | ||||
|     try { | ||||
|         // Get the context extractor dynamically to avoid circular dependencies
 | ||||
|         const { default: contextExtractor } = await import('../../llm/context_extractor.js'); | ||||
| 
 | ||||
|         // Get chunks of the note content
 | ||||
|         const chunks = await contextExtractor.getChunkedNoteContent(noteId); | ||||
| 
 | ||||
|         if (!chunks || chunks.length === 0) { | ||||
|             // Fall back to single embedding if chunking fails
 | ||||
|             const embedding = await provider.generateNoteEmbeddings(context); | ||||
|             const config = provider.getConfig(); | ||||
|             await storeNoteEmbedding(noteId, provider.name, config.model, embedding); | ||||
|             return; | ||||
|         } | ||||
| 
 | ||||
|         // Generate and store embeddings for each chunk
 | ||||
|         const config = provider.getConfig(); | ||||
| 
 | ||||
|         // Delete existing embeddings first to avoid duplicates
 | ||||
|         await deleteNoteEmbeddings(noteId, provider.name, config.model); | ||||
| 
 | ||||
|         // Process each chunk with a slight delay to avoid rate limits
 | ||||
|         for (let i = 0; i < chunks.length; i++) { | ||||
|             const chunk = chunks[i]; | ||||
| 
 | ||||
|             // Create a modified context object with just this chunk's content
 | ||||
|             const chunkContext: NoteEmbeddingContext = { | ||||
|                 ...context, | ||||
|                 content: chunk | ||||
|             }; | ||||
| 
 | ||||
|             // Generate embedding for this chunk
 | ||||
|             const embedding = await provider.generateNoteEmbeddings(chunkContext); | ||||
| 
 | ||||
|             // Store with chunk information
 | ||||
|             await storeNoteEmbedding( | ||||
|                 noteId, | ||||
|                 provider.name, | ||||
|                 config.model, | ||||
|                 embedding | ||||
|             ); | ||||
| 
 | ||||
|             // Small delay between chunks to avoid rate limits
 | ||||
|             if (i < chunks.length - 1) { | ||||
|                 await new Promise(resolve => setTimeout(resolve, 100)); | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|         log.info(`Generated ${chunks.length} chunk embeddings for note ${noteId}`); | ||||
|     } catch (error: any) { | ||||
|         log.error(`Error in chunked embedding process for note ${noteId}: ${error.message || 'Unknown error'}`); | ||||
|         throw error; | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| export function cleanupEmbeddings() { | ||||
|     // Cleanup function implementation
 | ||||
| } | ||||
| 
 | ||||
| export default { | ||||
|     cosineSimilarity, | ||||
|     embeddingToBuffer, | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 perf3ct
						perf3ct