Mirror of https://github.com/TriliumNext/Notes.git (synced 2025-07-29 19:12:27 +08:00)

implement chunking and use becca for some functionality

parent 4160db9728 · commit 0985cec8d6
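The new chunking path below relies on contextExtractor.getChunkedNoteContent(noteId), whose implementation is not part of this diff. As a rough illustration of the idea only, here is a minimal sketch of a character-budget splitter that breaks on paragraph boundaries; the helper name splitIntoChunks and the 5000-character budget are assumptions (the budget mirrors the useChunking threshold in processEmbeddingQueue further down), not the actual TriliumNext implementation.

// Hypothetical sketch only: the real getChunkedNoteContent may differ.
function splitIntoChunks(content: string, maxChars = 5000): string[] {
    const chunks: string[] = [];
    let current = "";

    for (const paragraph of content.split(/\n{2,}/)) {
        // Close the current chunk once adding this paragraph would exceed the budget
        if (current && current.length + paragraph.length + 2 > maxChars) {
            chunks.push(current);
            current = "";
        }
        current = current ? current + "\n\n" + paragraph : paragraph;
    }

    if (current) {
        chunks.push(current);
    }

    return chunks;
}

A single paragraph longer than the budget would still come through as one oversized chunk; a real splitter would likely fall back to sentence- or character-level splits in that case.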
@@ -1,5 +1,6 @@
 import sql from '../sql.js';
 import sanitizeHtml from 'sanitize-html';
+import becca from '../../becca/becca.js';
 
 /**
  * Utility class for extracting context from notes to provide to AI models
@@ -10,19 +11,27 @@ export class ContextExtractor {
      * Get the content of a note
      */
     async getNoteContent(noteId: string): Promise<string | null> {
-        const note = await sql.getRow<{content: string, type: string, mime: string, title: string}>(
-            `SELECT note_contents.content, notes.type, notes.mime, notes.title
-             FROM notes
-             JOIN note_contents ON notes.noteId = note_contents.noteId
-             WHERE notes.noteId = ?`,
-            [noteId]
-        );
+        // Use Becca API to get note data
+        const note = becca.getNote(noteId);
 
         if (!note) {
             return null;
         }
 
-        return this.formatNoteContent(note.content, note.type, note.mime, note.title);
+        try {
+            // Get content using Becca API
+            const content = String(await note.getContent() || "");
+
+            return this.formatNoteContent(
+                content,
+                note.type,
+                note.mime,
+                note.title
+            );
+        } catch (error) {
+            console.error(`Error getting content for note ${noteId}:`, error);
+            return null;
+        }
     }
 
     /**
@@ -181,13 +190,27 @@ export class ContextExtractor {
      * Get a set of parent notes to provide hierarchical context
      */
     async getParentContext(noteId: string, maxDepth = 3): Promise<string> {
+        // Note: getParentNotes has already been updated to use Becca
         const parents = await this.getParentNotes(noteId, maxDepth);
         if (!parents.length) return '';
 
         let context = 'Here is the hierarchical context for the current note:\n\n';
 
-        for (const parent of parents) {
-            context += `- ${parent.title}\n`;
+        // Create a hierarchical view of the parents using indentation
+        // to show the proper parent-child relationship
+        let indentLevel = 0;
+        for (let i = 0; i < parents.length; i++) {
+            const parent = parents[i];
+            const indent = ' '.repeat(indentLevel);
+            context += `${indent}- ${parent.title}\n`;
+            indentLevel++;
         }
+
+        // Now add the current note with proper indentation
+        const note = becca.getNote(noteId);
+        if (note) {
+            const indent = ' '.repeat(indentLevel);
+            context += `${indent}- ${note.title} (current note)\n`;
+        }
 
         return context + '\n';
@@ -197,21 +220,33 @@ export class ContextExtractor {
      * Get child notes to provide additional context
      */
     async getChildContext(noteId: string, maxChildren = 5): Promise<string> {
-        const children = await sql.getRows<{noteId: string, title: string}>(
-            `SELECT noteId, title FROM notes
-             WHERE parentNoteId = ? AND isDeleted = 0
-             LIMIT ?`,
-            [noteId, maxChildren]
-        );
+        const note = becca.getNote(noteId);
 
-        if (!children.length) return '';
+        if (!note) {
+            return '';
+        }
+
+        // Use Becca API to get child notes
+        const childNotes = note.getChildNotes();
+
+        if (!childNotes || childNotes.length === 0) {
+            return '';
+        }
 
         let context = 'The current note has these child notes:\n\n';
 
-        for (const child of children) {
+        // Limit to maxChildren
+        const childrenToShow = childNotes.slice(0, maxChildren);
+
+        for (const child of childrenToShow) {
             context += `- ${child.title}\n`;
         }
 
+        // If there are more children than we're showing, indicate that
+        if (childNotes.length > maxChildren) {
+            context += `\n(+ ${childNotes.length - maxChildren} more child notes)\n`;
+        }
+
         return context + '\n';
     }
 
@@ -219,24 +254,42 @@ export class ContextExtractor {
      * Get notes linked to this note
      */
     async getLinkedNotesContext(noteId: string, maxLinks = 5): Promise<string> {
-        const linkedNotes = await sql.getRows<{title: string}>(
-            `SELECT title FROM notes
-             WHERE noteId IN (
-                 SELECT value FROM attributes
-                 WHERE noteId = ? AND type = 'relation'
-                 LIMIT ?
-             )`,
-            [noteId, maxLinks]
-        );
+        const note = becca.getNote(noteId);
 
-        if (!linkedNotes.length) return '';
+        if (!note) {
+            return '';
+        }
+
+        // Use Becca API to get relations
+        const relations = note.getRelations();
+
+        if (!relations || relations.length === 0) {
+            return '';
+        }
+
+        // Get the target notes from relations
+        const linkedNotes = relations
+            .map(relation => relation.targetNote)
+            .filter(note => note !== null && note !== undefined);
+
+        if (linkedNotes.length === 0) {
+            return '';
+        }
 
         let context = 'This note has relationships with these notes:\n\n';
 
-        for (const linked of linkedNotes) {
+        // Limit to maxLinks
+        const notesToShow = linkedNotes.slice(0, maxLinks);
+
+        for (const linked of notesToShow) {
             context += `- ${linked.title}\n`;
         }
 
+        // If there are more linked notes than we're showing, indicate that
+        if (linkedNotes.length > maxLinks) {
+            context += `\n(+ ${linkedNotes.length - maxLinks} more linked notes)\n`;
+        }
+
         return context + '\n';
     }
 
@@ -669,27 +722,41 @@ export class ContextExtractor {
      */
     private async getParentNotes(noteId: string, maxDepth: number): Promise<{noteId: string, title: string}[]> {
         const parentNotes: {noteId: string, title: string}[] = [];
-        let currentNoteId = noteId;
+        const startNote = becca.getNote(noteId);
+
+        if (!startNote) {
+            return parentNotes;
+        }
+
+        // Use non-null assertion as we checked above
+        let currentNote: any = startNote;
 
         for (let i = 0; i < maxDepth; i++) {
-            const parent = await sql.getRow<{parentNoteId: string, title: string}>(
-                `SELECT branches.parentNoteId, notes.title
-                 FROM branches
-                 JOIN notes ON branches.parentNoteId = notes.noteId
-                 WHERE branches.noteId = ? AND branches.isDeleted = 0 LIMIT 1`,
-                [currentNoteId]
-            );
+            // Get parent branches (should be just one in most cases)
+            if (!currentNote) break;
 
-            if (!parent || parent.parentNoteId === 'root') {
+            const parentBranches: any[] = currentNote.getParentBranches();
+
+            if (!parentBranches || parentBranches.length === 0) {
                 break;
             }
+
+            // Use the first parent branch
+            const branch: any = parentBranches[0];
+            if (!branch) break;
+
+            const parentNote: any = branch.getParentNote();
+
+            if (!parentNote || parentNote.noteId === 'root') {
+                break;
+            }
 
             parentNotes.unshift({
-                noteId: parent.parentNoteId,
-                title: parent.title
+                noteId: parentNote.noteId,
+                title: parentNote.title
             });
 
-            currentNoteId = parent.parentNoteId;
+            currentNote = parentNote;
         }
 
         return parentNotes;
@@ -410,25 +410,69 @@ export async function getNoteEmbeddingContext(noteId: string): Promise<NoteEmbed
     let content = "";
 
     try {
-        // Get raw content from the note
-        const rawContent = String(await note.getContent() || "");
+        // Use the enhanced context extractor for improved content extraction
+        // We're using a dynamic import to avoid circular dependencies
+        const { default: contextExtractor } = await import('../../llm/context_extractor.js');
 
-        // Process the content based on note type to extract meaningful text
-        if (note.type === 'text' || note.type === 'code') {
-            content = rawContent;
-        } else if (['canvas', 'mindMap', 'relationMap', 'mermaid', 'geoMap'].includes(note.type)) {
-            // Process structured content types
-            content = extractStructuredContent(rawContent, note.type, note.mime);
-        } else if (note.type === 'image' || note.type === 'file') {
-            content = `[${note.type} attachment: ${note.mime}]`;
+        // Get the content using the enhanced formatNoteContent method in context extractor
+        const noteContent = await contextExtractor.getNoteContent(noteId);
+
+        if (noteContent) {
+            content = noteContent;
+
+            // For large content, consider chunking or summarization
+            if (content.length > 10000) {
+                // Large content handling options:
+
+                // Option 1: Use our summarization feature
+                const summary = await contextExtractor.getNoteSummary(noteId);
+                if (summary) {
+                    content = summary;
+                }
+
+                // Option 2: Alternative approach - use the first chunk if summarization fails
+                if (content.length > 10000) {
+                    const chunks = await contextExtractor.getChunkedNoteContent(noteId);
+                    if (chunks && chunks.length > 0) {
+                        // Use the first chunk (most relevant/beginning)
+                        content = chunks[0];
+                    }
+                }
+            }
+        } else {
+            // Fallback to original method if context extractor fails
+            const rawContent = String(await note.getContent() || "");
+
+            // Process the content based on note type to extract meaningful text
+            if (note.type === 'text' || note.type === 'code') {
+                content = rawContent;
+            } else if (['canvas', 'mindMap', 'relationMap', 'mermaid', 'geoMap'].includes(note.type)) {
+                // Process structured content types
+                content = extractStructuredContent(rawContent, note.type, note.mime);
+            } else if (note.type === 'image' || note.type === 'file') {
+                content = `[${note.type} attachment: ${note.mime}]`;
+            }
+
+            // Clean the content to remove HTML tags and normalize whitespace
+            content = cleanNoteContent(content, note.type, note.mime);
         }
     } catch (err) {
         console.error(`Error getting content for note ${noteId}:`, err);
         content = `[Error extracting content]`;
-    }
 
-    // Clean the content to remove HTML tags and normalize whitespace
-    content = cleanNoteContent(content, note.type, note.mime);
+        // Try fallback to original method
+        try {
+            const rawContent = String(await note.getContent() || "");
+            if (note.type === 'text' || note.type === 'code') {
+                content = rawContent;
+            } else if (['canvas', 'mindMap', 'relationMap', 'mermaid', 'geoMap'].includes(note.type)) {
+                content = extractStructuredContent(rawContent, note.type, note.mime);
+            }
+            content = cleanNoteContent(content, note.type, note.mime);
+        } catch (fallbackErr) {
+            console.error(`Fallback content extraction also failed for note ${noteId}:`, fallbackErr);
        }
+    }
 
     // Get template/inheritance relationships
     // This is from FNote.getNotesToInheritAttributesFrom - recreating similar logic for BNote
@@ -490,19 +534,35 @@ export async function queueNoteForEmbedding(noteId: string, operation = 'UPDATE'
 }
 
 /**
- * Deletes all embeddings for a note
+ * Delete embeddings for a note
+ *
+ * @param noteId - The ID of the note
+ * @param providerId - Optional provider ID to delete embeddings only for a specific provider
+ * @param modelId - Optional model ID to delete embeddings only for a specific model
  */
-export async function deleteNoteEmbeddings(noteId: string) {
-    await sql.execute(
-        "DELETE FROM note_embeddings WHERE noteId = ?",
-        [noteId]
-    );
+export async function deleteNoteEmbeddings(noteId: string, providerId?: string, modelId?: string) {
+    let query = "DELETE FROM note_embeddings WHERE noteId = ?";
+    const params: any[] = [noteId];
 
-    // Remove from queue if present
-    await sql.execute(
-        "DELETE FROM embedding_queue WHERE noteId = ?",
-        [noteId]
-    );
+    if (providerId) {
+        query += " AND providerId = ?";
+        params.push(providerId);
+
+        if (modelId) {
+            query += " AND modelId = ?";
+            params.push(modelId);
+        }
+    }
+
+    await sql.execute(query, params);
+
+    // Only remove from queue if deleting all embeddings for the note
+    if (!providerId) {
+        await sql.execute(
+            "DELETE FROM embedding_queue WHERE noteId = ?",
+            [noteId]
+        );
+    }
 }
 
 /**
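The widened deleteNoteEmbeddings signature above is what lets the chunking path (processNoteWithChunking, further down) clear just one provider/model combination before re-storing per-chunk embeddings. Illustrative calls, assumed to run inside an async function and using placeholder note, provider, and model identifiers:

// Delete everything for the note; this also clears its embedding_queue entry.
await deleteNoteEmbeddings("noteIdAbc");

// Delete only the embeddings produced by one provider; the queue entry is kept.
await deleteNoteEmbeddings("noteIdAbc", "ollama");

// Delete only one provider/model combination, e.g. right before re-chunking.
await deleteNoteEmbeddings("noteIdAbc", "ollama", "nomic-embed-text");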
@@ -559,15 +619,28 @@ export async function processEmbeddingQueue() {
             // Get note context for embedding
             const context = await getNoteEmbeddingContext(noteData.noteId);
 
+            // Check if we should use chunking for large content
+            const useChunking = context.content.length > 5000; // Use chunking for large notes by default
+
             // Process with each enabled provider
             for (const provider of enabledProviders) {
                 try {
-                    // Generate embedding
-                    const embedding = await provider.generateNoteEmbeddings(context);
+                    if (useChunking) {
+                        // Enhanced approach: Process large notes using chunking
+                        await processNoteWithChunking(noteData.noteId, provider, context);
+                    } else {
+                        // Standard approach: Generate a single embedding for the whole note
+                        const embedding = await provider.generateNoteEmbeddings(context);
 
-                    // Store embedding
-                    const config = provider.getConfig();
-                    await storeNoteEmbedding(noteData.noteId, provider.name, config.model, embedding);
+                        // Store embedding
+                        const config = provider.getConfig();
+                        await storeNoteEmbedding(
+                            noteData.noteId,
+                            provider.name,
+                            config.model,
+                            embedding
+                        );
+                    }
                 } catch (providerError: any) {
                     log.error(`Error generating embedding with provider ${provider.name} for note ${noteData.noteId}: ${providerError.message || 'Unknown error'}`);
                 }
@@ -748,6 +821,78 @@ export async function getEmbeddingStats() {
     };
 }
 
+/**
+ * Process a large note by breaking it into chunks and creating embeddings for each chunk
+ * This provides more detailed and focused embeddings for different parts of large notes
+ *
+ * @param noteId - The ID of the note to process
+ * @param provider - The embedding provider to use
+ * @param context - The note context data
+ */
+async function processNoteWithChunking(
+    noteId: string,
+    provider: any,
+    context: NoteEmbeddingContext
+): Promise<void> {
+    try {
+        // Get the context extractor dynamically to avoid circular dependencies
+        const { default: contextExtractor } = await import('../../llm/context_extractor.js');
+
+        // Get chunks of the note content
+        const chunks = await contextExtractor.getChunkedNoteContent(noteId);
+
+        if (!chunks || chunks.length === 0) {
+            // Fall back to single embedding if chunking fails
+            const embedding = await provider.generateNoteEmbeddings(context);
+            const config = provider.getConfig();
+            await storeNoteEmbedding(noteId, provider.name, config.model, embedding);
+            return;
+        }
+
+        // Generate and store embeddings for each chunk
+        const config = provider.getConfig();
+
+        // Delete existing embeddings first to avoid duplicates
+        await deleteNoteEmbeddings(noteId, provider.name, config.model);
+
+        // Process each chunk with a slight delay to avoid rate limits
+        for (let i = 0; i < chunks.length; i++) {
+            const chunk = chunks[i];
+
+            // Create a modified context object with just this chunk's content
+            const chunkContext: NoteEmbeddingContext = {
+                ...context,
+                content: chunk
+            };
+
+            // Generate embedding for this chunk
+            const embedding = await provider.generateNoteEmbeddings(chunkContext);
+
+            // Store with chunk information
+            await storeNoteEmbedding(
+                noteId,
+                provider.name,
+                config.model,
+                embedding
+            );
+
+            // Small delay between chunks to avoid rate limits
+            if (i < chunks.length - 1) {
+                await new Promise(resolve => setTimeout(resolve, 100));
+            }
+        }
+
+        log.info(`Generated ${chunks.length} chunk embeddings for note ${noteId}`);
+    } catch (error: any) {
+        log.error(`Error in chunked embedding process for note ${noteId}: ${error.message || 'Unknown error'}`);
+        throw error;
+    }
+}
+
+export function cleanupEmbeddings() {
+    // Cleanup function implementation
+}
+
 export default {
     cosineSimilarity,
     embeddingToBuffer,