mirror of
https://github.com/TriliumNext/Notes.git
synced 2025-07-29 11:02:28 +08:00
implement chunking and use becca for some functionality
This commit is contained in:
parent
4160db9728
commit
0985cec8d6
@ -1,5 +1,6 @@
|
|||||||
import sql from '../sql.js';
|
import sql from '../sql.js';
|
||||||
import sanitizeHtml from 'sanitize-html';
|
import sanitizeHtml from 'sanitize-html';
|
||||||
|
import becca from '../../becca/becca.js';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Utility class for extracting context from notes to provide to AI models
|
* Utility class for extracting context from notes to provide to AI models
|
||||||
@ -10,19 +11,27 @@ export class ContextExtractor {
|
|||||||
* Get the content of a note
|
* Get the content of a note
|
||||||
*/
|
*/
|
||||||
async getNoteContent(noteId: string): Promise<string | null> {
|
async getNoteContent(noteId: string): Promise<string | null> {
|
||||||
const note = await sql.getRow<{content: string, type: string, mime: string, title: string}>(
|
// Use Becca API to get note data
|
||||||
`SELECT note_contents.content, notes.type, notes.mime, notes.title
|
const note = becca.getNote(noteId);
|
||||||
FROM notes
|
|
||||||
JOIN note_contents ON notes.noteId = note_contents.noteId
|
|
||||||
WHERE notes.noteId = ?`,
|
|
||||||
[noteId]
|
|
||||||
);
|
|
||||||
|
|
||||||
if (!note) {
|
if (!note) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
return this.formatNoteContent(note.content, note.type, note.mime, note.title);
|
try {
|
||||||
|
// Get content using Becca API
|
||||||
|
const content = String(await note.getContent() || "");
|
||||||
|
|
||||||
|
return this.formatNoteContent(
|
||||||
|
content,
|
||||||
|
note.type,
|
||||||
|
note.mime,
|
||||||
|
note.title
|
||||||
|
);
|
||||||
|
} catch (error) {
|
||||||
|
console.error(`Error getting content for note ${noteId}:`, error);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -181,13 +190,27 @@ export class ContextExtractor {
|
|||||||
* Get a set of parent notes to provide hierarchical context
|
* Get a set of parent notes to provide hierarchical context
|
||||||
*/
|
*/
|
||||||
async getParentContext(noteId: string, maxDepth = 3): Promise<string> {
|
async getParentContext(noteId: string, maxDepth = 3): Promise<string> {
|
||||||
|
// Note: getParentNotes has already been updated to use Becca
|
||||||
const parents = await this.getParentNotes(noteId, maxDepth);
|
const parents = await this.getParentNotes(noteId, maxDepth);
|
||||||
if (!parents.length) return '';
|
if (!parents.length) return '';
|
||||||
|
|
||||||
let context = 'Here is the hierarchical context for the current note:\n\n';
|
let context = 'Here is the hierarchical context for the current note:\n\n';
|
||||||
|
|
||||||
for (const parent of parents) {
|
// Create a hierarchical view of the parents using indentation
|
||||||
context += `- ${parent.title}\n`;
|
// to show the proper parent-child relationship
|
||||||
|
let indentLevel = 0;
|
||||||
|
for (let i = 0; i < parents.length; i++) {
|
||||||
|
const parent = parents[i];
|
||||||
|
const indent = ' '.repeat(indentLevel);
|
||||||
|
context += `${indent}- ${parent.title}\n`;
|
||||||
|
indentLevel++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now add the current note with proper indentation
|
||||||
|
const note = becca.getNote(noteId);
|
||||||
|
if (note) {
|
||||||
|
const indent = ' '.repeat(indentLevel);
|
||||||
|
context += `${indent}- ${note.title} (current note)\n`;
|
||||||
}
|
}
|
||||||
|
|
||||||
return context + '\n';
|
return context + '\n';
|
||||||
@ -197,21 +220,33 @@ export class ContextExtractor {
|
|||||||
* Get child notes to provide additional context
|
* Get child notes to provide additional context
|
||||||
*/
|
*/
|
||||||
async getChildContext(noteId: string, maxChildren = 5): Promise<string> {
|
async getChildContext(noteId: string, maxChildren = 5): Promise<string> {
|
||||||
const children = await sql.getRows<{noteId: string, title: string}>(
|
const note = becca.getNote(noteId);
|
||||||
`SELECT noteId, title FROM notes
|
|
||||||
WHERE parentNoteId = ? AND isDeleted = 0
|
|
||||||
LIMIT ?`,
|
|
||||||
[noteId, maxChildren]
|
|
||||||
);
|
|
||||||
|
|
||||||
if (!children.length) return '';
|
if (!note) {
|
||||||
|
return '';
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use Becca API to get child notes
|
||||||
|
const childNotes = note.getChildNotes();
|
||||||
|
|
||||||
|
if (!childNotes || childNotes.length === 0) {
|
||||||
|
return '';
|
||||||
|
}
|
||||||
|
|
||||||
let context = 'The current note has these child notes:\n\n';
|
let context = 'The current note has these child notes:\n\n';
|
||||||
|
|
||||||
for (const child of children) {
|
// Limit to maxChildren
|
||||||
|
const childrenToShow = childNotes.slice(0, maxChildren);
|
||||||
|
|
||||||
|
for (const child of childrenToShow) {
|
||||||
context += `- ${child.title}\n`;
|
context += `- ${child.title}\n`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If there are more children than we're showing, indicate that
|
||||||
|
if (childNotes.length > maxChildren) {
|
||||||
|
context += `\n(+ ${childNotes.length - maxChildren} more child notes)\n`;
|
||||||
|
}
|
||||||
|
|
||||||
return context + '\n';
|
return context + '\n';
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -219,24 +254,42 @@ export class ContextExtractor {
|
|||||||
* Get notes linked to this note
|
* Get notes linked to this note
|
||||||
*/
|
*/
|
||||||
async getLinkedNotesContext(noteId: string, maxLinks = 5): Promise<string> {
|
async getLinkedNotesContext(noteId: string, maxLinks = 5): Promise<string> {
|
||||||
const linkedNotes = await sql.getRows<{title: string}>(
|
const note = becca.getNote(noteId);
|
||||||
`SELECT title FROM notes
|
|
||||||
WHERE noteId IN (
|
|
||||||
SELECT value FROM attributes
|
|
||||||
WHERE noteId = ? AND type = 'relation'
|
|
||||||
LIMIT ?
|
|
||||||
)`,
|
|
||||||
[noteId, maxLinks]
|
|
||||||
);
|
|
||||||
|
|
||||||
if (!linkedNotes.length) return '';
|
if (!note) {
|
||||||
|
return '';
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use Becca API to get relations
|
||||||
|
const relations = note.getRelations();
|
||||||
|
|
||||||
|
if (!relations || relations.length === 0) {
|
||||||
|
return '';
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the target notes from relations
|
||||||
|
const linkedNotes = relations
|
||||||
|
.map(relation => relation.targetNote)
|
||||||
|
.filter(note => note !== null && note !== undefined);
|
||||||
|
|
||||||
|
if (linkedNotes.length === 0) {
|
||||||
|
return '';
|
||||||
|
}
|
||||||
|
|
||||||
let context = 'This note has relationships with these notes:\n\n';
|
let context = 'This note has relationships with these notes:\n\n';
|
||||||
|
|
||||||
for (const linked of linkedNotes) {
|
// Limit to maxLinks
|
||||||
|
const notesToShow = linkedNotes.slice(0, maxLinks);
|
||||||
|
|
||||||
|
for (const linked of notesToShow) {
|
||||||
context += `- ${linked.title}\n`;
|
context += `- ${linked.title}\n`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If there are more linked notes than we're showing, indicate that
|
||||||
|
if (linkedNotes.length > maxLinks) {
|
||||||
|
context += `\n(+ ${linkedNotes.length - maxLinks} more linked notes)\n`;
|
||||||
|
}
|
||||||
|
|
||||||
return context + '\n';
|
return context + '\n';
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -669,27 +722,41 @@ export class ContextExtractor {
|
|||||||
*/
|
*/
|
||||||
private async getParentNotes(noteId: string, maxDepth: number): Promise<{noteId: string, title: string}[]> {
|
private async getParentNotes(noteId: string, maxDepth: number): Promise<{noteId: string, title: string}[]> {
|
||||||
const parentNotes: {noteId: string, title: string}[] = [];
|
const parentNotes: {noteId: string, title: string}[] = [];
|
||||||
let currentNoteId = noteId;
|
const startNote = becca.getNote(noteId);
|
||||||
|
|
||||||
|
if (!startNote) {
|
||||||
|
return parentNotes;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use non-null assertion as we checked above
|
||||||
|
let currentNote: any = startNote;
|
||||||
|
|
||||||
for (let i = 0; i < maxDepth; i++) {
|
for (let i = 0; i < maxDepth; i++) {
|
||||||
const parent = await sql.getRow<{parentNoteId: string, title: string}>(
|
// Get parent branches (should be just one in most cases)
|
||||||
`SELECT branches.parentNoteId, notes.title
|
if (!currentNote) break;
|
||||||
FROM branches
|
|
||||||
JOIN notes ON branches.parentNoteId = notes.noteId
|
|
||||||
WHERE branches.noteId = ? AND branches.isDeleted = 0 LIMIT 1`,
|
|
||||||
[currentNoteId]
|
|
||||||
);
|
|
||||||
|
|
||||||
if (!parent || parent.parentNoteId === 'root') {
|
const parentBranches: any[] = currentNote.getParentBranches();
|
||||||
|
|
||||||
|
if (!parentBranches || parentBranches.length === 0) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use the first parent branch
|
||||||
|
const branch: any = parentBranches[0];
|
||||||
|
if (!branch) break;
|
||||||
|
|
||||||
|
const parentNote: any = branch.getParentNote();
|
||||||
|
|
||||||
|
if (!parentNote || parentNote.noteId === 'root') {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
parentNotes.unshift({
|
parentNotes.unshift({
|
||||||
noteId: parent.parentNoteId,
|
noteId: parentNote.noteId,
|
||||||
title: parent.title
|
title: parentNote.title
|
||||||
});
|
});
|
||||||
|
|
||||||
currentNoteId = parent.parentNoteId;
|
currentNote = parentNote;
|
||||||
}
|
}
|
||||||
|
|
||||||
return parentNotes;
|
return parentNotes;
|
||||||
|
@ -410,25 +410,69 @@ export async function getNoteEmbeddingContext(noteId: string): Promise<NoteEmbed
|
|||||||
let content = "";
|
let content = "";
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Get raw content from the note
|
// Use the enhanced context extractor for improved content extraction
|
||||||
const rawContent = String(await note.getContent() || "");
|
// We're using a dynamic import to avoid circular dependencies
|
||||||
|
const { default: contextExtractor } = await import('../../llm/context_extractor.js');
|
||||||
|
|
||||||
// Process the content based on note type to extract meaningful text
|
// Get the content using the enhanced formatNoteContent method in context extractor
|
||||||
if (note.type === 'text' || note.type === 'code') {
|
const noteContent = await contextExtractor.getNoteContent(noteId);
|
||||||
content = rawContent;
|
|
||||||
} else if (['canvas', 'mindMap', 'relationMap', 'mermaid', 'geoMap'].includes(note.type)) {
|
if (noteContent) {
|
||||||
// Process structured content types
|
content = noteContent;
|
||||||
content = extractStructuredContent(rawContent, note.type, note.mime);
|
|
||||||
} else if (note.type === 'image' || note.type === 'file') {
|
// For large content, consider chunking or summarization
|
||||||
content = `[${note.type} attachment: ${note.mime}]`;
|
if (content.length > 10000) {
|
||||||
|
// Large content handling options:
|
||||||
|
|
||||||
|
// Option 1: Use our summarization feature
|
||||||
|
const summary = await contextExtractor.getNoteSummary(noteId);
|
||||||
|
if (summary) {
|
||||||
|
content = summary;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Option 2: Alternative approach - use the first chunk if summarization fails
|
||||||
|
if (content.length > 10000) {
|
||||||
|
const chunks = await contextExtractor.getChunkedNoteContent(noteId);
|
||||||
|
if (chunks && chunks.length > 0) {
|
||||||
|
// Use the first chunk (most relevant/beginning)
|
||||||
|
content = chunks[0];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Fallback to original method if context extractor fails
|
||||||
|
const rawContent = String(await note.getContent() || "");
|
||||||
|
|
||||||
|
// Process the content based on note type to extract meaningful text
|
||||||
|
if (note.type === 'text' || note.type === 'code') {
|
||||||
|
content = rawContent;
|
||||||
|
} else if (['canvas', 'mindMap', 'relationMap', 'mermaid', 'geoMap'].includes(note.type)) {
|
||||||
|
// Process structured content types
|
||||||
|
content = extractStructuredContent(rawContent, note.type, note.mime);
|
||||||
|
} else if (note.type === 'image' || note.type === 'file') {
|
||||||
|
content = `[${note.type} attachment: ${note.mime}]`;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clean the content to remove HTML tags and normalize whitespace
|
||||||
|
content = cleanNoteContent(content, note.type, note.mime);
|
||||||
}
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.error(`Error getting content for note ${noteId}:`, err);
|
console.error(`Error getting content for note ${noteId}:`, err);
|
||||||
content = `[Error extracting content]`;
|
content = `[Error extracting content]`;
|
||||||
}
|
|
||||||
|
|
||||||
// Clean the content to remove HTML tags and normalize whitespace
|
// Try fallback to original method
|
||||||
content = cleanNoteContent(content, note.type, note.mime);
|
try {
|
||||||
|
const rawContent = String(await note.getContent() || "");
|
||||||
|
if (note.type === 'text' || note.type === 'code') {
|
||||||
|
content = rawContent;
|
||||||
|
} else if (['canvas', 'mindMap', 'relationMap', 'mermaid', 'geoMap'].includes(note.type)) {
|
||||||
|
content = extractStructuredContent(rawContent, note.type, note.mime);
|
||||||
|
}
|
||||||
|
content = cleanNoteContent(content, note.type, note.mime);
|
||||||
|
} catch (fallbackErr) {
|
||||||
|
console.error(`Fallback content extraction also failed for note ${noteId}:`, fallbackErr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Get template/inheritance relationships
|
// Get template/inheritance relationships
|
||||||
// This is from FNote.getNotesToInheritAttributesFrom - recreating similar logic for BNote
|
// This is from FNote.getNotesToInheritAttributesFrom - recreating similar logic for BNote
|
||||||
@ -490,19 +534,35 @@ export async function queueNoteForEmbedding(noteId: string, operation = 'UPDATE'
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Deletes all embeddings for a note
|
* Delete embeddings for a note
|
||||||
|
*
|
||||||
|
* @param noteId - The ID of the note
|
||||||
|
* @param providerId - Optional provider ID to delete embeddings only for a specific provider
|
||||||
|
* @param modelId - Optional model ID to delete embeddings only for a specific model
|
||||||
*/
|
*/
|
||||||
export async function deleteNoteEmbeddings(noteId: string) {
|
export async function deleteNoteEmbeddings(noteId: string, providerId?: string, modelId?: string) {
|
||||||
await sql.execute(
|
let query = "DELETE FROM note_embeddings WHERE noteId = ?";
|
||||||
"DELETE FROM note_embeddings WHERE noteId = ?",
|
const params: any[] = [noteId];
|
||||||
[noteId]
|
|
||||||
);
|
|
||||||
|
|
||||||
// Remove from queue if present
|
if (providerId) {
|
||||||
await sql.execute(
|
query += " AND providerId = ?";
|
||||||
"DELETE FROM embedding_queue WHERE noteId = ?",
|
params.push(providerId);
|
||||||
[noteId]
|
|
||||||
);
|
if (modelId) {
|
||||||
|
query += " AND modelId = ?";
|
||||||
|
params.push(modelId);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
await sql.execute(query, params);
|
||||||
|
|
||||||
|
// Only remove from queue if deleting all embeddings for the note
|
||||||
|
if (!providerId) {
|
||||||
|
await sql.execute(
|
||||||
|
"DELETE FROM embedding_queue WHERE noteId = ?",
|
||||||
|
[noteId]
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -559,15 +619,28 @@ export async function processEmbeddingQueue() {
|
|||||||
// Get note context for embedding
|
// Get note context for embedding
|
||||||
const context = await getNoteEmbeddingContext(noteData.noteId);
|
const context = await getNoteEmbeddingContext(noteData.noteId);
|
||||||
|
|
||||||
|
// Check if we should use chunking for large content
|
||||||
|
const useChunking = context.content.length > 5000; // Use chunking for large notes by default
|
||||||
|
|
||||||
// Process with each enabled provider
|
// Process with each enabled provider
|
||||||
for (const provider of enabledProviders) {
|
for (const provider of enabledProviders) {
|
||||||
try {
|
try {
|
||||||
// Generate embedding
|
if (useChunking) {
|
||||||
const embedding = await provider.generateNoteEmbeddings(context);
|
// Enhanced approach: Process large notes using chunking
|
||||||
|
await processNoteWithChunking(noteData.noteId, provider, context);
|
||||||
|
} else {
|
||||||
|
// Standard approach: Generate a single embedding for the whole note
|
||||||
|
const embedding = await provider.generateNoteEmbeddings(context);
|
||||||
|
|
||||||
// Store embedding
|
// Store embedding
|
||||||
const config = provider.getConfig();
|
const config = provider.getConfig();
|
||||||
await storeNoteEmbedding(noteData.noteId, provider.name, config.model, embedding);
|
await storeNoteEmbedding(
|
||||||
|
noteData.noteId,
|
||||||
|
provider.name,
|
||||||
|
config.model,
|
||||||
|
embedding
|
||||||
|
);
|
||||||
|
}
|
||||||
} catch (providerError: any) {
|
} catch (providerError: any) {
|
||||||
log.error(`Error generating embedding with provider ${provider.name} for note ${noteData.noteId}: ${providerError.message || 'Unknown error'}`);
|
log.error(`Error generating embedding with provider ${provider.name} for note ${noteData.noteId}: ${providerError.message || 'Unknown error'}`);
|
||||||
}
|
}
|
||||||
@ -748,6 +821,78 @@ export async function getEmbeddingStats() {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Process a large note by breaking it into chunks and creating embeddings for each chunk
|
||||||
|
* This provides more detailed and focused embeddings for different parts of large notes
|
||||||
|
*
|
||||||
|
* @param noteId - The ID of the note to process
|
||||||
|
* @param provider - The embedding provider to use
|
||||||
|
* @param context - The note context data
|
||||||
|
*/
|
||||||
|
async function processNoteWithChunking(
|
||||||
|
noteId: string,
|
||||||
|
provider: any,
|
||||||
|
context: NoteEmbeddingContext
|
||||||
|
): Promise<void> {
|
||||||
|
try {
|
||||||
|
// Get the context extractor dynamically to avoid circular dependencies
|
||||||
|
const { default: contextExtractor } = await import('../../llm/context_extractor.js');
|
||||||
|
|
||||||
|
// Get chunks of the note content
|
||||||
|
const chunks = await contextExtractor.getChunkedNoteContent(noteId);
|
||||||
|
|
||||||
|
if (!chunks || chunks.length === 0) {
|
||||||
|
// Fall back to single embedding if chunking fails
|
||||||
|
const embedding = await provider.generateNoteEmbeddings(context);
|
||||||
|
const config = provider.getConfig();
|
||||||
|
await storeNoteEmbedding(noteId, provider.name, config.model, embedding);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Generate and store embeddings for each chunk
|
||||||
|
const config = provider.getConfig();
|
||||||
|
|
||||||
|
// Delete existing embeddings first to avoid duplicates
|
||||||
|
await deleteNoteEmbeddings(noteId, provider.name, config.model);
|
||||||
|
|
||||||
|
// Process each chunk with a slight delay to avoid rate limits
|
||||||
|
for (let i = 0; i < chunks.length; i++) {
|
||||||
|
const chunk = chunks[i];
|
||||||
|
|
||||||
|
// Create a modified context object with just this chunk's content
|
||||||
|
const chunkContext: NoteEmbeddingContext = {
|
||||||
|
...context,
|
||||||
|
content: chunk
|
||||||
|
};
|
||||||
|
|
||||||
|
// Generate embedding for this chunk
|
||||||
|
const embedding = await provider.generateNoteEmbeddings(chunkContext);
|
||||||
|
|
||||||
|
// Store with chunk information
|
||||||
|
await storeNoteEmbedding(
|
||||||
|
noteId,
|
||||||
|
provider.name,
|
||||||
|
config.model,
|
||||||
|
embedding
|
||||||
|
);
|
||||||
|
|
||||||
|
// Small delay between chunks to avoid rate limits
|
||||||
|
if (i < chunks.length - 1) {
|
||||||
|
await new Promise(resolve => setTimeout(resolve, 100));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
log.info(`Generated ${chunks.length} chunk embeddings for note ${noteId}`);
|
||||||
|
} catch (error: any) {
|
||||||
|
log.error(`Error in chunked embedding process for note ${noteId}: ${error.message || 'Unknown error'}`);
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
export function cleanupEmbeddings() {
|
||||||
|
// Cleanup function implementation
|
||||||
|
}
|
||||||
|
|
||||||
export default {
|
export default {
|
||||||
cosineSimilarity,
|
cosineSimilarity,
|
||||||
embeddingToBuffer,
|
embeddingToBuffer,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user