implement chunking and use becca for some functionality

This commit is contained in:
perf3ct 2025-03-11 18:17:41 +00:00
parent 4160db9728
commit 0985cec8d6
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
2 changed files with 282 additions and 70 deletions

View File

@ -1,5 +1,6 @@
import sql from '../sql.js';
import sanitizeHtml from 'sanitize-html';
import becca from '../../becca/becca.js';
/**
* Utility class for extracting context from notes to provide to AI models
@ -10,19 +11,27 @@ export class ContextExtractor {
* Get the content of a note
*/
async getNoteContent(noteId: string): Promise<string | null> {
    // Looks the note up through Becca and returns its content passed through
    // formatNoteContent together with the note's type, mime and title.
    // Returns null when the note is not found or when reading/formatting throws.
    const note = becca.getNote(noteId);
    if (!note) {
        return null;
    }

    try {
        // getContent() may yield a non-string value; coerce it so the
        // formatter always receives a string (empty when there is no content).
        const content = String(await note.getContent() || "");

        return this.formatNoteContent(
            content,
            note.type,
            note.mime,
            note.title
        );
    } catch (error) {
        // Best-effort: a failure for one note is logged and reported as "no content".
        console.error(`Error getting content for note ${noteId}:`, error);
        return null;
    }
}
/**
@ -181,13 +190,27 @@ export class ContextExtractor {
* Get a set of parent notes to provide hierarchical context
*/
async getParentContext(noteId: string, maxDepth = 3): Promise<string> {
// Note: getParentNotes has already been updated to use Becca
const parents = await this.getParentNotes(noteId, maxDepth);
if (!parents.length) return '';
let context = 'Here is the hierarchical context for the current note:\n\n';
for (const parent of parents) {
context += `- ${parent.title}\n`;
// Create a hierarchical view of the parents using indentation
// to show the proper parent-child relationship
let indentLevel = 0;
for (let i = 0; i < parents.length; i++) {
const parent = parents[i];
const indent = ' '.repeat(indentLevel);
context += `${indent}- ${parent.title}\n`;
indentLevel++;
}
// Now add the current note with proper indentation
const note = becca.getNote(noteId);
if (note) {
const indent = ' '.repeat(indentLevel);
context += `${indent}- ${note.title} (current note)\n`;
}
return context + '\n';
@ -197,21 +220,33 @@ export class ContextExtractor {
* Get child notes to provide additional context
*/
async getChildContext(noteId: string, maxChildren = 5): Promise<string> {
    // Builds a bullet list of the titles of up to `maxChildren` child notes,
    // followed by a "(+ N more child notes)" line when some were left out.
    // Returns '' when the note does not exist or has no children.
    const note = becca.getNote(noteId);
    if (!note) {
        return '';
    }

    const childNotes = note.getChildNotes();
    if (!childNotes || childNotes.length === 0) {
        return '';
    }

    let context = 'The current note has these child notes:\n\n';

    // Only the first `maxChildren` children are listed.
    for (const child of childNotes.slice(0, maxChildren)) {
        context += `- ${child.title}\n`;
    }

    // Tell the reader how many children were omitted from the list.
    if (childNotes.length > maxChildren) {
        context += `\n(+ ${childNotes.length - maxChildren} more child notes)\n`;
    }

    return context + '\n';
}
@ -219,24 +254,42 @@ export class ContextExtractor {
* Get notes linked to this note
*/
async getLinkedNotesContext(noteId: string, maxLinks = 5): Promise<string> {
    // Builds a bullet list of the titles of up to `maxLinks` notes targeted by
    // this note's relations, followed by a "(+ N more linked notes)" line when
    // some were left out. Returns '' when the note does not exist, has no
    // relations, or none of its relations resolves to a target note.
    const note = becca.getNote(noteId);
    if (!note) {
        return '';
    }

    const relations = note.getRelations();
    if (!relations || relations.length === 0) {
        return '';
    }

    // Keep only relations whose target resolves, and list each target once even
    // when several relations point at the same note.
    const seenNoteIds = new Set<string>();
    const linkedNotes = relations
        .flatMap(relation => (relation.targetNote ? [relation.targetNote] : []))
        .filter(target => {
            if (seenNoteIds.has(target.noteId)) {
                return false;
            }
            seenNoteIds.add(target.noteId);
            return true;
        });

    if (linkedNotes.length === 0) {
        return '';
    }

    let context = 'This note has relationships with these notes:\n\n';

    // Only the first `maxLinks` targets are listed.
    for (const linked of linkedNotes.slice(0, maxLinks)) {
        context += `- ${linked.title}\n`;
    }

    // Tell the reader how many linked notes were omitted from the list.
    if (linkedNotes.length > maxLinks) {
        context += `\n(+ ${linkedNotes.length - maxLinks} more linked notes)\n`;
    }

    return context + '\n';
}
@ -669,27 +722,41 @@ export class ContextExtractor {
*/
private async getParentNotes(noteId: string, maxDepth: number): Promise<{noteId: string, title: string}[]> {
const parentNotes: {noteId: string, title: string}[] = [];
let currentNoteId = noteId;
const startNote = becca.getNote(noteId);
if (!startNote) {
return parentNotes;
}
// Use non-null assertion as we checked above
let currentNote: any = startNote;
for (let i = 0; i < maxDepth; i++) {
const parent = await sql.getRow<{parentNoteId: string, title: string}>(
`SELECT branches.parentNoteId, notes.title
FROM branches
JOIN notes ON branches.parentNoteId = notes.noteId
WHERE branches.noteId = ? AND branches.isDeleted = 0 LIMIT 1`,
[currentNoteId]
);
// Get parent branches (should be just one in most cases)
if (!currentNote) break;
if (!parent || parent.parentNoteId === 'root') {
const parentBranches: any[] = currentNote.getParentBranches();
if (!parentBranches || parentBranches.length === 0) {
break;
}
// Use the first parent branch
const branch: any = parentBranches[0];
if (!branch) break;
const parentNote: any = branch.getParentNote();
if (!parentNote || parentNote.noteId === 'root') {
break;
}
parentNotes.unshift({
noteId: parent.parentNoteId,
title: parent.title
noteId: parentNote.noteId,
title: parentNote.title
});
currentNoteId = parent.parentNoteId;
currentNote = parentNote;
}
return parentNotes;

View File

@ -410,7 +410,37 @@ export async function getNoteEmbeddingContext(noteId: string): Promise<NoteEmbed
let content = "";
try {
// Get raw content from the note
// Use the enhanced context extractor for improved content extraction
// We're using a dynamic import to avoid circular dependencies
const { default: contextExtractor } = await import('../../llm/context_extractor.js');
// Get the content using the enhanced formatNoteContent method in context extractor
const noteContent = await contextExtractor.getNoteContent(noteId);
if (noteContent) {
content = noteContent;
// For large content, consider chunking or summarization
if (content.length > 10000) {
// Large content handling options:
// Option 1: Use our summarization feature
const summary = await contextExtractor.getNoteSummary(noteId);
if (summary) {
content = summary;
}
// Option 2: Alternative approach - use the first chunk if summarization fails
if (content.length > 10000) {
const chunks = await contextExtractor.getChunkedNoteContent(noteId);
if (chunks && chunks.length > 0) {
// Use the first chunk (most relevant/beginning)
content = chunks[0];
}
}
}
} else {
// Fallback to original method if context extractor fails
const rawContent = String(await note.getContent() || "");
// Process the content based on note type to extract meaningful text
@ -422,13 +452,27 @@ export async function getNoteEmbeddingContext(noteId: string): Promise<NoteEmbed
} else if (note.type === 'image' || note.type === 'file') {
content = `[${note.type} attachment: ${note.mime}]`;
}
} catch (err) {
console.error(`Error getting content for note ${noteId}:`, err);
content = `[Error extracting content]`;
}
// Clean the content to remove HTML tags and normalize whitespace
content = cleanNoteContent(content, note.type, note.mime);
}
} catch (err) {
console.error(`Error getting content for note ${noteId}:`, err);
content = `[Error extracting content]`;
// Try fallback to original method
try {
const rawContent = String(await note.getContent() || "");
if (note.type === 'text' || note.type === 'code') {
content = rawContent;
} else if (['canvas', 'mindMap', 'relationMap', 'mermaid', 'geoMap'].includes(note.type)) {
content = extractStructuredContent(rawContent, note.type, note.mime);
}
content = cleanNoteContent(content, note.type, note.mime);
} catch (fallbackErr) {
console.error(`Fallback content extraction also failed for note ${noteId}:`, fallbackErr);
}
}
// Get template/inheritance relationships
// This is from FNote.getNotesToInheritAttributesFrom - recreating similar logic for BNote
@ -490,19 +534,35 @@ export async function queueNoteForEmbedding(noteId: string, operation = 'UPDATE'
}
/**
 * Delete embeddings for a note.
 *
 * With only `noteId`, every embedding row for the note is deleted and the note
 * is also removed from the embedding queue. With `providerId` (and optionally
 * `modelId`) only the matching embedding rows are deleted and the queue is left
 * untouched. `modelId` is ignored unless `providerId` is also given.
 *
 * @param noteId - The ID of the note
 * @param providerId - Optional provider ID to delete embeddings only for a specific provider
 * @param modelId - Optional model ID to delete embeddings only for a specific model
 */
export async function deleteNoteEmbeddings(noteId: string, providerId?: string, modelId?: string): Promise<void> {
    let query = "DELETE FROM note_embeddings WHERE noteId = ?";
    const params: string[] = [noteId];

    // Narrow the delete to one provider, and within it optionally to one model.
    if (providerId) {
        query += " AND providerId = ?";
        params.push(providerId);

        if (modelId) {
            query += " AND modelId = ?";
            params.push(modelId);
        }
    }

    await sql.execute(query, params);

    // Only remove from queue if deleting all embeddings for the note
    if (!providerId) {
        await sql.execute(
            "DELETE FROM embedding_queue WHERE noteId = ?",
            [noteId]
        );
    }
}
/**
@ -559,15 +619,28 @@ export async function processEmbeddingQueue() {
// Get note context for embedding
const context = await getNoteEmbeddingContext(noteData.noteId);
// Check if we should use chunking for large content
const useChunking = context.content.length > 5000; // Use chunking for large notes by default
// Process with each enabled provider
for (const provider of enabledProviders) {
try {
// Generate embedding
if (useChunking) {
// Enhanced approach: Process large notes using chunking
await processNoteWithChunking(noteData.noteId, provider, context);
} else {
// Standard approach: Generate a single embedding for the whole note
const embedding = await provider.generateNoteEmbeddings(context);
// Store embedding
const config = provider.getConfig();
await storeNoteEmbedding(noteData.noteId, provider.name, config.model, embedding);
await storeNoteEmbedding(
noteData.noteId,
provider.name,
config.model,
embedding
);
}
} catch (providerError: any) {
log.error(`Error generating embedding with provider ${provider.name} for note ${noteData.noteId}: ${providerError.message || 'Unknown error'}`);
}
@ -748,6 +821,78 @@ export async function getEmbeddingStats() {
};
}
/**
 * Process a large note by breaking it into chunks and creating an embedding for each chunk.
 *
 * Chunks come from the context extractor's `getChunkedNoteContent`. When it
 * returns nothing, a single embedding is generated from `context` and stored
 * instead. Otherwise all chunk embeddings are generated first; only once every
 * one has succeeded are the existing embeddings for this provider/model deleted
 * and the new ones stored, so a provider failure part-way through does not wipe
 * the previously stored embeddings.
 *
 * @param noteId - The ID of the note to process
 * @param provider - The embedding provider to use
 * @param context - The note context data
 * @throws Rethrows, after logging, any error raised while chunking, generating or storing
 */
async function processNoteWithChunking(
    noteId: string,
    provider: any,
    context: NoteEmbeddingContext
): Promise<void> {
    try {
        // Get the context extractor dynamically to avoid circular dependencies
        const { default: contextExtractor } = await import('../../llm/context_extractor.js');

        const chunks = await contextExtractor.getChunkedNoteContent(noteId);

        if (!chunks || chunks.length === 0) {
            // Fall back to a single embedding if chunking produced nothing
            const embedding = await provider.generateNoteEmbeddings(context);
            const config = provider.getConfig();
            await storeNoteEmbedding(noteId, provider.name, config.model, embedding);
            return;
        }

        const config = provider.getConfig();

        // Phase 1: generate every chunk embedding before touching stored data.
        const embeddings: Parameters<typeof storeNoteEmbedding>[3][] = [];
        for (let i = 0; i < chunks.length; i++) {
            // Same context as the whole note, with the content swapped for this chunk
            const chunkContext: NoteEmbeddingContext = {
                ...context,
                content: chunks[i]
            };

            embeddings.push(await provider.generateNoteEmbeddings(chunkContext));

            // Small delay between provider calls to avoid rate limits
            if (i < chunks.length - 1) {
                await new Promise(resolve => setTimeout(resolve, 100));
            }
        }

        // Phase 2: replace the stored embeddings for this provider/model.
        await deleteNoteEmbeddings(noteId, provider.name, config.model);

        // NOTE(review): every chunk is stored under the same noteId/provider/model and no
        // chunk index is passed — confirm storeNoteEmbedding inserts a new row per call
        // rather than overwriting the previous one.
        for (const embedding of embeddings) {
            await storeNoteEmbedding(
                noteId,
                provider.name,
                config.model,
                embedding
            );
        }

        log.info(`Generated ${chunks.length} chunk embeddings for note ${noteId}`);
    } catch (error: unknown) {
        const message = (error instanceof Error && error.message) || 'Unknown error';
        log.error(`Error in chunked embedding process for note ${noteId}: ${message}`);
        throw error;
    }
}
/**
 * Cleanup entry point for the embeddings module.
 *
 * The body is currently empty: calling it performs no work and returns nothing.
 */
export function cleanupEmbeddings(): void {
    // No cleanup logic is implemented here yet.
}
export default {
cosineSimilarity,
embeddingToBuffer,