diff --git a/src/services/llm/embeddings/base_embeddings.ts b/src/services/llm/embeddings/base_embeddings.ts index 86cd56618..be2b1a2a9 100644 --- a/src/services/llm/embeddings/base_embeddings.ts +++ b/src/services/llm/embeddings/base_embeddings.ts @@ -18,45 +18,89 @@ export abstract class BaseEmbeddingProvider implements EmbeddingProvider { abstract generateEmbeddings(text: string): Promise; abstract generateBatchEmbeddings(texts: string[]): Promise; + /** + * Cleans and normalizes text for embeddings by removing excessive whitespace + */ + private cleanText(text: string): string { + return text.replace(/\s+/g, ' ').trim(); + } + /** * Generates a rich text representation of a note's context for embedding */ protected generateNoteContextText(context: NoteEmbeddingContext): string { - const parts = [ - `Title: ${context.title}`, - `Type: ${context.type}`, - `MIME: ${context.mime}`, - `Created: ${context.dateCreated}`, - `Modified: ${context.dateModified}` - ]; + // Start with core note information + let result = + `Title: ${this.cleanText(context.title)}\n` + + `Type: ${context.type}\n` + + `MIME: ${context.mime}\n` + + `Created: ${context.dateCreated}\n` + + `Modified: ${context.dateModified}\n`; + // Add attributes in a concise format if (context.attributes.length > 0) { - parts.push('Attributes:'); - for (const attr of context.attributes) { - parts.push(` ${attr.type} - ${attr.name}: ${attr.value}`); - } + result += 'Attributes: '; + const attributeTexts = context.attributes.map(attr => + `${attr.type}:${attr.name}=${this.cleanText(attr.value)}` + ); + result += attributeTexts.join('; ') + '\n'; } + // Add important label values concisely + if (context.labelValues && Object.keys(context.labelValues).length > 0) { + result += 'Labels: '; + const labelTexts = Object.entries(context.labelValues).map(([name, value]) => + `${name}=${this.cleanText(value)}` + ); + result += labelTexts.join('; ') + '\n'; + } + + // Add parents concisely if (context.parentTitles.length > 0) { - parts.push('Parent Notes:'); - parts.push(...context.parentTitles.map(t => ` ${t}`)); + result += `Parents: ${context.parentTitles.map(t => this.cleanText(t)).join('; ')}\n`; } + // Add children concisely if (context.childTitles.length > 0) { - parts.push('Child Notes:'); - parts.push(...context.childTitles.map(t => ` ${t}`)); + result += `Children: ${context.childTitles.map(t => this.cleanText(t)).join('; ')}\n`; } + // Add template/inheritance relationships concisely + if (context.templateTitles && context.templateTitles.length > 0) { + result += `Templates: ${context.templateTitles.map(t => this.cleanText(t)).join('; ')}\n`; + } + + // Add related notes concisely + if (context.relatedNotes && context.relatedNotes.length > 0) { + result += 'Related: '; + const relatedTexts = context.relatedNotes.map(rel => + `${rel.relationName}→${this.cleanText(rel.targetTitle)}` + ); + result += relatedTexts.join('; ') + '\n'; + } + + // Add backlinks concisely + if (context.backlinks && context.backlinks.length > 0) { + result += 'Referenced By: '; + const backlinkTexts = context.backlinks.map(link => + `${this.cleanText(link.sourceTitle)}→${link.relationName}` + ); + result += backlinkTexts.join('; ') + '\n'; + } + + // Add attachments concisely if (context.attachments.length > 0) { - parts.push('Attachments:'); - for (const att of context.attachments) { - parts.push(` ${att.title} (${att.mime})`); - } + result += 'Attachments: '; + const attachmentTexts = context.attachments.map(att => + `${this.cleanText(att.title)}(${att.mime})` + ); + result += attachmentTexts.join('; ') + '\n'; } - parts.push('Content:', context.content); + // Add content (already cleaned in getNoteEmbeddingContext) + result += `Content: ${context.content}`; - return parts.join('\n'); + return result; } /** diff --git a/src/services/llm/embeddings/embeddings_interface.ts b/src/services/llm/embeddings/embeddings_interface.ts index 8f320e066..7a6b11bfb 100644 --- a/src/services/llm/embeddings/embeddings_interface.ts +++ b/src/services/llm/embeddings/embeddings_interface.ts @@ -22,6 +22,18 @@ export interface NoteEmbeddingContext { title: string; mime: string; }[]; + backlinks?: { + sourceNoteId: string; + sourceTitle: string; + relationName: string; + }[]; + relatedNotes?: { + targetNoteId: string; + targetTitle: string; + relationName: string; + }[]; + labelValues?: Record; + templateTitles?: string[]; } /** diff --git a/src/services/llm/embeddings/vector_store.ts b/src/services/llm/embeddings/vector_store.ts index 3f17c0227..3c767bb83 100644 --- a/src/services/llm/embeddings/vector_store.ts +++ b/src/services/llm/embeddings/vector_store.ts @@ -7,6 +7,7 @@ import becca from "../../../becca/becca.js"; import type { NoteEmbeddingContext } from "./embeddings_interface.js"; import { getEmbeddingProviders, getEnabledEmbeddingProviders } from "./providers.js"; import eventService from "../../events.js"; +import type BNote from "../../../becca/entities/bnote.js"; // Type definition for embedding result interface EmbeddingResult { @@ -178,6 +179,31 @@ export async function findSimilarNotes( .slice(0, limit); } +/** + * Clean note content by removing HTML tags and normalizing whitespace + */ +function cleanNoteContent(content: string, type: string, mime: string): string { + // If it's HTML content, remove HTML tags + if ((type === 'text' && mime === 'text/html') || content.includes('
') || content.includes('

')) { + // Simple tag removal - for more complex HTML parsing, consider using a proper HTML parser + content = content.replace(/<[^>]*>/g, ' '); // Replace tags with a space + } + + // Normalize whitespace (replace multiple spaces/newlines with single space) + content = content.replace(/\s+/g, ' '); + + // Trim the content + content = content.trim(); + + // Truncate if extremely long (optional, adjust limit as needed) + const MAX_CONTENT_LENGTH = 10000; + if (content.length > MAX_CONTENT_LENGTH) { + content = content.substring(0, MAX_CONTENT_LENGTH) + ' [content truncated]'; + } + + return content; +} + /** * Gets context for a note to be embedded */ @@ -196,13 +222,58 @@ export async function getNoteEmbeddingContext(noteId: string): Promise note.title); - // Get attributes - const attributes = note.getOwnedAttributes().map(attr => ({ + // Get all attributes (not just owned ones) + const attributes = note.getAttributes().map(attr => ({ type: attr.type, name: attr.name, value: attr.value })); + // Get backlinks (notes that reference this note through relations) + const targetRelations = note.getTargetRelations(); + const backlinks = targetRelations + .map(relation => { + const sourceNote = relation.getNote(); + if (sourceNote && sourceNote.type !== 'search') { // Filter out search notes + return { + sourceNoteId: sourceNote.noteId, + sourceTitle: sourceNote.title, + relationName: relation.name + }; + } + return null; + }) + .filter((item): item is { sourceNoteId: string; sourceTitle: string; relationName: string } => item !== null); + + // Get related notes through relations + const relations = note.getRelations(); + const relatedNotes = relations + .map(relation => { + const targetNote = relation.targetNote; + if (targetNote) { + return { + targetNoteId: targetNote.noteId, + targetTitle: targetNote.title, + relationName: relation.name + }; + } + return null; + }) + .filter((item): item is { targetNoteId: string; targetTitle: string; relationName: string } => item !== null); + + // Extract important labels that might affect semantics + const labelValues: Record = {}; + const labels = note.getLabels(); + for (const label of labels) { + // Skip CSS and UI-related labels that don't affect semantics + if (!label.name.startsWith('css') && + !label.name.startsWith('workspace') && + !label.name.startsWith('hide') && + !label.name.startsWith('collapsed')) { + labelValues[label.name] = label.value; + } + } + // Get attachments const attachments = note.getAttachments().map(att => ({ title: att.title, @@ -219,6 +290,17 @@ export async function getNoteEmbeddingContext(noteId: string): Promise rel.targetNote) + .filter((note): note is BNote => note !== undefined) + .map(templateNote => templateNote.title); + return { noteId: note.noteId, title: note.title, @@ -230,7 +312,11 @@ export async function getNoteEmbeddingContext(noteId: string): Promise