mirror of
https://github.com/TriliumNext/Notes.git
synced 2025-08-10 18:39:22 +08:00
Create better relationships between notes, sanitize ridiculous spacing to save tokens
This commit is contained in:
parent
19bf741cd9
commit
7e232d17e1
@ -18,45 +18,89 @@ export abstract class BaseEmbeddingProvider implements EmbeddingProvider {
|
|||||||
abstract generateEmbeddings(text: string): Promise<Float32Array>;
|
abstract generateEmbeddings(text: string): Promise<Float32Array>;
|
||||||
abstract generateBatchEmbeddings(texts: string[]): Promise<Float32Array[]>;
|
abstract generateBatchEmbeddings(texts: string[]): Promise<Float32Array[]>;
|
||||||
|
|
||||||
|
/**
 * Normalizes a piece of text before it is placed into the embedding input.
 *
 * Leading and trailing whitespace is dropped and every internal run of
 * whitespace (spaces, tabs, newlines, ...) becomes exactly one space.
 *
 * @param text the raw text to normalize
 * @returns the text with whitespace collapsed and trimmed
 */
private cleanText(text: string): string {
    // Trim first, then split on whitespace runs and re-join with single spaces.
    const words = text.trim().split(/\s+/);
    return words.join(' ');
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generates a rich text representation of a note's context for embedding
|
* Generates a rich text representation of a note's context for embedding
|
||||||
*/
|
*/
|
||||||
protected generateNoteContextText(context: NoteEmbeddingContext): string {
|
protected generateNoteContextText(context: NoteEmbeddingContext): string {
|
||||||
const parts = [
|
// Start with core note information
|
||||||
`Title: ${context.title}`,
|
let result =
|
||||||
`Type: ${context.type}`,
|
`Title: ${this.cleanText(context.title)}\n` +
|
||||||
`MIME: ${context.mime}`,
|
`Type: ${context.type}\n` +
|
||||||
`Created: ${context.dateCreated}`,
|
`MIME: ${context.mime}\n` +
|
||||||
`Modified: ${context.dateModified}`
|
`Created: ${context.dateCreated}\n` +
|
||||||
];
|
`Modified: ${context.dateModified}\n`;
|
||||||
|
|
||||||
|
// Add attributes in a concise format
|
||||||
if (context.attributes.length > 0) {
|
if (context.attributes.length > 0) {
|
||||||
parts.push('Attributes:');
|
result += 'Attributes: ';
|
||||||
for (const attr of context.attributes) {
|
const attributeTexts = context.attributes.map(attr =>
|
||||||
parts.push(` ${attr.type} - ${attr.name}: ${attr.value}`);
|
`${attr.type}:${attr.name}=${this.cleanText(attr.value)}`
|
||||||
}
|
);
|
||||||
|
result += attributeTexts.join('; ') + '\n';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Add important label values concisely
|
||||||
|
if (context.labelValues && Object.keys(context.labelValues).length > 0) {
|
||||||
|
result += 'Labels: ';
|
||||||
|
const labelTexts = Object.entries(context.labelValues).map(([name, value]) =>
|
||||||
|
`${name}=${this.cleanText(value)}`
|
||||||
|
);
|
||||||
|
result += labelTexts.join('; ') + '\n';
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add parents concisely
|
||||||
if (context.parentTitles.length > 0) {
|
if (context.parentTitles.length > 0) {
|
||||||
parts.push('Parent Notes:');
|
result += `Parents: ${context.parentTitles.map(t => this.cleanText(t)).join('; ')}\n`;
|
||||||
parts.push(...context.parentTitles.map(t => ` ${t}`));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Add children concisely
|
||||||
if (context.childTitles.length > 0) {
|
if (context.childTitles.length > 0) {
|
||||||
parts.push('Child Notes:');
|
result += `Children: ${context.childTitles.map(t => this.cleanText(t)).join('; ')}\n`;
|
||||||
parts.push(...context.childTitles.map(t => ` ${t}`));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Add template/inheritance relationships concisely
|
||||||
|
if (context.templateTitles && context.templateTitles.length > 0) {
|
||||||
|
result += `Templates: ${context.templateTitles.map(t => this.cleanText(t)).join('; ')}\n`;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add related notes concisely
|
||||||
|
if (context.relatedNotes && context.relatedNotes.length > 0) {
|
||||||
|
result += 'Related: ';
|
||||||
|
const relatedTexts = context.relatedNotes.map(rel =>
|
||||||
|
`${rel.relationName}→${this.cleanText(rel.targetTitle)}`
|
||||||
|
);
|
||||||
|
result += relatedTexts.join('; ') + '\n';
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add backlinks concisely
|
||||||
|
if (context.backlinks && context.backlinks.length > 0) {
|
||||||
|
result += 'Referenced By: ';
|
||||||
|
const backlinkTexts = context.backlinks.map(link =>
|
||||||
|
`${this.cleanText(link.sourceTitle)}→${link.relationName}`
|
||||||
|
);
|
||||||
|
result += backlinkTexts.join('; ') + '\n';
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add attachments concisely
|
||||||
if (context.attachments.length > 0) {
|
if (context.attachments.length > 0) {
|
||||||
parts.push('Attachments:');
|
result += 'Attachments: ';
|
||||||
for (const att of context.attachments) {
|
const attachmentTexts = context.attachments.map(att =>
|
||||||
parts.push(` ${att.title} (${att.mime})`);
|
`${this.cleanText(att.title)}(${att.mime})`
|
||||||
}
|
);
|
||||||
|
result += attachmentTexts.join('; ') + '\n';
|
||||||
}
|
}
|
||||||
|
|
||||||
parts.push('Content:', context.content);
|
// Add content (already cleaned in getNoteEmbeddingContext)
|
||||||
|
result += `Content: ${context.content}`;
|
||||||
|
|
||||||
return parts.join('\n');
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -22,6 +22,18 @@ export interface NoteEmbeddingContext {
|
|||||||
title: string;
|
title: string;
|
||||||
mime: string;
|
mime: string;
|
||||||
}[];
|
}[];
|
||||||
|
backlinks?: {
|
||||||
|
sourceNoteId: string;
|
||||||
|
sourceTitle: string;
|
||||||
|
relationName: string;
|
||||||
|
}[];
|
||||||
|
relatedNotes?: {
|
||||||
|
targetNoteId: string;
|
||||||
|
targetTitle: string;
|
||||||
|
relationName: string;
|
||||||
|
}[];
|
||||||
|
labelValues?: Record<string, string>;
|
||||||
|
templateTitles?: string[];
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -7,6 +7,7 @@ import becca from "../../../becca/becca.js";
|
|||||||
import type { NoteEmbeddingContext } from "./embeddings_interface.js";
|
import type { NoteEmbeddingContext } from "./embeddings_interface.js";
|
||||||
import { getEmbeddingProviders, getEnabledEmbeddingProviders } from "./providers.js";
|
import { getEmbeddingProviders, getEnabledEmbeddingProviders } from "./providers.js";
|
||||||
import eventService from "../../events.js";
|
import eventService from "../../events.js";
|
||||||
|
import type BNote from "../../../becca/entities/bnote.js";
|
||||||
|
|
||||||
// Type definition for embedding result
|
// Type definition for embedding result
|
||||||
interface EmbeddingResult {
|
interface EmbeddingResult {
|
||||||
@ -178,6 +179,31 @@ export async function findSimilarNotes(
|
|||||||
.slice(0, limit);
|
.slice(0, limit);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Converts raw note content into compact plain text suitable for embedding.
 *
 * Processing steps, in order:
 *  1. If the content is treated as HTML (a `text` note with MIME `text/html`,
 *     or any content containing a literal `<div>` or `<p>`), every tag is
 *     replaced with a single space and the most common character entities
 *     (`&nbsp;`, `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, `&#39;`) are
 *     decoded, so they do not survive as literal noise in the output.
 *  2. Every run of whitespace is collapsed to one space and the result is trimmed.
 *  3. Content longer than 10000 UTF-16 code units is cut and suffixed with
 *     ` [content truncated]`. The cut never splits a surrogate pair.
 *
 * This is a simple regex-based cleanup, not a full HTML parser.
 *
 * @param content raw note content
 * @param type note type (e.g. `text`)
 * @param mime note MIME type (e.g. `text/html`)
 * @returns the cleaned, possibly truncated, plain-text content
 */
function cleanNoteContent(content: string, type: string, mime: string): string {
    // Upper bound on the cleaned content length (optional, adjust as needed).
    const MAX_CONTENT_LENGTH = 10000;

    const looksLikeHtml =
        (type === 'text' && mime === 'text/html') ||
        content.includes('<div>') ||
        content.includes('<p>');

    let cleaned = content;

    if (looksLikeHtml) {
        // Simple tag removal; each tag becomes a space so adjacent words stay separated.
        cleaned = cleaned.replace(/<[^>]*>/g, ' ');

        // Decode common entities in a single pass so that an escaped entity such as
        // "&amp;lt;" becomes "&lt;" and is not decoded a second time into "<".
        const entities: Record<string, string> = {
            'nbsp': ' ',
            'amp': '&',
            'lt': '<',
            'gt': '>',
            'quot': '"',
            'apos': "'",
            '#39': "'"
        };
        cleaned = cleaned.replace(
            /&(nbsp|amp|lt|gt|quot|apos|#39);/g,
            (match: string, name: string) => entities[name] ?? match
        );
    }

    // Normalize whitespace (multiple spaces/newlines become a single space) and trim.
    cleaned = cleaned.replace(/\s+/g, ' ').trim();

    if (cleaned.length > MAX_CONTENT_LENGTH) {
        let end = MAX_CONTENT_LENGTH;
        // If the cut would land between the two halves of a surrogate pair,
        // back up by one code unit so no lone high surrogate is left behind.
        const lastUnit = cleaned.charCodeAt(end - 1);
        if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
            end -= 1;
        }
        cleaned = cleaned.substring(0, end) + ' [content truncated]';
    }

    return cleaned;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets context for a note to be embedded
|
* Gets context for a note to be embedded
|
||||||
*/
|
*/
|
||||||
@ -196,13 +222,58 @@ export async function getNoteEmbeddingContext(noteId: string): Promise<NoteEmbed
|
|||||||
const childNotes = note.getChildNotes();
|
const childNotes = note.getChildNotes();
|
||||||
const childTitles = childNotes.map(note => note.title);
|
const childTitles = childNotes.map(note => note.title);
|
||||||
|
|
||||||
// Get attributes
|
// Get all attributes (not just owned ones)
|
||||||
const attributes = note.getOwnedAttributes().map(attr => ({
|
const attributes = note.getAttributes().map(attr => ({
|
||||||
type: attr.type,
|
type: attr.type,
|
||||||
name: attr.name,
|
name: attr.name,
|
||||||
value: attr.value
|
value: attr.value
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
// Get backlinks (notes that reference this note through relations)
|
||||||
|
const targetRelations = note.getTargetRelations();
|
||||||
|
const backlinks = targetRelations
|
||||||
|
.map(relation => {
|
||||||
|
const sourceNote = relation.getNote();
|
||||||
|
if (sourceNote && sourceNote.type !== 'search') { // Filter out search notes
|
||||||
|
return {
|
||||||
|
sourceNoteId: sourceNote.noteId,
|
||||||
|
sourceTitle: sourceNote.title,
|
||||||
|
relationName: relation.name
|
||||||
|
};
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
})
|
||||||
|
.filter((item): item is { sourceNoteId: string; sourceTitle: string; relationName: string } => item !== null);
|
||||||
|
|
||||||
|
// Get related notes through relations
|
||||||
|
const relations = note.getRelations();
|
||||||
|
const relatedNotes = relations
|
||||||
|
.map(relation => {
|
||||||
|
const targetNote = relation.targetNote;
|
||||||
|
if (targetNote) {
|
||||||
|
return {
|
||||||
|
targetNoteId: targetNote.noteId,
|
||||||
|
targetTitle: targetNote.title,
|
||||||
|
relationName: relation.name
|
||||||
|
};
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
})
|
||||||
|
.filter((item): item is { targetNoteId: string; targetTitle: string; relationName: string } => item !== null);
|
||||||
|
|
||||||
|
// Extract important labels that might affect semantics
|
||||||
|
const labelValues: Record<string, string> = {};
|
||||||
|
const labels = note.getLabels();
|
||||||
|
for (const label of labels) {
|
||||||
|
// Skip CSS and UI-related labels that don't affect semantics
|
||||||
|
if (!label.name.startsWith('css') &&
|
||||||
|
!label.name.startsWith('workspace') &&
|
||||||
|
!label.name.startsWith('hide') &&
|
||||||
|
!label.name.startsWith('collapsed')) {
|
||||||
|
labelValues[label.name] = label.value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Get attachments
|
// Get attachments
|
||||||
const attachments = note.getAttachments().map(att => ({
|
const attachments = note.getAttachments().map(att => ({
|
||||||
title: att.title,
|
title: att.title,
|
||||||
@ -219,6 +290,17 @@ export async function getNoteEmbeddingContext(noteId: string): Promise<NoteEmbed
|
|||||||
content = `[${note.type} attachment: ${note.mime}]`;
|
content = `[${note.type} attachment: ${note.mime}]`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Clean the content to remove HTML tags and normalize whitespace
|
||||||
|
content = cleanNoteContent(content, note.type, note.mime);
|
||||||
|
|
||||||
|
// Get template/inheritance relationships
|
||||||
|
// This is from FNote.getNotesToInheritAttributesFrom - recreating similar logic for BNote
|
||||||
|
const templateRelations = note.getRelations('template').concat(note.getRelations('inherit'));
|
||||||
|
const templateTitles = templateRelations
|
||||||
|
.map(rel => rel.targetNote)
|
||||||
|
.filter((note): note is BNote => note !== undefined)
|
||||||
|
.map(templateNote => templateNote.title);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
noteId: note.noteId,
|
noteId: note.noteId,
|
||||||
title: note.title,
|
title: note.title,
|
||||||
@ -230,7 +312,11 @@ export async function getNoteEmbeddingContext(noteId: string): Promise<NoteEmbed
|
|||||||
attributes,
|
attributes,
|
||||||
parentTitles,
|
parentTitles,
|
||||||
childTitles,
|
childTitles,
|
||||||
attachments
|
attachments,
|
||||||
|
backlinks,
|
||||||
|
relatedNotes,
|
||||||
|
labelValues,
|
||||||
|
templateTitles
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user