do a better job of extracting context

This commit is contained in:
perf3ct 2025-03-10 18:53:36 +00:00
parent c386e34c33
commit f482b3b4c8
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
3 changed files with 308 additions and 15 deletions

View File

@ -102,6 +102,107 @@ export class ContextExtractor {
// Format code notes with code blocks
formattedContent += '```\n' + content + '\n```';
break;
case 'canvas':
case 'mindMap':
case 'relationMap':
case 'geoMap':
if (mime === 'application/json') {
try {
// Parse JSON content
const jsonContent = JSON.parse(content);
if (type === 'canvas') {
// Extract text elements from canvas
if (jsonContent.elements && Array.isArray(jsonContent.elements)) {
const texts = jsonContent.elements
.filter((element: any) => element.type === 'text' && element.text)
.map((element: any) => element.text);
formattedContent += 'Canvas content:\n' + texts.join('\n');
break;
}
}
else if (type === 'mindMap') {
// Extract node text from mind map
const extractMindMapNodes = (node: any): string[] => {
let texts: string[] = [];
if (node.text) {
texts.push(node.text);
}
if (node.children && Array.isArray(node.children)) {
for (const child of node.children) {
texts = texts.concat(extractMindMapNodes(child));
}
}
return texts;
};
if (jsonContent.root) {
formattedContent += 'Mind map content:\n' + extractMindMapNodes(jsonContent.root).join('\n');
break;
}
}
else if (type === 'relationMap') {
// Extract relation map entities and connections
let result = 'Relation map content:\n';
if (jsonContent.notes && Array.isArray(jsonContent.notes)) {
result += 'Notes: ' + jsonContent.notes
.map((note: any) => note.title || note.name)
.filter(Boolean)
.join(', ') + '\n';
}
if (jsonContent.relations && Array.isArray(jsonContent.relations)) {
result += 'Relations: ' + jsonContent.relations
.map((rel: any) => {
const sourceNote = jsonContent.notes.find((n: any) => n.noteId === rel.sourceNoteId);
const targetNote = jsonContent.notes.find((n: any) => n.noteId === rel.targetNoteId);
const source = sourceNote ? (sourceNote.title || sourceNote.name) : 'unknown';
const target = targetNote ? (targetNote.title || targetNote.name) : 'unknown';
return `${source}${rel.name || ''}${target}`;
})
.join('; ');
}
formattedContent += result;
break;
}
else if (type === 'geoMap') {
let result = 'Geographic map content:\n';
if (jsonContent.markers && Array.isArray(jsonContent.markers)) {
result += jsonContent.markers
.map((marker: any) => {
return `Location: ${marker.title || ''} (${marker.lat}, ${marker.lng})${marker.description ? ' - ' + marker.description : ''}`;
})
.join('\n');
}
formattedContent += result || 'Empty geographic map';
break;
}
}
catch (e: any) {
formattedContent += `[Error parsing ${type} content: ${e.message}]`;
break;
}
}
// If JSON parsing or specific handling failed, use default handling
formattedContent += `[${type} content]`;
break;
case 'mermaid':
// Format mermaid diagrams as code blocks
formattedContent += '```mermaid\n' + content + '\n```';
break;
case 'image':
case 'file':
formattedContent += `[${type} attachment]`;
break;
default:
// For other notes, just use the content as is
formattedContent += this.sanitizeHtml(content);
@ -114,7 +215,10 @@ export class ContextExtractor {
* Sanitize HTML content to plain text
*/
private sanitizeHtml(html: string): string {
return sanitizeHtml(html, {
if (!html) return '';
// Use sanitizeHtml to remove all HTML tags
let content = sanitizeHtml(html, {
allowedTags: [],
allowedAttributes: {},
textFilter: (text) => {
@ -122,6 +226,17 @@ export class ContextExtractor {
return text.replace(/\n\s*\n/g, '\n\n');
}
});
// Additional cleanup for any remaining HTML entities
content = content
.replace(/ /g, ' ')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'");
return content;
}
/**

View File

@ -8,6 +8,7 @@ import type { NoteEmbeddingContext } from "./embeddings_interface.js";
import { getEmbeddingProviders, getEnabledEmbeddingProviders } from "./providers.js";
import eventService from "../../events.js";
import type BNote from "../../../becca/entities/bnote.js";
import sanitizeHtml from "sanitize-html";
// Type definition for embedding result
interface EmbeddingResult {
@ -183,19 +184,37 @@ export async function findSimilarNotes(
* Clean note content by removing HTML tags and normalizing whitespace
*/
function cleanNoteContent(content: string, type: string, mime: string): string {
if (!content) return '';
// If it's HTML content, remove HTML tags
if ((type === 'text' && mime === 'text/html') || content.includes('<div>') || content.includes('<p>')) {
// Simple tag removal - for more complex HTML parsing, consider using a proper HTML parser
content = content.replace(/<[^>]*>/g, ' '); // Replace tags with a space
// Use sanitizeHtml to remove all HTML tags
content = sanitizeHtml(content, {
allowedTags: [],
allowedAttributes: {},
textFilter: (text) => {
// Normalize the text, removing excessive whitespace
return text.replace(/\s+/g, ' ');
}
});
}
// Additional cleanup for any remaining HTML entities
content = content
.replace(/&nbsp;/g, ' ')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'");
// Normalize whitespace (replace multiple spaces/newlines with single space)
content = content.replace(/\s+/g, ' ');
// Trim the content
content = content.trim();
// Truncate if extremely long (optional, adjust limit as needed)
// Truncate if extremely long
const MAX_CONTENT_LENGTH = 10000;
if (content.length > MAX_CONTENT_LENGTH) {
content = content.substring(0, MAX_CONTENT_LENGTH) + ' [content truncated]';
@ -204,6 +223,113 @@ function cleanNoteContent(content: string, type: string, mime: string): string {
return content;
}
/** Loose shape of a parsed note JSON object; every field must be checked before use. */
type StructuredNoteJson = Record<string, unknown>;

/** Narrow an arbitrary parsed JSON value to a non-null object. */
function isStructuredNoteJson(value: unknown): value is StructuredNoteJson {
    return typeof value === 'object' && value !== null;
}

/** Return the object entries of `value`; non-arrays yield [] and null/primitive entries are dropped. */
function toStructuredObjectList(value: unknown): StructuredNoteJson[] {
    return Array.isArray(value) ? value.filter(isStructuredNoteJson) : [];
}

/** Collect the `text` of a mind map node and all of its descendants, in pre-order. */
function collectMindMapTexts(node: unknown, collected: string[] = []): string[] {
    if (!isStructuredNoteJson(node)) {
        return collected;
    }
    if (node.text) {
        collected.push(String(node.text));
    }
    if (Array.isArray(node.children)) {
        for (const child of node.children) {
            // Append into the shared accumulator instead of concatenating arrays at every level
            collectMindMapTexts(child, collected);
        }
    }
    return collected;
}

/** Human-readable label of a relation map note: its `title`, else its `name`, else ''. */
function relationMapNoteLabel(note: StructuredNoteJson): string {
    const label = note.title || note.name;
    return label ? String(label) : '';
}

/** Render the "Notes: ..." and "Relations: ..." lines of a relation map. */
function describeRelationMap(json: StructuredNoteJson): string {
    const notes = toStructuredObjectList(json.notes);
    let result = '';

    if (Array.isArray(json.notes)) {
        result += 'Notes: ' + notes.map(relationMapNoteLabel).filter(Boolean).join(', ') + '\n';
    }

    if (Array.isArray(json.relations)) {
        // Index labels by noteId once; the first note wins on duplicate ids.
        // This also keeps relations usable when the map has no `notes` array at all.
        const labelById = new Map<unknown, string>();
        for (const note of notes) {
            if (!labelById.has(note.noteId)) {
                labelById.set(note.noteId, relationMapNoteLabel(note));
            }
        }

        result += 'Relations: ' + toStructuredObjectList(json.relations)
            .map((rel) => {
                const source = labelById.get(rel.sourceNoteId) || 'unknown';
                const target = labelById.get(rel.targetNoteId) || 'unknown';
                const relationName = rel.name ? String(rel.name) : '';
                // Always separate the parts so adjacent names do not run together
                return relationName
                    ? `${source} -> ${relationName} -> ${target}`
                    : `${source} -> ${target}`;
            })
            .join('; ');
    }

    return result;
}

/** Render one "Location: ..." line per geo map marker. */
function describeGeoMapMarkers(json: StructuredNoteJson): string {
    return toStructuredObjectList(json.markers)
        .map((marker) => {
            const title = marker.title ? String(marker.title) : '';
            const description = marker.description ? ' - ' + String(marker.description) : '';
            return `Location: ${title} (${String(marker.lat)}, ${String(marker.lng)})${description}`;
        })
        .join('\n');
}

/**
 * Extract human-readable text from the raw content of a structured note.
 *
 * - `canvas`: the `text` of every element whose `type` is `'text'`, one per line.
 * - `mindMap`: the `text` of every node under `root`, pre-order, one per line.
 * - `relationMap`: a "Notes: ..." line followed by a "Relations: ..." line.
 * - `geoMap`: one "Location: title (lat, lng) - description" line per marker.
 * - `mermaid`: returned unchanged.
 * - `file` / `image`: a `[type attachment]` placeholder.
 * - any other type: returned unchanged.
 *
 * JSON-based types are only parsed when `mime` is `application/json`. Whenever
 * nothing can be extracted (wrong mime, unexpected JSON shape, parse error) the
 * raw `content` is returned as-is; it is already a string, so it is not
 * re-serialized.
 *
 * @param content raw note content
 * @param type note type (e.g. 'canvas', 'mindMap', 'relationMap', 'geoMap')
 * @param mime note MIME type
 * @returns extracted text, or '' when `content` is empty
 */
function extractStructuredContent(content: string, type: string, mime: string): string {
    if (!content) return '';

    try {
        switch (type) {
            case 'canvas':
            case 'mindMap':
            case 'relationMap':
            case 'geoMap': {
                if (mime !== 'application/json') {
                    return content;
                }

                const parsed: unknown = JSON.parse(content);
                if (!isStructuredNoteJson(parsed)) {
                    return content;
                }

                if (type === 'canvas') {
                    if (!Array.isArray(parsed.elements)) {
                        return content;
                    }
                    return toStructuredObjectList(parsed.elements)
                        .filter((element) => element.type === 'text' && Boolean(element.text))
                        .map((element) => String(element.text))
                        .join('\n');
                }

                if (type === 'mindMap') {
                    return parsed.root ? collectMindMapTexts(parsed.root).join('\n') : content;
                }

                if (type === 'relationMap') {
                    return describeRelationMap(parsed);
                }

                // geoMap: fall back to the raw content when there are no markers to describe
                return describeGeoMapMarkers(parsed) || content;
            }

            case 'mermaid':
                // Mermaid diagrams are already human-readable
                return content;

            case 'file':
            case 'image':
                // Binary payloads carry no useful text, so emit a placeholder
                return `[${type} attachment]`;

            default:
                return content;
        }
    } catch (error) {
        // Best effort: malformed JSON must not prevent the note from being processed
        console.error(`Error extracting content from ${type} note:`, error);
        return content;
    }
}
/**
* Gets context for a note to be embedded
*/
@ -282,12 +408,23 @@ export async function getNoteEmbeddingContext(noteId: string): Promise<NoteEmbed
// Get content
let content = "";
if (note.type === 'text') {
content = String(await note.getContent());
} else if (note.type === 'code') {
content = String(await note.getContent());
} else if (note.type === 'image' || note.type === 'file') {
content = `[${note.type} attachment: ${note.mime}]`;
try {
// Get raw content from the note
const rawContent = String(await note.getContent() || "");
// Process the content based on note type to extract meaningful text
if (note.type === 'text' || note.type === 'code') {
content = rawContent;
} else if (['canvas', 'mindMap', 'relationMap', 'mermaid', 'geoMap'].includes(note.type)) {
// Process structured content types
content = extractStructuredContent(rawContent, note.type, note.mime);
} else if (note.type === 'image' || note.type === 'file') {
content = `[${note.type} attachment: ${note.mime}]`;
}
} catch (err) {
console.error(`Error getting content for note ${noteId}:`, err);
content = `[Error extracting content]`;
}
// Clean the content to remove HTML tags and normalize whitespace

View File

@ -5,6 +5,7 @@ import options from "../options.js";
import log from "../log.js";
import type { Message } from "./ai_interface.js";
import { cosineSimilarity } from "./embeddings/vector_store.js";
import sanitizeHtml from "sanitize-html";
/**
* TriliumContextService provides intelligent context management for working with large knowledge bases
@ -351,15 +352,16 @@ Example: ["exact topic mentioned", "related concept 1", "related concept 2"]`;
context += `--- NOTE ${index + 1}: ${source.title} ---\n`;
if (source.content) {
// Clean up HTML content before adding it to the context
let cleanContent = this.sanitizeNoteContent(source.content, source.type, source.mime);
// Truncate content if it's too long
const maxContentLength = 1000;
let content = source.content;
if (content.length > maxContentLength) {
content = content.substring(0, maxContentLength) + " [content truncated due to length]";
if (cleanContent.length > maxContentLength) {
cleanContent = cleanContent.substring(0, maxContentLength) + " [content truncated due to length]";
}
context += `${content}\n`;
context += `${cleanContent}\n`;
} else {
context += "[This note doesn't contain textual content]\n";
}
@ -373,6 +375,45 @@ Example: ["exact topic mentioned", "related concept 1", "related concept 2"]`;
return context;
}
/**
 * Sanitize note content for use in context.
 *
 * When the content is declared as HTML (`type === 'text'` with mime
 * `text/html`) or contains a `<div`, `<p>` or `<span` marker, all tags are
 * stripped with `sanitizeHtml` and the common entities (`&nbsp;`, `&lt;`,
 * `&gt;`, `&amp;`, `&quot;`, `&#39;`) are decoded. In every case, runs of
 * whitespace are collapsed to a single space and the result is trimmed.
 *
 * @param content raw note content
 * @param type optional note type
 * @param mime optional note MIME type
 * @returns single-line plain text, or '' when `content` is empty
 */
private sanitizeNoteContent(content: string, type?: string, mime?: string): string {
    if (!content) return '';

    const looksLikeHtml =
        (type === 'text' && mime === 'text/html') ||
        content.includes('<div') ||
        content.includes('<p>') ||
        content.includes('<span');

    if (looksLikeHtml) {
        // Strip every tag and attribute. No textFilter is needed here because all
        // whitespace is collapsed to single spaces at the end of this method.
        content = sanitizeHtml(content, {
            allowedTags: [],
            allowedAttributes: {}
        });

        // Decode entities in a single pass. Sequential replaces that handle
        // `&amp;` before `&quot;` / `&#39;` would decode `&amp;quot;` twice and
        // yield `"` instead of the literal text `&quot;`.
        const entityMap: Record<string, string> = {
            '&nbsp;': ' ',
            '&lt;': '<',
            '&gt;': '>',
            '&amp;': '&',
            '&quot;': '"',
            '&#39;': "'"
        };
        content = content.replace(/&(?:nbsp|lt|gt|amp|quot|#39);/g, (entity) => {
            const decoded = entityMap[entity];
            return decoded === undefined ? entity : decoded;
        });
    }

    // Normalize whitespace
    return content.replace(/\s+/g, ' ').trim();
}
/**
* Process a user query with the Trilium-specific approach:
* 1. Generate search queries from the original question