mirror of
https://github.com/TriliumNext/Notes.git
synced 2025-08-11 11:02:27 +08:00
do a better job of extracting context
This commit is contained in:
parent
c386e34c33
commit
f482b3b4c8
@ -102,6 +102,107 @@ export class ContextExtractor {
|
||||
// Format code notes with code blocks
|
||||
formattedContent += '```\n' + content + '\n```';
|
||||
break;
|
||||
case 'canvas':
|
||||
case 'mindMap':
|
||||
case 'relationMap':
|
||||
case 'geoMap':
|
||||
if (mime === 'application/json') {
|
||||
try {
|
||||
// Parse JSON content
|
||||
const jsonContent = JSON.parse(content);
|
||||
|
||||
if (type === 'canvas') {
|
||||
// Extract text elements from canvas
|
||||
if (jsonContent.elements && Array.isArray(jsonContent.elements)) {
|
||||
const texts = jsonContent.elements
|
||||
.filter((element: any) => element.type === 'text' && element.text)
|
||||
.map((element: any) => element.text);
|
||||
|
||||
formattedContent += 'Canvas content:\n' + texts.join('\n');
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (type === 'mindMap') {
|
||||
// Extract node text from mind map
|
||||
const extractMindMapNodes = (node: any): string[] => {
|
||||
let texts: string[] = [];
|
||||
if (node.text) {
|
||||
texts.push(node.text);
|
||||
}
|
||||
if (node.children && Array.isArray(node.children)) {
|
||||
for (const child of node.children) {
|
||||
texts = texts.concat(extractMindMapNodes(child));
|
||||
}
|
||||
}
|
||||
return texts;
|
||||
};
|
||||
|
||||
if (jsonContent.root) {
|
||||
formattedContent += 'Mind map content:\n' + extractMindMapNodes(jsonContent.root).join('\n');
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (type === 'relationMap') {
|
||||
// Extract relation map entities and connections
|
||||
let result = 'Relation map content:\n';
|
||||
|
||||
if (jsonContent.notes && Array.isArray(jsonContent.notes)) {
|
||||
result += 'Notes: ' + jsonContent.notes
|
||||
.map((note: any) => note.title || note.name)
|
||||
.filter(Boolean)
|
||||
.join(', ') + '\n';
|
||||
}
|
||||
|
||||
if (jsonContent.relations && Array.isArray(jsonContent.relations)) {
|
||||
result += 'Relations: ' + jsonContent.relations
|
||||
.map((rel: any) => {
|
||||
const sourceNote = jsonContent.notes.find((n: any) => n.noteId === rel.sourceNoteId);
|
||||
const targetNote = jsonContent.notes.find((n: any) => n.noteId === rel.targetNoteId);
|
||||
const source = sourceNote ? (sourceNote.title || sourceNote.name) : 'unknown';
|
||||
const target = targetNote ? (targetNote.title || targetNote.name) : 'unknown';
|
||||
return `${source} → ${rel.name || ''} → ${target}`;
|
||||
})
|
||||
.join('; ');
|
||||
}
|
||||
|
||||
formattedContent += result;
|
||||
break;
|
||||
}
|
||||
else if (type === 'geoMap') {
|
||||
let result = 'Geographic map content:\n';
|
||||
|
||||
if (jsonContent.markers && Array.isArray(jsonContent.markers)) {
|
||||
result += jsonContent.markers
|
||||
.map((marker: any) => {
|
||||
return `Location: ${marker.title || ''} (${marker.lat}, ${marker.lng})${marker.description ? ' - ' + marker.description : ''}`;
|
||||
})
|
||||
.join('\n');
|
||||
}
|
||||
|
||||
formattedContent += result || 'Empty geographic map';
|
||||
break;
|
||||
}
|
||||
}
|
||||
catch (e: any) {
|
||||
formattedContent += `[Error parsing ${type} content: ${e.message}]`;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// If JSON parsing or specific handling failed, use default handling
|
||||
formattedContent += `[${type} content]`;
|
||||
break;
|
||||
|
||||
case 'mermaid':
|
||||
// Format mermaid diagrams as code blocks
|
||||
formattedContent += '```mermaid\n' + content + '\n```';
|
||||
break;
|
||||
|
||||
case 'image':
|
||||
case 'file':
|
||||
formattedContent += `[${type} attachment]`;
|
||||
break;
|
||||
|
||||
default:
|
||||
// For other notes, just use the content as is
|
||||
formattedContent += this.sanitizeHtml(content);
|
||||
@ -114,7 +215,10 @@ export class ContextExtractor {
|
||||
/**
 * Sanitize HTML content to plain text.
 *
 * All tags and attributes are stripped with `sanitize-html`; afterwards a small,
 * fixed set of HTML entities that may remain in the stripped output is decoded.
 *
 * @param html - Markup to convert. Empty input yields an empty string.
 * @returns The text content with tags removed and common entities decoded.
 */
private sanitizeHtml(html: string): string {
    if (!html) return '';

    // Use sanitizeHtml to remove all HTML tags
    const stripped = sanitizeHtml(html, {
        allowedTags: [],
        allowedAttributes: {},
        textFilter: (text) => {
            // Collapse runs of blank lines into a single blank line
            return text.replace(/\n\s*\n/g, '\n\n');
        }
    });

    // Additional cleanup for any remaining HTML entities.
    // `&amp;` is decoded last: decoding it earlier would turn an escaped
    // entity such as `&amp;quot;` into `&quot;` and then into `"`, which is
    // not what the author of the note wrote.
    return stripped
        .replace(/&nbsp;/g, ' ')
        .replace(/&lt;/g, '<')
        .replace(/&gt;/g, '>')
        .replace(/&quot;/g, '"')
        .replace(/&#39;/g, "'")
        .replace(/&amp;/g, '&');
}
|
||||
|
||||
/**
|
||||
|
@ -8,6 +8,7 @@ import type { NoteEmbeddingContext } from "./embeddings_interface.js";
|
||||
import { getEmbeddingProviders, getEnabledEmbeddingProviders } from "./providers.js";
|
||||
import eventService from "../../events.js";
|
||||
import type BNote from "../../../becca/entities/bnote.js";
|
||||
import sanitizeHtml from "sanitize-html";
|
||||
|
||||
// Type definition for embedding result
|
||||
interface EmbeddingResult {
|
||||
@ -183,19 +184,37 @@ export async function findSimilarNotes(
|
||||
/**
 * Clean note content by removing HTML tags and normalizing whitespace.
 *
 * Steps, in order:
 *  1. If the content is an HTML text note, or contains a literal `<div>` / `<p>`
 *     tag, strip all markup with `sanitize-html`.
 *  2. Decode a small, fixed set of HTML entities (applied to every input).
 *  3. Collapse every run of whitespace to a single space and trim.
 *  4. Truncate to 10000 characters, appending ` [content truncated]`.
 *
 * @param content - Raw note content. Empty input yields an empty string.
 * @param type - Note type (e.g. `'text'`, `'code'`).
 * @param mime - Note MIME type (e.g. `'text/html'`).
 * @returns Single-line plain text suitable for embedding.
 */
function cleanNoteContent(content: string, type: string, mime: string): string {
    if (!content) return '';

    let cleaned = content;

    // If it's HTML content, remove HTML tags
    const looksLikeHtml =
        (type === 'text' && mime === 'text/html') ||
        cleaned.includes('<div>') ||
        cleaned.includes('<p>');

    if (looksLikeHtml) {
        // Use sanitizeHtml to remove all HTML tags. No regex pre-pass is done:
        // a pattern like /<[^>]*>/ would also swallow plain text such as
        // "a < b and c > d".
        cleaned = sanitizeHtml(cleaned, {
            allowedTags: [],
            allowedAttributes: {},
            textFilter: (text) => {
                // Normalize the text, removing excessive whitespace
                return text.replace(/\s+/g, ' ');
            }
        });
    }

    // Additional cleanup for any remaining HTML entities.
    // `&amp;` is decoded last so that an escaped entity such as `&amp;quot;`
    // ends up as the literal text `&quot;` instead of being decoded twice.
    cleaned = cleaned
        .replace(/&nbsp;/g, ' ')
        .replace(/&lt;/g, '<')
        .replace(/&gt;/g, '>')
        .replace(/&quot;/g, '"')
        .replace(/&#39;/g, "'")
        .replace(/&amp;/g, '&');

    // Normalize whitespace (replace multiple spaces/newlines with single space) and trim
    cleaned = cleaned.replace(/\s+/g, ' ').trim();

    // Truncate if extremely long
    const MAX_CONTENT_LENGTH = 10000;
    if (cleaned.length > MAX_CONTENT_LENGTH) {
        cleaned = cleaned.substring(0, MAX_CONTENT_LENGTH) + ' [content truncated]';
    }

    return cleaned;
}
|
||||
|
||||
/**
 * Extract human-readable text from the raw content of a structured note.
 *
 * Handling per note type:
 *  - `canvas`      – the `text` of every element whose `type` is `'text'`, one per line.
 *  - `mindMap`     – the `text` of every node reachable from `root` via `children`, one per line.
 *  - `relationMap` – a `Notes: …` line listing note labels (`title`, else `name`) followed by a
 *                    `Relations: source → name → target` list.
 *  - `geoMap`      – one `Location: title (lat, lng) - description` line per marker.
 *  - `mermaid`     – returned unchanged.
 *  - `file` / `image` – a `[type attachment]` placeholder.
 *  - anything else – returned unchanged.
 *
 * JSON-backed types are only parsed when `mime` is `'application/json'`. Whenever the
 * expected structure is missing, or parsing throws, the raw `content` is returned
 * unchanged so the caller still has something to work with.
 *
 * @param content - Raw note content. Empty input yields an empty string.
 * @param type - Note type.
 * @param mime - Note MIME type.
 * @returns Extracted text, or the raw content when nothing could be extracted.
 */
function extractStructuredContent(content: string, type: string, mime: string): string {
    if (!content) return '';

    type JsonObject = Record<string, unknown>;

    // Narrow an arbitrary parsed JSON value to a plain object (not null, not an array).
    const asObject = (value: unknown): JsonObject | undefined =>
        typeof value === 'object' && value !== null && !Array.isArray(value)
            ? (value as JsonObject)
            : undefined;

    // Display label of a relation-map note: `title`, falling back to `name`.
    const labelOf = (note: JsonObject | undefined): string => {
        const label = note?.title || note?.name;
        return label ? String(label) : '';
    };

    try {
        switch (type) {
            case 'mindMap':
            case 'relationMap':
            case 'canvas': {
                if (mime !== 'application/json') return content;

                const data = asObject(JSON.parse(content));
                if (!data) return content;

                if (type === 'canvas') {
                    // Extract text elements from canvas
                    const elements = data.elements;
                    if (!Array.isArray(elements)) return content;

                    const texts: string[] = [];
                    for (const raw of elements) {
                        const element = asObject(raw);
                        if (element && element.type === 'text' && element.text) {
                            texts.push(String(element.text));
                        }
                    }
                    return texts.join('\n');
                }

                if (type === 'mindMap') {
                    // Extract node text from mind map (depth-first, parent before children)
                    const root = asObject(data.root);
                    if (!root) return content;

                    const texts: string[] = [];
                    const visit = (node: JsonObject): void => {
                        if (node.text) {
                            texts.push(String(node.text));
                        }
                        const children = node.children;
                        if (Array.isArray(children)) {
                            for (const child of children) {
                                const childNode = asObject(child);
                                if (childNode) visit(childNode);
                            }
                        }
                    };
                    visit(root);
                    return texts.join('\n');
                }

                // relationMap: extract entities and connections
                const rawNotes = data.notes;
                const rawRelations = data.relations;
                const notes: (JsonObject | undefined)[] = Array.isArray(rawNotes)
                    ? rawNotes.map((note: unknown) => asObject(note))
                    : [];

                let result = '';

                if (Array.isArray(rawNotes)) {
                    result += 'Notes: ' + notes.map((note) => labelOf(note)).filter(Boolean).join(', ') + '\n';
                }

                if (Array.isArray(rawRelations)) {
                    // Index labels by noteId once instead of scanning `notes` per relation.
                    // The first note wins on duplicate ids, matching Array.prototype.find.
                    const labelsById = new Map<unknown, string>();
                    for (const note of notes) {
                        if (note && note.noteId != null && !labelsById.has(note.noteId)) {
                            labelsById.set(note.noteId, labelOf(note) || 'unknown');
                        }
                    }

                    result += 'Relations: ' + rawRelations
                        .map((rel: unknown) => {
                            const relation = asObject(rel);
                            const source = labelsById.get(relation?.sourceNoteId) ?? 'unknown';
                            const target = labelsById.get(relation?.targetNoteId) ?? 'unknown';
                            return `${source} → ${relation?.name || ''} → ${target}`;
                        })
                        .join('; ');
                }

                return result;
            }

            case 'mermaid':
                // Return mermaid diagrams as-is (they're human-readable)
                return content;

            case 'geoMap': {
                if (mime !== 'application/json') return content;

                const data = asObject(JSON.parse(content));
                const markers = data?.markers;
                if (!Array.isArray(markers)) return content;

                const lines: string[] = [];
                for (const raw of markers) {
                    const marker = asObject(raw);
                    if (!marker) continue;
                    const suffix = marker.description ? ` - ${marker.description}` : '';
                    lines.push(`Location: ${marker.title || ''} (${marker.lat}, ${marker.lng})${suffix}`);
                }

                // Fall back to the raw JSON text when the map has no usable markers.
                return lines.join('\n') || content;
            }

            case 'file':
            case 'image':
                // For files and images, just return a placeholder
                return `[${type} attachment]`;

            default:
                return content;
        }
    } catch (error) {
        // Malformed JSON or an unexpected shape: keep the raw content rather than failing.
        console.error(`Error extracting content from ${type} note:`, error);
        return content;
    }
}
|
||||
|
||||
/**
|
||||
* Gets context for a note to be embedded
|
||||
*/
|
||||
@ -282,12 +408,23 @@ export async function getNoteEmbeddingContext(noteId: string): Promise<NoteEmbed
|
||||
|
||||
// Get content
|
||||
let content = "";
|
||||
if (note.type === 'text') {
|
||||
content = String(await note.getContent());
|
||||
} else if (note.type === 'code') {
|
||||
content = String(await note.getContent());
|
||||
} else if (note.type === 'image' || note.type === 'file') {
|
||||
content = `[${note.type} attachment: ${note.mime}]`;
|
||||
|
||||
try {
|
||||
// Get raw content from the note
|
||||
const rawContent = String(await note.getContent() || "");
|
||||
|
||||
// Process the content based on note type to extract meaningful text
|
||||
if (note.type === 'text' || note.type === 'code') {
|
||||
content = rawContent;
|
||||
} else if (['canvas', 'mindMap', 'relationMap', 'mermaid', 'geoMap'].includes(note.type)) {
|
||||
// Process structured content types
|
||||
content = extractStructuredContent(rawContent, note.type, note.mime);
|
||||
} else if (note.type === 'image' || note.type === 'file') {
|
||||
content = `[${note.type} attachment: ${note.mime}]`;
|
||||
}
|
||||
} catch (err) {
|
||||
console.error(`Error getting content for note ${noteId}:`, err);
|
||||
content = `[Error extracting content]`;
|
||||
}
|
||||
|
||||
// Clean the content to remove HTML tags and normalize whitespace
|
||||
|
@ -5,6 +5,7 @@ import options from "../options.js";
|
||||
import log from "../log.js";
|
||||
import type { Message } from "./ai_interface.js";
|
||||
import { cosineSimilarity } from "./embeddings/vector_store.js";
|
||||
import sanitizeHtml from "sanitize-html";
|
||||
|
||||
/**
|
||||
* TriliumContextService provides intelligent context management for working with large knowledge bases
|
||||
@ -351,15 +352,16 @@ Example: ["exact topic mentioned", "related concept 1", "related concept 2"]`;
|
||||
context += `--- NOTE ${index + 1}: ${source.title} ---\n`;
|
||||
|
||||
if (source.content) {
|
||||
// Clean up HTML content before adding it to the context
|
||||
let cleanContent = this.sanitizeNoteContent(source.content, source.type, source.mime);
|
||||
|
||||
// Truncate content if it's too long
|
||||
const maxContentLength = 1000;
|
||||
let content = source.content;
|
||||
|
||||
if (content.length > maxContentLength) {
|
||||
content = content.substring(0, maxContentLength) + " [content truncated due to length]";
|
||||
if (cleanContent.length > maxContentLength) {
|
||||
cleanContent = cleanContent.substring(0, maxContentLength) + " [content truncated due to length]";
|
||||
}
|
||||
|
||||
context += `${content}\n`;
|
||||
context += `${cleanContent}\n`;
|
||||
} else {
|
||||
context += "[This note doesn't contain textual content]\n";
|
||||
}
|
||||
@ -373,6 +375,45 @@ Example: ["exact topic mentioned", "related concept 1", "related concept 2"]`;
|
||||
return context;
|
||||
}
|
||||
|
||||
/**
 * Sanitize note content for use in context, removing HTML tags.
 *
 * Content is treated as HTML when the note is a `text` note with MIME type
 * `text/html`, or when it contains `<div`, `<p>` or `<span`. In that case all
 * markup is stripped with `sanitize-html` and a small, fixed set of HTML
 * entities is decoded. In every case, runs of whitespace are collapsed to a
 * single space and the result is trimmed.
 *
 * @param content - Raw note content. Empty input yields an empty string.
 * @param type - Optional note type.
 * @param mime - Optional note MIME type.
 * @returns Single-line plain text.
 */
private sanitizeNoteContent(content: string, type?: string, mime?: string): string {
    if (!content) return '';

    let cleaned = content;

    // If it's likely HTML content
    const looksLikeHtml =
        (type === 'text' && mime === 'text/html') ||
        cleaned.includes('<div') ||
        cleaned.includes('<p>') ||
        cleaned.includes('<span');

    if (looksLikeHtml) {
        // Use sanitizeHtml to remove all HTML tags
        cleaned = sanitizeHtml(cleaned, {
            allowedTags: [],
            allowedAttributes: {},
            textFilter: (text) => {
                // Collapse runs of blank lines into a single blank line
                return text.replace(/\n\s*\n/g, '\n\n');
            }
        });

        // Additional cleanup for remaining HTML entities.
        // `&amp;` is decoded last so that an escaped entity such as
        // `&amp;quot;` becomes the literal text `&quot;` rather than `"`.
        cleaned = cleaned
            .replace(/&nbsp;/g, ' ')
            .replace(/&lt;/g, '<')
            .replace(/&gt;/g, '>')
            .replace(/&quot;/g, '"')
            .replace(/&#39;/g, "'")
            .replace(/&amp;/g, '&');
    }

    // Normalize whitespace
    return cleaned.replace(/\s+/g, ' ').trim();
}
|
||||
|
||||
/**
|
||||
* Process a user query with the Trilium-specific approach:
|
||||
* 1. Generate search queries from the original question
|
||||
|
Loading…
x
Reference in New Issue
Block a user