break up the huge context_extractor into smaller files

This commit is contained in:
perf3ct 2025-03-11 18:39:59 +00:00
parent 0985cec8d6
commit 71b3b04c53
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
12 changed files with 2212 additions and 882 deletions

View File

@ -4,7 +4,7 @@ import { OpenAIService } from './providers/openai_service.js';
import { AnthropicService } from './providers/anthropic_service.js';
import { OllamaService } from './providers/ollama_service.js';
import log from '../log.js';
import contextExtractor from './context_extractor.js';
import { ContextExtractor } from './context/index.js';
import semanticContextService from './semantic_context_service.js';
type ServiceProviders = 'openai' | 'anthropic' | 'ollama';
@ -216,3 +216,6 @@ export default {
return getInstance().getSemanticContextService();
}
};
// Create an instance of ContextExtractor for backward compatibility
const contextExtractor = new ContextExtractor();

View File

@ -1,7 +1,10 @@
import type { Message, ChatCompletionOptions } from './ai_interface.js';
import aiServiceManager from './ai_service_manager.js';
import chatStorageService from './chat_storage_service.js';
import contextExtractor from './context_extractor.js';
import { ContextExtractor } from './context/index.js';
// Create an instance of ContextExtractor for backward compatibility
const contextExtractor = new ContextExtractor();
export interface ChatSession {
id: string;

View File

@ -0,0 +1,288 @@
/**
 * Contains functions for chunking content into smaller pieces for processing
 * These functions are used to properly prepare content for LLM context windows
 */

/**
 * Interface for chunked content
 */
export interface ContentChunk {
    content: string;
    prefix: string;
    noteId?: string;
    title?: string;
    path?: string;
    metadata?: Record<string, any>;
}

/**
 * Options for the chunking process
 */
export interface ChunkOptions {
    /**
     * Maximum size of each chunk in characters
     * Defaults to LLM context window size (typically around 2048)
     */
    maxChunkSize?: number;

    /**
     * How much chunks should overlap to maintain context
     */
    overlapSize?: number;

    /**
     * Whether to respect sentence and paragraph boundaries
     */
    respectBoundaries?: boolean;

    /**
     * Whether to add metadata to chunks
     */
    includeMetadata?: boolean;

    /**
     * Additional information to include in chunk metadata
     */
    metadata?: Record<string, any>;
}

/**
 * Default options for chunking
 */
const DEFAULT_CHUNK_OPTIONS: Required<ChunkOptions> = {
    maxChunkSize: 1500,     // Characters per chunk
    overlapSize: 100,       // Overlap between chunks
    respectBoundaries: true,
    includeMetadata: true,
    metadata: {}
};

/**
 * Chunk content into smaller pieces
 * Used for processing large documents and preparing them for LLMs
 *
 * @param content - raw text to split
 * @param title - used as the chunk prefix; continuation chunks get "(continued)"
 * @param noteId - note id copied onto every chunk
 * @param options - merged over DEFAULT_CHUNK_OPTIONS
 * @returns one or more chunks, each no larger than maxChunkSize characters
 */
export function chunkContent(
    content: string,
    title: string = '',
    noteId: string = '',
    options: ChunkOptions = {}
): ContentChunk[] {
    const config: Required<ChunkOptions> = { ...DEFAULT_CHUNK_OPTIONS, ...options };

    // Small content fits into a single chunk
    if (content.length <= config.maxChunkSize) {
        return [{
            content,
            prefix: title,
            noteId,
            title,
            metadata: config.metadata
        }];
    }

    return config.respectBoundaries
        ? chunkByParagraphs(content, title, noteId, config)
        : chunkByCharacters(content, title, noteId, config);
}

/**
 * Simple chunking by character count with optional overlap.
 *
 * BUG FIX: the previous implementation recomputed the next position as
 * (chunkEnd - overlapSize) even after the final chunk was emitted; since that
 * position never advanced, the loop pushed the tail chunk forever. We now
 * break as soon as a chunk reaches the end of the content, and always ensure
 * forward progress even when overlapSize >= maxChunkSize.
 */
function chunkByCharacters(
    content: string,
    title: string,
    noteId: string,
    config: Required<ChunkOptions>
): ContentChunk[] {
    const chunks: ContentChunk[] = [];
    let position = 0;

    while (position < content.length) {
        const end = Math.min(position + config.maxChunkSize, content.length);
        chunks.push({
            content: content.substring(position, end),
            prefix: position === 0 ? title : `${title} (continued)`,
            noteId,
            title,
            metadata: config.metadata
        });

        // Done once the chunk reaches the end of the content
        if (end >= content.length) {
            break;
        }

        // Step back by the overlap, but never fail to make progress
        const next = end - config.overlapSize;
        position = next > position ? next : end;
    }

    return chunks;
}

/**
 * Paragraph-aware chunking: accumulates paragraphs until the window is full,
 * seeding each new chunk with trailing overlap from the previous one.
 */
function chunkByParagraphs(
    content: string,
    title: string,
    noteId: string,
    config: Required<ChunkOptions>
): ContentChunk[] {
    const chunks: ContentChunk[] = [];
    let current = '';
    let prefix = title;

    // Push the accumulated chunk (if any) using the current prefix
    const flushCurrent = () => {
        if (current.length > 0) {
            chunks.push({
                content: current,
                prefix,
                noteId,
                title,
                metadata: config.metadata
            });
        }
    };

    for (const paragraph of content.split(/\n\s*\n/)) {
        if (paragraph.length > config.maxChunkSize) {
            // A single paragraph larger than the window cannot be kept whole:
            // flush what we have and split the paragraph by characters so no
            // chunk ever exceeds maxChunkSize (the old code emitted it whole).
            flushCurrent();
            for (const piece of chunkByCharacters(paragraph, title, noteId, config)) {
                if (chunks.length > 0) {
                    piece.prefix = `${title} (continued)`;
                }
                chunks.push(piece);
            }
            current = '';
            prefix = `${title} (continued)`;
        } else if (current.length + paragraph.length > config.maxChunkSize) {
            // Adding this paragraph would overflow: emit the current chunk and
            // start a new one, seeded with trailing overlap to keep context.
            flushCurrent();
            const seed = config.overlapSize > 0 ? current.slice(-config.overlapSize) : '';
            current = seed + paragraph;
            prefix = `${title} (continued)`;
        } else {
            // Paragraph fits: append to the running chunk
            current = current.length > 0 ? `${current}\n\n${paragraph}` : paragraph;
        }
    }

    flushCurrent();
    return chunks;
}

/**
 * Smarter chunking that tries to respect semantic boundaries like headers and sections
 * Splits at Markdown (#) or HTML (<h1>-<h6>) headers; falls back to
 * chunkContent() when no headers are present.
 */
export function semanticChunking(
    content: string,
    title: string = '',
    noteId: string = '',
    options: ChunkOptions = {}
): ContentChunk[] {
    const config: Required<ChunkOptions> = { ...DEFAULT_CHUNK_OPTIONS, ...options };

    // If content is small enough, return as a single chunk
    if (content.length <= config.maxChunkSize) {
        return [{
            content,
            prefix: title,
            noteId,
            title,
            metadata: config.metadata
        }];
    }

    // Find all headers and split content into sections, each section starting
    // at a header and running up to the next one
    const headerPattern = /#{1,6}\s+.+|<h[1-6][^>]*>.*?<\/h[1-6]>/g;
    const sections: string[] = [];
    let lastIndex = 0;
    let match: RegExpExecArray | null;

    while ((match = headerPattern.exec(content)) !== null) {
        if (match.index > lastIndex) {
            sections.push(content.substring(lastIndex, match.index));
        }
        lastIndex = match.index;
    }
    if (lastIndex < content.length) {
        sections.push(content.substring(lastIndex));
    }

    // If no headers were found, fall back to regular chunking
    if (sections.length <= 1) {
        return chunkContent(content, title, noteId, options);
    }

    const chunks: ContentChunk[] = [];
    let currentChunk = '';
    let currentPrefix = title;

    const pushCurrent = () => {
        if (currentChunk.length > 0) {
            chunks.push({
                content: currentChunk,
                prefix: currentPrefix,
                noteId,
                title,
                metadata: config.metadata
            });
        }
    };

    for (const section of sections) {
        if (currentChunk.length + section.length > config.maxChunkSize) {
            if (section.length > config.maxChunkSize) {
                // This single section is too big: flush and chunk it separately
                pushCurrent();
                chunks.push(...chunkContent(section, title, noteId, options));
                currentChunk = '';
            } else {
                // Emit the accumulated chunk and start fresh with this section
                pushCurrent();
                currentChunk = section;
            }
            currentPrefix = `${title} (continued)`;
        } else {
            // Section fits: append to the running chunk
            if (currentChunk.length > 0 && !currentChunk.endsWith('\n')) {
                currentChunk += '\n\n';
            }
            currentChunk += section;
        }
    }

    pushCurrent();
    return chunks;
}

View File

@ -0,0 +1,433 @@
/**
* Helper functions for processing code notes, including language detection and structure extraction
*/
/**
 * Attempt to detect the programming language from code content or note attributes
 *
 * Resolution order: an explicit MIME type wins; otherwise shebang/keyword
 * heuristics over the first ten lines are tried; 'text' is the fallback.
 */
export function detectLanguage(content: string, mime: string): string {
    if (mime) {
        const mimeLower = mime.toLowerCase();
        // Map of mime types to language names
        const mimeMap: {[key: string]: string} = {
            'text/javascript': 'javascript',
            'application/javascript': 'javascript',
            'text/typescript': 'typescript',
            'application/typescript': 'typescript',
            'text/x-python': 'python',
            'text/x-java': 'java',
            'text/x-c': 'c',
            'text/x-c++': 'cpp',
            'text/x-csharp': 'csharp',
            'text/x-go': 'go',
            'text/x-ruby': 'ruby',
            'text/x-php': 'php',
            'text/x-rust': 'rust',
            'text/x-swift': 'swift',
            'text/x-kotlin': 'kotlin',
            'text/x-scala': 'scala',
            'text/x-perl': 'perl',
            'text/x-lua': 'lua',
            'text/x-r': 'r',
            'text/x-dart': 'dart',
            'text/html': 'html',
            'text/css': 'css',
            'application/json': 'json',
            'application/xml': 'xml',
            'text/markdown': 'markdown',
            'text/yaml': 'yaml',
            'text/x-sql': 'sql'
        };
        const fromMime = mimeMap[mimeLower];
        if (fromMime) {
            return fromMime;
        }
    }

    // Keyword/shebang heuristics over the first few lines of the content
    const head = content.split('\n').slice(0, 10).join('\n');
    const has = (needle: string) => head.includes(needle);

    if (has('<?php')) return 'php';
    if (has('#!/usr/bin/python') || (has('import ') && has('def '))) return 'python';
    if (has('#!/bin/bash') || has('#!/usr/bin/bash')) return 'bash';
    if (has('#!/usr/bin/perl')) return 'perl';
    if (has('#!/usr/bin/ruby')) return 'ruby';
    if (has('package ') && has('import ') && has('public class ')) return 'java';
    if (has('using System;') && has('namespace ')) return 'csharp';
    if (has('package main') && has('import (') && has('func ')) return 'go';
    if (has('#include <') && (has('int main(') || has('void main('))) {
        return has('std::') ? 'cpp' : 'c';
    }
    if (has('fn main()') && has('let ') && has('impl ')) return 'rust';
    if (has('<!DOCTYPE html>') || has('<html>')) return 'html';
    if (has('function ') && has('var ') && has('const ')) return 'javascript';
    if (has('interface ') && has('export class ')) return 'typescript';
    if (has('@Component') || has('import { Component }')) return 'typescript';

    // Default to 'text' if language can't be determined
    return 'text';
}
/**
 * Extract structure from code to create a summary
 *
 * Dispatches to a language-specific extractor when one exists; otherwise
 * produces a generic line-count summary with the first few code lines.
 */
export function extractCodeStructure(content: string, language: string): string {
    // Avoid processing very large code files
    if (content.length > 100000) {
        return "Code content too large for structure extraction";
    }

    try {
        switch (language.toLowerCase()) {
            case 'javascript':
            case 'typescript':
                return extractJsStructure(content);
            case 'python':
                return extractPythonStructure(content);
            case 'java':
            case 'csharp':
            case 'cpp':
                return extractClassBasedStructure(content);
            case 'go':
                return extractGoStructure(content);
            case 'rust':
                return extractRustStructure(content);
            case 'html':
                return extractHtmlStructure(content);
            default:
                return summarizeUnknownCode(content);
        }
    } catch (e: any) {
        return `Error extracting code structure: ${e.message}`;
    }
}

/**
 * Generic fallback: line count plus the first few non-comment, non-empty lines.
 */
function summarizeUnknownCode(content: string): string {
    const lines = content.split('\n');
    let structure = `Code file with ${lines.length} lines.\n`;

    const firstCodeLines = lines.filter(line =>
        line.trim() !== '' &&
        !line.trim().startsWith('//') &&
        !line.trim().startsWith('#') &&
        !line.trim().startsWith('*') &&
        !line.trim().startsWith('<!--')
    ).slice(0, 5);

    if (firstCodeLines.length > 0) {
        structure += "First few code lines:\n" + firstCodeLines.join('\n');
    }
    return structure;
}
/**
 * Extract structure from JavaScript/TypeScript code
 * Summarizes imports (first 10), class declarations, and function
 * declarations (first 15 shown, rest counted).
 */
function extractJsStructure(content: string): string {
    const lines = content.split('\n');
    let out = "";

    // Import/require lines are kept with their original indentation
    const imports = lines
        .filter(line => line.trim().startsWith('import ') || line.includes('require('))
        .slice(0, 10);
    if (imports.length > 0) {
        out += "Imports:\n" + imports.join('\n') + '\n\n';
    }

    // Class declarations (trimmed)
    const classes = lines
        .map(line => line.trim())
        .filter(line => line.startsWith('class ') || line.includes(' class '));
    if (classes.length > 0) {
        out += "Classes:\n" + classes.join('\n') + '\n\n';
    }

    // Function declarations: plain functions plus const/let/var assignments
    const functions = lines
        .map(line => line.trim())
        .filter(line =>
            line.startsWith('function ') ||
            /^(const|let|var)\s+\w+\s*=\s*function/.test(line) ||
            /^(const|let|var)\s+\w+\s*=\s*\(/.test(line));
    if (functions.length > 0) {
        out += "Functions:\n" + functions.slice(0, 15).join('\n');
        if (functions.length > 15) {
            out += `\n... and ${functions.length - 15} more functions`;
        }
        out += '\n\n';
    }

    return out;
}
/**
 * Extract structure from Python code
 * Summarizes import statements (first 10), class declarations, and
 * def declarations (first 15 shown, rest counted).
 */
function extractPythonStructure(content: string): string {
    const lines = content.split('\n');
    let summary = "";

    // Import statements keep their original indentation
    const imports = lines.filter(line =>
        line.trim().startsWith('import ') ||
        line.trim().startsWith('from ')
    ).slice(0, 10);
    if (imports.length > 0) {
        summary += "Imports:\n" + imports.join('\n') + '\n\n';
    }

    // Class declarations (trimmed)
    const classes = lines.map(l => l.trim()).filter(l => l.startsWith('class '));
    if (classes.length > 0) {
        summary += "Classes:\n" + classes.join('\n') + '\n\n';
    }

    // Function/method declarations (trimmed)
    const functions = lines.map(l => l.trim()).filter(l => l.startsWith('def '));
    if (functions.length > 0) {
        summary += "Functions:\n" + functions.slice(0, 15).join('\n');
        if (functions.length > 15) {
            summary += `\n... and ${functions.length - 15} more functions`;
        }
        summary += '\n\n';
    }

    return summary;
}
/**
 * Extract structure from class-based languages like Java, C#, C++
 * Summarizes package/namespace/using lines (first 5), class-like
 * declarations, and method signatures (first 15 shown, rest counted).
 */
function extractClassBasedStructure(content: string): string {
    const lines = content.split('\n');
    let structure = "";

    // package / namespace / using statements (original indentation preserved)
    const packageLines = lines.filter(line => {
        const t = line.trim();
        return t.startsWith('package ') || t.startsWith('namespace ') || t.startsWith('using ');
    }).slice(0, 5);
    if (packageLines.length > 0) {
        structure += "Package/Imports:\n" + packageLines.join('\n') + '\n\n';
    }

    // Hoisted patterns: class-like declarations and method signatures
    const classPattern = /^(public|private|protected)?\s*(class|interface|enum)\s+\w+/;
    const methodPattern = /^(public|private|protected)?\s*(static)?\s*[\w<>[\]]+\s+\w+\s*\(/;

    const classes: string[] = [];
    const methods: string[] = [];
    for (const raw of lines) {
        const line = raw.trim();
        if (classPattern.test(line)) {
            classes.push(line);
        }
        if (methodPattern.test(line)) {
            methods.push(line);
        }
    }

    if (classes.length > 0) {
        structure += "Classes/Interfaces:\n" + classes.join('\n') + '\n\n';
    }
    if (methods.length > 0) {
        structure += "Methods:\n" + methods.slice(0, 15).join('\n');
        if (methods.length > 15) {
            structure += `\n... and ${methods.length - 15} more methods`;
        }
        structure += '\n\n';
    }

    return structure;
}
/**
 * Extract structure from Go code
 * Summarizes the package clause, the grouped import block, struct/interface
 * type declarations, and func declarations (first 15 shown, rest counted).
 */
function extractGoStructure(content: string): string {
    const lines = content.split('\n');
    const trimmed = lines.map(l => l.trim());
    let structure = "";

    // Package clause (only the first occurrence is reported)
    const packageLines = lines.filter(line => line.trim().startsWith('package ')).slice(0, 1);
    if (packageLines.length > 0) {
        structure += "Package:\n" + packageLines.join('\n') + '\n\n';
    }

    // Grouped import block: import ( ... )
    const importStart = trimmed.indexOf('import (');
    if (importStart !== -1) {
        const importEnd = trimmed.indexOf(')', importStart + 1);
        if (importEnd !== -1) {
            structure += "Imports:\n" + lines.slice(importStart, importEnd + 1).join('\n') + '\n\n';
        }
    }

    // struct / interface type declarations (trimmed)
    const types = trimmed.filter(line =>
        line.startsWith('type ') && (line.includes(' struct ') || line.includes(' interface ')));
    if (types.length > 0) {
        structure += "Types:\n" + types.join('\n') + '\n\n';
    }

    // Function declarations (trimmed)
    const functions = trimmed.filter(line => line.startsWith('func '));
    if (functions.length > 0) {
        structure += "Functions:\n" + functions.slice(0, 15).join('\n');
        if (functions.length > 15) {
            structure += `\n... and ${functions.length - 15} more functions`;
        }
        structure += '\n\n';
    }

    return structure;
}
/**
 * Extract structure from Rust code
 * Summarizes mod/use lines (first 10), struct/enum/trait declarations,
 * impl blocks, and fn declarations (first 15 shown, rest counted).
 */
function extractRustStructure(content: string): string {
    const lines = content.split('\n');
    let structure = "";

    // mod / use lines keep their original indentation
    const moduleLines = lines.filter(line => {
        const t = line.trim();
        return t.startsWith('mod ') || t.startsWith('use ');
    }).slice(0, 10);
    if (moduleLines.length > 0) {
        structure += "Modules/Imports:\n" + moduleLines.join('\n') + '\n\n';
    }

    // Collect trimmed declarations in a single pass
    const types: string[] = [];
    const functions: string[] = [];
    const impls: string[] = [];
    for (const raw of lines) {
        const line = raw.trim();
        if (line.startsWith('struct ') || line.startsWith('enum ') || line.startsWith('trait ')) {
            types.push(line);
        }
        if (line.startsWith('fn ')) {
            functions.push(line);
        }
        if (line.startsWith('impl ')) {
            impls.push(line);
        }
    }

    if (types.length > 0) {
        structure += "Types:\n" + types.join('\n') + '\n\n';
    }
    if (impls.length > 0) {
        structure += "Implementations:\n" + impls.join('\n') + '\n\n';
    }
    if (functions.length > 0) {
        structure += "Functions:\n" + functions.slice(0, 15).join('\n');
        if (functions.length > 15) {
            structure += `\n... and ${functions.length - 15} more functions`;
        }
        structure += '\n\n';
    }

    return structure;
}
/**
 * Extract structure from HTML
 *
 * Produces a short summary: the document title plus counts of common
 * structural elements. Counting is regex-based, so it is approximate
 * (nested or malformed markup may be over- or under-counted).
 */
function extractHtmlStructure(content: string): string {
    // Extract title (first <title> element, case-insensitive)
    const titleMatch = content.match(/<title>(.*?)<\/title>/i);
    const title = titleMatch ? titleMatch[1] : "No title";

    // Count main elements via regex approximations.
    // (Removed an unused `lines` local the original computed and discarded.)
    const headings = content.match(/<h[1-6].*?>.*?<\/h[1-6]>/gi) || [];
    const divs = content.match(/<div.*?>/gi) || [];
    const scripts = content.match(/<script.*?>.*?<\/script>/gis) || [];
    const links = content.match(/<a.*?>.*?<\/a>/gi) || [];
    const images = content.match(/<img.*?>/gi) || [];

    return `HTML Document: "${title}"
Document structure:
- Contains ${headings.length} headings
- Contains ${divs.length} div elements
- Contains ${scripts.length} script blocks
- Contains ${links.length} links
- Contains ${images.length} images
`;
}

View File

@ -0,0 +1,243 @@
import becca from '../../../becca/becca.js';
import { sanitizeHtmlContent } from './note_content.js';
/**
 * Get a list of parent notes for a given note
 *
 * Returns up to maxParents {id, title} pairs; an empty array when the note
 * does not exist, has no parents, or the lookup fails.
 */
export async function getParentNotes(noteId: string, maxParents: number = 5): Promise<{id: string, title: string}[]> {
    const note = becca.getNote(noteId);
    if (!note) {
        return [];
    }

    try {
        // Use Becca API to get parent branches and notes
        const parentBranches = note.getParentBranches();
        if (!parentBranches || parentBranches.length === 0) {
            return [];
        }

        // Collect up to maxParents parents, skipping branches whose parent
        // note cannot be resolved (avoids the null-filter + cast of the
        // original implementation)
        const parents: {id: string, title: string}[] = [];
        for (const branch of parentBranches.slice(0, maxParents)) {
            if (branch.parentNote) {
                parents.push({
                    id: branch.parentNote.noteId,
                    title: branch.parentNote.title
                });
            }
        }
        return parents;
    } catch (error) {
        console.error(`Error getting parent notes for ${noteId}:`, error);
        return [];
    }
}
/**
 * Get hierarchical context of parent notes
 * This function builds a representation of the note hierarchy to provide context
 *
 * @param noteId - note whose ancestry is rendered
 * @param maxDepth - maximum number of ancestor levels to walk
 * @param maxParents - maximum parents listed per level
 * @param includeCurrentNote - whether to append the note itself after its ancestors
 * @returns an indented bullet list of ancestors (one "- title" line each, the
 *          current note marked with ">"), or "No parent context available."
 */
export async function getParentContext(
    noteId: string,
    maxDepth: number = 3,
    maxParents: number = 3,
    includeCurrentNote: boolean = true
): Promise<string> {
    // Note: getParentNotes has been updated to use Becca API
    const note = becca.getNote(noteId);
    if (!note) {
        return "";
    }
    // Tracks visited note ids to avoid cycles when notes are cloned into
    // multiple places in the tree
    const visited = new Set<string>();
    let context = "";
    // Helper function to build the hierarchical context recursively;
    // appends directly to the enclosing `context` accumulator
    async function buildHierarchy(currentNoteId: string, depth: number, prefix: string = ""): Promise<void> {
        if (depth > maxDepth || visited.has(currentNoteId)) {
            return;
        }
        visited.add(currentNoteId);
        const parentNotes = await getParentNotes(currentNoteId, maxParents);
        for (const parent of parentNotes) {
            // Add parent with proper indentation
            context += `${prefix}- ${parent.title}\n`;
            // Recursively add parents of this parent with increased indentation
            await buildHierarchy(parent.id, depth + 1, prefix + " ");
        }
    }
    // Build the hierarchy starting from the current note
    await buildHierarchy(noteId, 1);
    // Add the current note at the end with appropriate indentation
    if (includeCurrentNote) {
        // Determine the indentation level based on hierarchy depth
        let indentation = "";
        if (context) {
            // If we have parent context, add the current note with proper indentation
            // NOTE(review): this is a fixed single space, not the depth of the
            // deepest ancestor — confirm whether depth-based indent was intended
            indentation = " ".repeat(1); // One level deeper than parents
            context += `${indentation}> ${note.title} (current note)\n`;
        } else {
            // If no parents, just add the current note
            context += `> ${note.title} (current note)\n`;
        }
    }
    if (!context) {
        return "No parent context available.";
    }
    return context;
}
/**
 * Get context from child notes
 *
 * Lists up to maxChildren child titles, optionally followed by a sanitized
 * ~100-character content summary per child.
 */
export async function getChildContext(
    noteId: string,
    maxChildren: number = 10,
    includeContent: boolean = false
): Promise<string> {
    const note = becca.getNote(noteId);
    if (!note) {
        return "";
    }

    try {
        // Get child notes using Becca API
        const childNotes = note.getChildNotes();
        if (!childNotes || childNotes.length === 0) {
            return "No child notes.";
        }

        const parts: string[] = [`Child notes (${childNotes.length} total):`];

        for (const child of childNotes.slice(0, maxChildren)) {
            parts.push(`- ${child.title}`);

            if (!includeContent) {
                continue;
            }
            try {
                const raw = String(await child.getContent() || "");
                // Sanitize, truncate, and flatten the content to one line
                const snippet = sanitizeHtmlContent(raw)
                    .substring(0, 100)
                    .trim()
                    .replace(/\n/g, ' ');
                if (snippet) {
                    parts.push(` Summary: ${snippet}${snippet.length >= 100 ? '...' : ''}`);
                }
            } catch (e) {
                // Content errors are non-fatal; the summary line is skipped
            }
        }

        if (childNotes.length > maxChildren) {
            parts.push(`... and ${childNotes.length - maxChildren} more child notes not shown`);
        }

        return parts.join('\n') + '\n';
    } catch (error) {
        console.error(`Error getting child context for ${noteId}:`, error);
        return "Error retrieving child notes.";
    }
}
/**
 * Get context from linked notes (relations)
 *
 * Lists outgoing relations (this note -> target) and incoming relations
 * (source note -> this note), each capped at maxRelations entries.
 */
export async function getLinkedNotesContext(
    noteId: string,
    maxRelations: number = 10
): Promise<string> {
    const note = becca.getNote(noteId);
    if (!note) {
        return "";
    }

    try {
        // Outgoing relations owned by this note
        const relations = note.getRelations();
        // Relations on other notes whose value targets this note
        const incomingRelations = note.getTargetRelations();

        // BUG FIX: the original returned "No linked notes." as soon as the
        // outgoing list was empty, hiding any incoming relations.
        if ((!relations || relations.length === 0) &&
            (!incomingRelations || incomingRelations.length === 0)) {
            return "No linked notes.";
        }

        let context = "";

        if (relations && relations.length > 0) {
            context += `Outgoing relations (${relations.length} total):\n`;
            for (const relation of relations.slice(0, maxRelations)) {
                // For an outgoing relation the target note id is the attribute value
                const targetNote = becca.getNote(relation.value || "");
                if (targetNote) {
                    const relationName = relation.name || 'relates to';
                    // BUG FIX: separator between relation name and title was missing
                    context += `- ${relationName}: ${targetNote.title}\n`;
                }
            }
            if (relations.length > maxRelations) {
                context += `... and ${relations.length - maxRelations} more outgoing relations not shown\n`;
            }
        }

        if (incomingRelations && incomingRelations.length > 0) {
            if (context) context += "\n";
            context += `Incoming relations (${incomingRelations.length} total):\n`;
            for (const relation of incomingRelations.slice(0, maxRelations)) {
                // BUG FIX: an incoming relation lives on the SOURCE note, so the
                // source is relation.noteId; relation.value points back at this
                // note and previously resolved every entry to the current note.
                const sourceNote = becca.getNote(relation.noteId || "");
                if (sourceNote) {
                    const relationName = relation.name || 'relates to';
                    context += `- ${sourceNote.title} (${relationName})\n`;
                }
            }
            if (incomingRelations.length > maxRelations) {
                context += `... and ${incomingRelations.length - maxRelations} more incoming relations not shown\n`;
            }
        }

        return context || "No linked notes.";
    } catch (error) {
        console.error(`Error getting linked notes context for ${noteId}:`, error);
        return "Error retrieving linked notes.";
    }
}

View File

@ -0,0 +1,616 @@
/**
* Context extraction module for LLM features
* Provides methods to extract relevant context from notes for LLM processing
*/
import becca from '../../../becca/becca.js';
import { getNoteContent, formatNoteContent, sanitizeHtmlContent } from './note_content.js';
import { detectLanguage, extractCodeStructure } from './code_handlers.js';
import { chunkContent, semanticChunking } from './chunking.js';
import type { ContentChunk, ChunkOptions } from './chunking.js';
import { summarizeContent, extractKeyPoints } from './summarization.js';
import { getParentNotes, getParentContext, getChildContext, getLinkedNotesContext } from './hierarchy.js';
import { getSemanticContext } from './semantic_context.js';
/**
 * Options for context extraction
 *
 * All maximums are upper bounds; extraction includes fewer items when fewer
 * related notes exist.
 */
export interface ContextOptions {
    /**
     * Include parent context (hierarchical ancestor list)
     */
    includeParents?: boolean;
    /**
     * Include child notes in context
     */
    includeChildren?: boolean;
    /**
     * Include linked notes (relations) in context
     */
    includeLinks?: boolean;
    /**
     * Include semantically similar notes
     */
    includeSimilar?: boolean;
    /**
     * Include note content in context
     */
    includeContent?: boolean;
    /**
     * Maximum depth for parent hierarchy
     */
    maxParentDepth?: number;
    /**
     * Maximum number of children to include
     */
    maxChildren?: number;
    /**
     * Maximum number of linked notes to include
     */
    maxLinks?: number;
    /**
     * Maximum number of similar notes to include
     */
    maxSimilarNotes?: number;
    /**
     * Maximum content length (presumably characters — confirm against
     * extractContext's truncation logic)
     */
    maxContentLength?: number;
}
/**
 * Default options for context extraction
 * Callers' ContextOptions are spread over these defaults, so any omitted
 * field falls back to the value below.
 */
const DEFAULT_CONTEXT_OPTIONS: Required<ContextOptions> = {
    includeParents: true,
    includeChildren: true,
    includeLinks: true,
    includeSimilar: false, // similarity lookup is opt-in
    includeContent: true,
    maxParentDepth: 3,
    maxChildren: 10,
    maxLinks: 10,
    maxSimilarNotes: 5,
    maxContentLength: 2000 // presumably characters — confirm in extractContext
};
/**
* Context Extractor class
* Handles extraction of context from notes for LLM processing
*/
export class ContextExtractor {
/**
* Get content of a note
*/
static async getNoteContent(noteId: string): Promise<string | null> {
return getNoteContent(noteId);
}
/**
* Get content of a note - instance method
*/
async getNoteContent(noteId: string): Promise<string | null> {
return ContextExtractor.getNoteContent(noteId);
}
/**
* Format note content based on its type
*/
static formatNoteContent(content: string, type: string, mime: string, title: string): string {
return formatNoteContent(content, type, mime, title);
}
/**
* Format note content based on its type - instance method
*/
formatNoteContent(content: string, type: string, mime: string, title: string): string {
return ContextExtractor.formatNoteContent(content, type, mime, title);
}
/**
* Sanitize HTML content to plain text
*/
static sanitizeHtmlContent(html: string): string {
return sanitizeHtmlContent(html);
}
/**
* Sanitize HTML content to plain text - instance method
*/
sanitizeHtmlContent(html: string): string {
return ContextExtractor.sanitizeHtmlContent(html);
}
/**
* Detect programming language from content
*/
static detectLanguage(content: string, mime: string): string {
return detectLanguage(content, mime);
}
/**
* Detect programming language from content - instance method
*/
detectLanguage(content: string, mime: string): string {
return ContextExtractor.detectLanguage(content, mime);
}
/**
* Extract structure from code
*/
static extractCodeStructure(content: string, language: string): string {
return extractCodeStructure(content, language);
}
/**
* Extract structure from code - instance method
*/
extractCodeStructure(content: string, language: string): string {
return ContextExtractor.extractCodeStructure(content, language);
}
/**
* Chunk content into smaller pieces
*/
static chunkContent(
content: string,
title: string = '',
noteId: string = '',
options: ChunkOptions = {}
): ContentChunk[] {
return chunkContent(content, title, noteId, options);
}
/**
* Chunk content into smaller pieces - instance method
*/
chunkContent(
content: string,
title: string = '',
noteId: string = '',
options: ChunkOptions = {}
): ContentChunk[] {
return ContextExtractor.chunkContent(content, title, noteId, options);
}
/**
* Smarter chunking that respects semantic boundaries
*/
static semanticChunking(
content: string,
title: string = '',
noteId: string = '',
options: ChunkOptions = {}
): ContentChunk[] {
return semanticChunking(content, title, noteId, options);
}
/**
* Smarter chunking that respects semantic boundaries - instance method
*/
semanticChunking(
content: string,
title: string = '',
noteId: string = '',
options: ChunkOptions = {}
): ContentChunk[] {
return ContextExtractor.semanticChunking(content, title, noteId, options);
}
/**
* Summarize content
*/
static summarizeContent(
content: string,
title: string = ''
): string {
return summarizeContent(content, title);
}
/**
* Summarize content - instance method
*/
summarizeContent(
content: string,
title: string = ''
): string {
return ContextExtractor.summarizeContent(content, title);
}
/**
* Extract key points from content
*/
static extractKeyPoints(
content: string,
maxPoints: number = 5
): string[] {
return extractKeyPoints(content, maxPoints);
}
/**
* Extract key points from content - instance method
*/
extractKeyPoints(
content: string,
maxPoints: number = 5
): string[] {
return ContextExtractor.extractKeyPoints(content, maxPoints);
}
/**
* Get parent notes
*/
static async getParentNotes(
noteId: string,
maxParents: number = 5
): Promise<{id: string, title: string}[]> {
return getParentNotes(noteId, maxParents);
}
/**
* Get parent notes - instance method
*/
async getParentNotes(
noteId: string,
maxParents: number = 5
): Promise<{id: string, title: string}[]> {
return ContextExtractor.getParentNotes(noteId, maxParents);
}
/**
* Get hierarchical parent context
*/
static async getParentContext(
noteId: string,
maxDepth: number = 3,
maxParents: number = 3
): Promise<string> {
return getParentContext(noteId, maxDepth, maxParents);
}
/**
* Get hierarchical parent context - instance method
*/
async getParentContext(
noteId: string,
maxDepth: number = 3,
maxParents: number = 3
): Promise<string> {
return ContextExtractor.getParentContext(noteId, maxDepth, maxParents);
}
/**
* Get child context
*/
static async getChildContext(
noteId: string,
maxChildren: number = 10,
includeContent: boolean = false
): Promise<string> {
return getChildContext(noteId, maxChildren, includeContent);
}
/**
* Get child context - instance method
*/
async getChildContext(
noteId: string,
maxChildren: number = 10,
includeContent: boolean = false
): Promise<string> {
return ContextExtractor.getChildContext(noteId, maxChildren, includeContent);
}
    /**
     * Render context describing notes linked to this note via relations.
     *
     * @param noteId - Source note
     * @param maxRelations - Maximum number of linked notes to list
     */
    static async getLinkedNotesContext(
        noteId: string,
        maxRelations: number = 10
    ): Promise<string> {
        return getLinkedNotesContext(noteId, maxRelations);
    }
    /**
     * Instance wrapper around {@link ContextExtractor.getLinkedNotesContext}.
     */
    async getLinkedNotesContext(
        noteId: string,
        maxRelations: number = 10
    ): Promise<string> {
        return ContextExtractor.getLinkedNotesContext(noteId, maxRelations);
    }
    /**
     * Get semantically-similar-note context for a note.
     *
     * @param noteId - Note to find similar notes for
     * @param maxSimilarNotesOrQuery - Either the maximum number of similar
     *        notes (new API) or a legacy query string (old API).
     *        NOTE(review): when a string is passed, the query text itself is
     *        ignored and the default of 5 similar notes is used — kept only
     *        so legacy callers do not break; confirm that is acceptable.
     */
    static async getSemanticContext(
        noteId: string,
        maxSimilarNotesOrQuery: number | string = 5
    ): Promise<string> {
        // Handle both the new (number) and old (string query) parameter types
        if (typeof maxSimilarNotesOrQuery === 'string') {
            // Old API: The second parameter was a query string
            // For backward compatibility, we'll still accept this
            return getSemanticContext(noteId, { maxSimilarNotes: 5 });
        } else {
            // New API: The second parameter is maxSimilarNotes
            return getSemanticContext(noteId, { maxSimilarNotes: maxSimilarNotesOrQuery });
        }
    }
    /**
     * Instance wrapper around {@link ContextExtractor.getSemanticContext}.
     */
    async getSemanticContext(
        noteId: string,
        maxSimilarNotesOrQuery: number | string = 5
    ): Promise<string> {
        return ContextExtractor.getSemanticContext(noteId, maxSimilarNotesOrQuery);
    }
    /**
     * Extract full context for a note.
     *
     * Combines parent hierarchy, note content, children, linked notes and
     * semantically similar notes into one markdown document. Each section
     * is gated by a flag in `options` (merged over DEFAULT_CONTEXT_OPTIONS).
     *
     * @param noteId - ID of the note to build context for
     * @param options - Which context sources to include and their limits
     * @returns Markdown-formatted context, or "Note not found."
     */
    static async extractContext(
        noteId: string,
        options: ContextOptions = {}
    ): Promise<string> {
        const config: Required<ContextOptions> = { ...DEFAULT_CONTEXT_OPTIONS, ...options };
        const note = becca.getNote(noteId);
        if (!note) {
            return "Note not found.";
        }
        let context = `# Context for note: ${note.title}\n\n`;
        // Include parent context
        if (config.includeParents) {
            const parentContext = await ContextExtractor.getParentContext(
                noteId,
                config.maxParentDepth,
                3 // Default to 3 parents per level
            );
            if (parentContext) {
                context += `## Parent Hierarchy\n${parentContext}\n\n`;
            }
        }
        // Include note content
        if (config.includeContent) {
            const content = await ContextExtractor.getNoteContent(noteId);
            if (content) {
                // If content is too large, summarize it
                let contentSection = '';
                if (content.length > config.maxContentLength) {
                    contentSection = ContextExtractor.summarizeContent(content, note.title);
                    contentSection += "\n\n[Content summarized due to length]";
                } else {
                    contentSection = content;
                }
                context += `## Note Content\n${contentSection}\n\n`;
            }
        }
        // Include child context (titles only — child content is omitted)
        if (config.includeChildren) {
            const childContext = await ContextExtractor.getChildContext(
                noteId,
                config.maxChildren,
                false // Don't include child content by default
            );
            if (childContext && childContext !== "No child notes.") {
                context += `## Child Notes\n${childContext}\n\n`;
            }
        }
        // Include linked notes
        if (config.includeLinks) {
            const linkedContext = await ContextExtractor.getLinkedNotesContext(
                noteId,
                config.maxLinks
            );
            if (linkedContext && linkedContext !== "No linked notes.") {
                context += `## Linked Notes\n${linkedContext}\n\n`;
            }
        }
        // Include semantically similar notes; the sentinel-substring check
        // filters out the "no results" message produced by getSemanticContext
        if (config.includeSimilar) {
            const semanticContext = await ContextExtractor.getSemanticContext(
                noteId,
                config.maxSimilarNotes
            );
            if (semanticContext && !semanticContext.includes("No semantically similar notes found.")) {
                context += `## Similar Notes\n${semanticContext}\n\n`;
            }
        }
        return context;
    }
    /**
     * Instance wrapper around {@link ContextExtractor.extractContext}.
     */
    async extractContext(
        noteId: string,
        options: ContextOptions = {}
    ): Promise<string> {
        return ContextExtractor.extractContext(noteId, options);
    }
    /**
     * Get progressively loaded context based on depth level.
     * Higher depths presumably include more surrounding context — the exact
     * semantics are defined by the semantic context service (TODO confirm).
     * Falls back to extractContext() when that service is unavailable
     * or throws.
     *
     * @param noteId - The ID of the note to get context for
     * @param depth - Depth level (1-4) determining how much context to include
     * @returns Context appropriate for the requested depth
     */
    static async getProgressiveContext(noteId: string, depth = 1): Promise<string> {
        try {
            // This requires the semantic context service to be available
            // We're using a dynamic import to avoid circular dependencies
            const { default: aiServiceManager } = await import('../ai_service_manager.js');
            const semanticContext = aiServiceManager.getInstance().getSemanticContextService();
            if (!semanticContext) {
                return ContextExtractor.extractContext(noteId);
            }
            return await semanticContext.getProgressiveContext(noteId, depth);
        } catch (error) {
            // Fall back to regular context if progressive loading fails
            console.error('Error in progressive context loading:', error);
            return ContextExtractor.extractContext(noteId);
        }
    }
    /**
     * Instance wrapper around {@link ContextExtractor.getProgressiveContext}.
     */
    async getProgressiveContext(noteId: string, depth = 1): Promise<string> {
        return ContextExtractor.getProgressiveContext(noteId, depth);
    }
    /**
     * Get smart context based on the query.
     * Delegates depth/relevance selection to the semantic context service;
     * falls back to extractContext() when that service is unavailable
     * or throws.
     *
     * @param noteId - The ID of the note to get context for
     * @param query - The user's query for semantic relevance matching
     * @returns The optimal context for answering the query
     */
    static async getSmartContext(noteId: string, query: string): Promise<string> {
        try {
            // This requires the semantic context service to be available
            // We're using a dynamic import to avoid circular dependencies
            const { default: aiServiceManager } = await import('../ai_service_manager.js');
            const semanticContext = aiServiceManager.getInstance().getSemanticContextService();
            if (!semanticContext) {
                return ContextExtractor.extractContext(noteId);
            }
            return await semanticContext.getSmartContext(noteId, query);
        } catch (error) {
            // Fall back to regular context if smart context fails
            console.error('Error in smart context selection:', error);
            return ContextExtractor.extractContext(noteId);
        }
    }
    /**
     * Instance wrapper around {@link ContextExtractor.getSmartContext}.
     */
    async getSmartContext(noteId: string, query: string): Promise<string> {
        return ContextExtractor.getSmartContext(noteId, query);
    }
    /**
     * Get the full context for a note, including parent hierarchy, content,
     * and children. Legacy method for backwards compatibility — equivalent
     * to extractContext() with default options.
     */
    static async getFullContext(noteId: string): Promise<string> {
        // Use extractContext with default options
        return ContextExtractor.extractContext(noteId);
    }
    /**
     * Instance wrapper around {@link ContextExtractor.getFullContext}.
     */
    async getFullContext(noteId: string): Promise<string> {
        return ContextExtractor.getFullContext(noteId);
    }
    /**
     * Get a note's content, summarized when it is large.
     * Kept for backward compatibility.
     *
     * @param noteId - Note to summarize
     * @param maxLength - Content length above which a summary is generated.
     *        NOTE(review): this threshold is not forwarded to
     *        summarizeContent, which applies its own default maximum summary
     *        length — confirm that is intended.
     * @returns Full content, a summary, or '' when the note is missing
     */
    static async getNoteSummary(noteId: string, maxLength = 5000): Promise<string> {
        const note = becca.getNote(noteId);
        if (!note) return '';
        const content = await getNoteContent(noteId);
        if (!content || content.length < maxLength) return content || '';
        // For larger content, generate a summary
        return summarizeContent(content, note.title);
    }
    /**
     * Instance wrapper around {@link ContextExtractor.getNoteSummary}.
     */
    async getNoteSummary(noteId: string, maxLength = 5000): Promise<string> {
        return ContextExtractor.getNoteSummary(noteId, maxLength);
    }
    /**
     * Split a large note into smaller, semantically meaningful chunks.
     * Useful when a note exceeds the context window of an LLM.
     * Kept for backward compatibility: returns plain strings rather than
     * the ContentChunk objects produced by the chunking module.
     *
     * @param noteId - Note to chunk
     * @param maxChunkSize - Maximum chunk size in characters
     * @returns Chunk contents, or [] when the note is missing or empty
     */
    static async getChunkedNoteContent(noteId: string, maxChunkSize = 2000): Promise<string[]> {
        const content = await getNoteContent(noteId);
        if (!content) return [];
        // Use the new chunking functionality
        const chunks = chunkContent(
            content,
            '',
            noteId,
            { maxChunkSize, respectBoundaries: true }
        );
        // Convert to the old API format which was an array of strings
        return chunks.map(chunk => chunk.content);
    }
    /**
     * Instance wrapper around {@link ContextExtractor.getChunkedNoteContent}.
     */
    async getChunkedNoteContent(noteId: string, maxChunkSize = 2000): Promise<string[]> {
        return ContextExtractor.getChunkedNoteContent(noteId, maxChunkSize);
    }
}
// Export all modules
export {
getNoteContent,
formatNoteContent,
sanitizeHtmlContent,
detectLanguage,
extractCodeStructure,
chunkContent,
semanticChunking,
summarizeContent,
extractKeyPoints,
getParentNotes,
getParentContext,
getChildContext,
getLinkedNotesContext,
getSemanticContext
};
// Export types
export type {
ContentChunk,
ChunkOptions
};

View File

@ -0,0 +1,223 @@
import sanitizeHtml from 'sanitize-html';
import becca from '../../../becca/becca.js';
/**
 * Load a note's content via Becca and format it for LLM consumption.
 *
 * @param noteId - ID of the note to read
 * @returns The formatted content, or null when the note does not exist
 *          or its content cannot be read
 */
export async function getNoteContent(noteId: string): Promise<string | null> {
    const note = becca.getNote(noteId);
    if (!note) {
        return null;
    }
    try {
        // Becca may return a buffer/undefined; normalize to a string first.
        const rawContent = String(await note.getContent() || "");
        return formatNoteContent(rawContent, note.type, note.mime, note.title);
    } catch (error) {
        console.error(`Error getting content for note ${noteId}:`, error);
        return null;
    }
}
/**
 * Format the content of a note based on its type.
 *
 * Produces a markdown document starting with the note title as a heading,
 * followed by a plain-text rendition of the note body. JSON-based note
 * types (canvas, mindMap, relationMap, geoMap) have their text extracted;
 * binary types (image, file) are replaced with a placeholder.
 *
 * @param content - Raw note content
 * @param type - Trilium note type ('text', 'code', 'canvas', ...)
 * @param mime - MIME type of the content
 * @param title - Note title, used as the top-level heading
 */
export function formatNoteContent(content: string, type: string, mime: string, title: string): string {
    let formattedContent = `# ${title}\n\n`;
    switch (type) {
        case 'text':
            // Strip ALL markup for text notes. An explicit empty allow-list
            // is required here: sanitize-html's default configuration keeps
            // many formatting tags, which would leak raw HTML into the
            // LLM context.
            formattedContent += sanitizeHtml(content, { allowedTags: [], allowedAttributes: {} });
            break;
        case 'code':
            // For code, we'll handle this in code_handlers.ts
            // Just use basic formatting here
            formattedContent += '```\n' + content + '\n```';
            break;
        case 'canvas':
            if (mime === 'application/json') {
                try {
                    // Parse JSON content
                    const jsonContent = JSON.parse(content);
                    // Extract text elements from canvas
                    if (jsonContent.elements && Array.isArray(jsonContent.elements)) {
                        const texts = jsonContent.elements
                            .filter((element: any) => element.type === 'text' && element.text)
                            .map((element: any) => element.text);
                        formattedContent += 'Canvas content:\n' + texts.join('\n');
                    } else {
                        formattedContent += '[Empty canvas]';
                    }
                }
                catch (e: any) {
                    formattedContent += `[Error parsing canvas content: ${e.message}]`;
                }
            } else {
                formattedContent += '[Canvas content]';
            }
            break;
        case 'mindMap':
            if (mime === 'application/json') {
                try {
                    // Parse JSON content
                    const jsonContent = JSON.parse(content);
                    // Recursively collect the text of every node in the tree
                    const extractMindMapNodes = (node: any): string[] => {
                        let texts: string[] = [];
                        if (node.text) {
                            texts.push(node.text);
                        }
                        if (node.children && Array.isArray(node.children)) {
                            for (const child of node.children) {
                                texts = texts.concat(extractMindMapNodes(child));
                            }
                        }
                        return texts;
                    };
                    if (jsonContent.root) {
                        formattedContent += 'Mind map content:\n' + extractMindMapNodes(jsonContent.root).join('\n');
                    } else {
                        formattedContent += '[Empty mind map]';
                    }
                }
                catch (e: any) {
                    formattedContent += `[Error parsing mind map content: ${e.message}]`;
                }
            } else {
                formattedContent += '[Mind map content]';
            }
            break;
        case 'relationMap':
            if (mime === 'application/json') {
                try {
                    // Parse JSON content
                    const jsonContent = JSON.parse(content);
                    // Extract relation map entities and connections
                    let result = 'Relation map content:\n';
                    if (jsonContent.notes && Array.isArray(jsonContent.notes)) {
                        result += 'Notes: ' + jsonContent.notes
                            .map((note: any) => note.title || note.name)
                            .filter(Boolean)
                            .join(', ') + '\n';
                    }
                    if (jsonContent.relations && Array.isArray(jsonContent.relations)) {
                        // Render each relation as "source → (name) → target",
                        // resolving note IDs back to titles where possible
                        result += 'Relations: ' + jsonContent.relations
                            .map((rel: any) => {
                                const sourceNote = jsonContent.notes.find((n: any) => n.noteId === rel.sourceNoteId);
                                const targetNote = jsonContent.notes.find((n: any) => n.noteId === rel.targetNoteId);
                                const source = sourceNote ? (sourceNote.title || sourceNote.name) : 'unknown';
                                const target = targetNote ? (targetNote.title || targetNote.name) : 'unknown';
                                return `${source}${rel.name || ''}${target}`;
                            })
                            .join('; ');
                    }
                    formattedContent += result;
                }
                catch (e: any) {
                    formattedContent += `[Error parsing relation map content: ${e.message}]`;
                }
            } else {
                formattedContent += '[Relation map content]';
            }
            break;
        case 'geoMap':
            if (mime === 'application/json') {
                try {
                    // Parse JSON content
                    const jsonContent = JSON.parse(content);
                    let result = 'Geographic map content:\n';
                    if (jsonContent.markers && Array.isArray(jsonContent.markers)) {
                        if (jsonContent.markers.length > 0) {
                            result += jsonContent.markers
                                .map((marker: any) => {
                                    return `Location: ${marker.title || ''} (${marker.lat}, ${marker.lng})${marker.description ? ' - ' + marker.description : ''}`;
                                })
                                .join('\n');
                        } else {
                            result += 'Empty geographic map';
                        }
                    } else {
                        result += 'Empty geographic map';
                    }
                    formattedContent += result;
                }
                catch (e: any) {
                    formattedContent += `[Error parsing geographic map content: ${e.message}]`;
                }
            } else {
                formattedContent += '[Geographic map content]';
            }
            break;
        case 'mermaid':
            // Format mermaid diagrams as code blocks
            formattedContent += '```mermaid\n' + content + '\n```';
            break;
        case 'image':
        case 'file':
            formattedContent += `[${type} attachment]`;
            break;
        default:
            // For other note types, strip any markup (empty allow-list —
            // see the 'text' case) and use the remaining text as is.
            formattedContent += sanitizeHtml(content, { allowedTags: [], allowedAttributes: {} });
    }
    return formattedContent;
}
/**
 * Sanitize HTML content to plain text.
 *
 * Strips every tag, collapses runs of blank lines, and decodes the most
 * common HTML entities that sanitize-html may leave behind.
 *
 * @param html - HTML markup (may be empty)
 * @returns Plain-text rendition of the markup
 */
export function sanitizeHtmlContent(html: string): string {
    if (!html) return '';
    // Use sanitizeHtml to remove all HTML tags
    let content = sanitizeHtml(html, {
        allowedTags: [],
        allowedAttributes: {},
        textFilter: (text) => {
            // Replace multiple newlines with a single one
            return text.replace(/\n\s*\n/g, '\n\n');
        }
    });
    // Additional cleanup for any remaining HTML entities.
    // `&amp;` MUST be decoded LAST: decoding it earlier turns an escaped
    // entity such as "&amp;quot;" into "&quot;", which a later replace
    // would then wrongly decode a second time into a literal quote.
    content = content
        .replace(/&nbsp;/g, ' ')
        .replace(/&lt;/g, '<')
        .replace(/&gt;/g, '>')
        .replace(/&quot;/g, '"')
        .replace(/&#39;/g, "'")
        .replace(/&amp;/g, '&');
    return content;
}

View File

@ -0,0 +1,225 @@
/**
* Contains functions for semantic context extraction
* Uses more intelligent methods to determine relevant context
*/
import { sanitizeHtmlContent } from './note_content.js';
import becca from '../../../becca/becca.js';
import { getNoteContent } from './note_content.js';
/**
 * Options controlling semantic context extraction.
 * All fields are optional; see DEFAULT_SEMANTIC_CONTEXT_OPTIONS for defaults.
 */
export interface SemanticContextOptions {
    /**
     * Maximum number of similar notes to include
     */
    maxSimilarNotes?: number;
    /**
     * Whether to include note content snippets under each similar note
     */
    includeContent?: boolean;
    /**
     * Maximum length of content snippets, in characters
     */
    snippetLength?: number;
    /**
     * Minimum similarity score (0-1) to include a note
     */
    minSimilarity?: number;
}
/**
 * Default options for semantic context extraction.
 * Merged under caller-supplied options in getSemanticContext().
 */
const DEFAULT_SEMANTIC_CONTEXT_OPTIONS: Required<SemanticContextOptions> = {
    maxSimilarNotes: 5,
    includeContent: true,
    snippetLength: 200,
    minSimilarity: 0.7
};
/**
 * Build a context section listing notes semantically related to `noteId`.
 *
 * Simplified version without vector-store integration — relatedness comes
 * from structural heuristics (see findRelatedNotes). Use vector_store for
 * actual semantic search.
 *
 * @param noteId - Note to find related notes for
 * @param options - Partial options, merged over the defaults
 * @returns Markdown-formatted context, or an explanatory message
 */
export async function getSemanticContext(
    noteId: string,
    options: SemanticContextOptions = {}
): Promise<string> {
    // Resolve caller options against the defaults.
    const config: Required<SemanticContextOptions> = {
        ...DEFAULT_SEMANTIC_CONTEXT_OPTIONS,
        ...options
    };
    try {
        const note = becca.getNote(noteId);
        if (!note) {
            return "Note not found.";
        }
        // The current note's content is the basis for similarity scoring.
        const noteContent = await getNoteContent(noteId);
        if (!noteContent) {
            return "No content available for similarity comparison.";
        }
        const relatedNotes = await findRelatedNotes(noteId, noteContent, config);
        // Assemble the markdown output from parts.
        const parts: string[] = [`Semantically related notes to "${note.title}":\n\n`];
        if (relatedNotes.length === 0) {
            parts.push("No semantically similar notes found.");
            return parts.join('');
        }
        for (const related of relatedNotes) {
            parts.push(`## ${related.title}\n`);
            if (config.includeContent && related.snippet) {
                parts.push(`${related.snippet}\n\n`);
            }
        }
        return parts.join('');
    } catch (error) {
        console.error(`Error getting semantic context for ${noteId}:`, error);
        return "Error retrieving semantic context.";
    }
}
/**
 * Find notes related to `noteId` using simple structural heuristics.
 *
 * Placeholder for real semantic search: instead of vector embeddings it
 * considers (1) sibling notes under the same parents and (2) notes linked
 * via relations, scoring each with a word-overlap similarity. Relation
 * targets receive a +0.2 boost (an explicit relation already implies a
 * semantic connection) and are included regardless of minSimilarity.
 *
 * @param noteId - The note to find related notes for
 * @param noteContent - Pre-fetched content of that note
 * @param options - Fully-resolved semantic context options
 * @returns Related notes sorted by descending score, capped at maxSimilarNotes
 */
async function findRelatedNotes(
    noteId: string,
    noteContent: string,
    options: Required<SemanticContextOptions>
): Promise<{ id: string, title: string, snippet: string | null, score: number }[]> {
    const results: { id: string, title: string, snippet: string | null, score: number }[] = [];
    const note = becca.getNote(noteId);
    if (!note) {
        return results;
    }
    // 1. Check siblings (notes with the same parent)
    const parentBranches = note.getParentBranches();
    // Tracks every note already considered so notes reachable through
    // several parents or relations are only scored once.
    const processedNotes = new Set<string>();
    processedNotes.add(noteId); // Don't include the current note
    // Process parent branches to find siblings
    for (const branch of parentBranches) {
        if (!branch.parentNote) {
            continue;
        }
        const parentNote = branch.parentNote;
        const siblingNotes = parentNote.getChildNotes().filter(n => n.noteId !== noteId);
        for (const siblingNote of siblingNotes) {
            if (processedNotes.has(siblingNote.noteId)) {
                continue;
            }
            processedNotes.add(siblingNote.noteId);
            const siblingContent = await getNoteContent(siblingNote.noteId);
            if (!siblingContent) {
                continue;
            }
            // Calculate a very simple similarity score
            const score = calculateSimpleTextSimilarity(noteContent, siblingContent);
            if (score >= options.minSimilarity) {
                results.push({
                    id: siblingNote.noteId,
                    title: siblingNote.title,
                    // NOTE(review): '...' is appended even when the content is
                    // shorter than snippetLength — harmless, but confirm.
                    snippet: siblingContent.substring(0, options.snippetLength) + '...',
                    score
                });
            }
        }
    }
    // 2. Check notes connected by relations
    const relations = note.getRelations();
    for (const relation of relations) {
        const targetNoteId = relation.value;
        if (!targetNoteId || processedNotes.has(targetNoteId)) {
            continue;
        }
        processedNotes.add(targetNoteId);
        const targetNote = becca.getNote(targetNoteId);
        if (!targetNote) {
            continue;
        }
        const targetContent = await getNoteContent(targetNoteId);
        if (!targetContent) {
            continue;
        }
        // Relations are already semantically connected, so give them a boost
        const score = calculateSimpleTextSimilarity(noteContent, targetContent) + 0.2;
        results.push({
            id: targetNoteId,
            title: targetNote.title,
            snippet: targetContent.substring(0, options.snippetLength) + '...',
            score: Math.min(score, 1.0) // Cap at 1.0
        });
    }
    // Sort by similarity score (highest first) and limit
    return results
        .sort((a, b) => b.score - a.score)
        .slice(0, options.maxSimilarNotes);
}
/**
 * Calculate a crude text similarity score in [0, 1] from shared words.
 * Jaccard similarity over the sets of lowercase words longer than three
 * characters — a stand-in until real embedding similarity is available.
 */
function calculateSimpleTextSimilarity(text1: string, text2: string): number {
    // Tokenize: strip HTML, lowercase, keep only words of 4+ characters.
    const tokenize = (text: string): Set<string> =>
        new Set(
            sanitizeHtmlContent(text)
                .toLowerCase()
                .split(/\W+/)
                .filter(word => word.length > 3)
        );
    const words1 = tokenize(text1);
    const words2 = tokenize(text2);
    // No meaningful comparison when either text has no significant words.
    if (words1.size === 0 || words2.size === 0) {
        return 0;
    }
    let sharedCount = 0;
    words1.forEach(word => {
        if (words2.has(word)) {
            sharedCount++;
        }
    });
    // Jaccard similarity: |intersection| / |union|
    return sharedCount / (words1.size + words2.size - sharedCount);
}

View File

@ -0,0 +1,162 @@
/**
* Contains functions for generating summaries of note content
* Used to provide concise context for LLM processing
*/
import { sanitizeHtmlContent } from './note_content.js';
/**
 * Options controlling local summarization.
 * All fields are optional; see DEFAULT_SUMMARIZATION_OPTIONS for defaults.
 */
export interface SummarizationOptions {
    /**
     * Maximum length of the summary in characters
     */
    maxLength?: number;
    /**
     * Whether to include title in the summary.
     * NOTE(review): not currently read by summarizeContent — confirm intent.
     */
    includeTitle?: boolean;
    /**
     * Minimum content length to trigger summarization
     */
    minContentLengthForSummarization?: number;
}
/**
 * Default summarization options, merged under caller-supplied options
 * in summarizeContent().
 */
const DEFAULT_SUMMARIZATION_OPTIONS: Required<SummarizationOptions> = {
    maxLength: 500,
    includeTitle: true,
    minContentLengthForSummarization: 1000
};
/**
 * Summarize note content without any LLM API calls.
 *
 * Content shorter than minContentLengthForSummarization is returned as-is
 * (truncated to maxLength when needed); longer content goes through the
 * local extractive summarizer.
 *
 * @param content - Raw note content (may contain HTML)
 * @param title - Note title; currently unused, kept for API compatibility
 * @param options - Partial options, merged over the defaults
 */
export function summarizeContent(
    content: string,
    title: string = '',
    options: SummarizationOptions = {}
): string {
    // Resolve caller options against the defaults.
    const config: Required<SummarizationOptions> = {
        ...DEFAULT_SUMMARIZATION_OPTIONS,
        ...options
    };
    const cleaned = sanitizeHtmlContent(content);
    // Small content needs no summary — at most a truncation.
    if (cleaned.length < config.minContentLengthForSummarization) {
        return cleaned.length > config.maxLength
            ? cleaned.substring(0, config.maxLength) + '...'
            : cleaned;
    }
    return generateLocalSummary(cleaned, config);
}
/**
 * Build a short extractive summary locally, without any LLM API.
 * Picks the first paragraph that looks like prose (reasonably long and not
 * a heading or image line) and truncates it to options.maxLength.
 */
function generateLocalSummary(content: string, options: Required<SummarizationOptions>): string {
    const paragraphs = content.split(/\n\s*\n/);
    // Find the first paragraph that reads as prose rather than markup.
    let summary = '';
    for (const paragraph of paragraphs) {
        const isProse = paragraph.length > 30
            && !paragraph.startsWith('#')
            && !paragraph.startsWith('!');
        if (isProse) {
            summary = paragraph;
            break;
        }
    }
    // No suitable paragraph — fall back to a prefix of the raw content.
    if (summary === '') {
        summary = content.substring(0, options.maxLength * 0.8);
    }
    // Enforce the configured maximum length.
    return summary.length > options.maxLength
        ? summary.substring(0, options.maxLength) + '...'
        : summary;
}
/**
 * Extract key points from content as a list of sentences.
 * Purely heuristic and local — no LLM API calls are made.
 *
 * @param content - Raw note content (may contain HTML)
 * @param maxPoints - Maximum number of key points to return
 */
export function extractKeyPoints(
    content: string,
    maxPoints: number = 5
): string[] {
    // Strip markup first, then run the local heuristic extractor.
    return generateLocalKeyPoints(sanitizeHtmlContent(content), maxPoints);
}
/**
 * Generate key points locally without using an LLM API.
 *
 * Splits the content into candidate sentences and scores each by position
 * (openings and closings score higher), presence of importance-indicator
 * phrases, and a preference for medium-length sentences; the top
 * `maxPoints` sentences are returned with a trailing period.
 *
 * @param content - Plain-text content (HTML already stripped by the caller)
 * @param maxPoints - Maximum number of sentences to return
 */
function generateLocalKeyPoints(content: string, maxPoints: number): string[] {
    // Split into candidate sentences, dropping short fragments.
    const sentences = content
        .replace(/\n+/g, ' ')
        .split(/[.!?]/)
        .map(s => s.trim())
        .filter(s => s.length > 20);
    // Heuristics for important sentences - look for indicator phrases
    const importanceMarkers = [
        'important', 'key', 'significant', 'essential', 'critical',
        'main', 'primary', 'crucial', 'vital', 'fundamental',
        'in summary', 'to summarize', 'in conclusion', 'conclude',
        'therefore', 'thus', 'consequently', 'as a result'
    ];
    // Score sentences based on potential importance. Use the map callback's
    // index directly: the previous sentences.indexOf(sentence) lookup was
    // O(n^2) and, for duplicate sentences, always returned the FIRST
    // occurrence's position, mis-scoring later duplicates.
    const scoredSentences = sentences.map((sentence, index) => {
        let score = 0;
        // Sentences at the beginning or end are often important
        if (index < sentences.length * 0.1) score += 3;
        if (index > sentences.length * 0.9) score += 4;
        // Check for importance markers
        for (const marker of importanceMarkers) {
            if (sentence.toLowerCase().includes(marker)) {
                score += 2;
            }
        }
        // Prefer medium-length sentences
        if (sentence.length > 40 && sentence.length < 150) score += 2;
        return { sentence, score };
    });
    // Sort by score and take top N
    return scoredSentences
        .sort((a, b) => b.score - a.score)
        .slice(0, maxPoints)
        .map(item => item.sentence + '.');
}

View File

@ -1,871 +0,0 @@
import sql from '../sql.js';
import sanitizeHtml from 'sanitize-html';
import becca from '../../becca/becca.js';
/**
* Utility class for extracting context from notes to provide to AI models
* Enhanced with advanced capabilities for handling large notes and specialized content
*/
export class ContextExtractor {
/**
* Get the content of a note
*/
async getNoteContent(noteId: string): Promise<string | null> {
// Use Becca API to get note data
const note = becca.getNote(noteId);
if (!note) {
return null;
}
try {
// Get content using Becca API
const content = String(await note.getContent() || "");
return this.formatNoteContent(
content,
note.type,
note.mime,
note.title
);
} catch (error) {
console.error(`Error getting content for note ${noteId}:`, error);
return null;
}
}
/**
* Split a large note into smaller, semantically meaningful chunks
* This is useful for handling large notes that exceed the context window of LLMs
*
* @param noteId - The ID of the note to chunk
* @param maxChunkSize - Maximum size of each chunk in characters
* @returns Array of content chunks, or empty array if note not found
*/
async getChunkedNoteContent(noteId: string, maxChunkSize = 2000): Promise<string[]> {
const content = await this.getNoteContent(noteId);
if (!content) return [];
// Split into semantic chunks (paragraphs, sections, etc.)
return this.splitContentIntoChunks(content, maxChunkSize);
}
/**
* Split text content into semantically meaningful chunks based on natural boundaries
* like paragraphs, headings, and code blocks
*
* @param content - The text content to split
* @param maxChunkSize - Maximum size of each chunk in characters
* @returns Array of content chunks
*/
private splitContentIntoChunks(content: string, maxChunkSize: number): string[] {
// Look for semantic boundaries (headings, blank lines, etc.)
const headingPattern = /^(#+)\s+(.+)$/gm;
const codeBlockPattern = /```[\s\S]+?```/gm;
// Replace code blocks with placeholders to avoid splitting inside them
const codeBlocks: string[] = [];
let contentWithPlaceholders = content.replace(codeBlockPattern, (match) => {
const placeholder = `__CODE_BLOCK_${codeBlocks.length}__`;
codeBlocks.push(match);
return placeholder;
});
// Split content at headings and paragraphs
const sections: string[] = [];
let currentSection = '';
// First split by headings
const lines = contentWithPlaceholders.split('\n');
for (const line of lines) {
const isHeading = headingPattern.test(line);
headingPattern.lastIndex = 0; // Reset regex
// If this is a heading and we already have content, start a new section
if (isHeading && currentSection.trim().length > 0) {
sections.push(currentSection.trim());
currentSection = line;
} else {
currentSection += (currentSection ? '\n' : '') + line;
}
}
// Add the last section if there's any content
if (currentSection.trim().length > 0) {
sections.push(currentSection.trim());
}
// Now combine smaller sections to respect maxChunkSize
const chunks: string[] = [];
let currentChunk = '';
for (const section of sections) {
// If adding this section exceeds maxChunkSize and we already have content,
// finalize the current chunk and start a new one
if ((currentChunk + section).length > maxChunkSize && currentChunk.length > 0) {
chunks.push(currentChunk);
currentChunk = section;
} else {
currentChunk += (currentChunk ? '\n\n' : '') + section;
}
}
// Add the last chunk if there's any content
if (currentChunk.length > 0) {
chunks.push(currentChunk);
}
// Restore code blocks in all chunks
return chunks.map(chunk => {
return chunk.replace(/__CODE_BLOCK_(\d+)__/g, (_, index) => {
return codeBlocks[parseInt(index)];
});
});
}
/**
* Generate a summary of a note's content
* Useful for providing a condensed version of very large notes
*
* @param noteId - The ID of the note to summarize
* @param maxLength - Cut-off length to trigger summarization
* @returns Summary of the note or the original content if small enough
*/
async getNoteSummary(noteId: string, maxLength = 5000): Promise<string> {
const content = await this.getNoteContent(noteId);
if (!content || content.length < maxLength) return content || '';
// For larger content, generate a summary
return this.summarizeContent(content);
}
/**
* Summarize content by extracting key information
* This uses a heuristic approach to find important sentences and paragraphs
*
* @param content - The content to summarize
* @returns A summarized version of the content
*/
private summarizeContent(content: string): string {
// Extract title/heading if present
const titleMatch = content.match(/^# (.+)$/m);
const title = titleMatch ? titleMatch[1] : 'Untitled Note';
// Extract all headings for an outline
const headings: string[] = [];
const headingMatches = content.matchAll(/^(#+)\s+(.+)$/gm);
for (const match of headingMatches) {
const level = match[1].length;
const text = match[2];
headings.push(`${' '.repeat(level-1)}- ${text}`);
}
// Extract first sentence of each paragraph for a summary
const paragraphs = content.split(/\n\s*\n/);
const firstSentences = paragraphs
.filter(p => p.trim().length > 0 && !p.trim().startsWith('#') && !p.trim().startsWith('```'))
.map(p => {
const sentenceMatch = p.match(/^[^.!?]+[.!?]/);
return sentenceMatch ? sentenceMatch[0].trim() : p.substring(0, Math.min(150, p.length)).trim() + '...';
})
.slice(0, 5); // Limit to 5 sentences
// Create the summary
let summary = `# Summary of: ${title}\n\n`;
if (headings.length > 0) {
summary += `## Document Outline\n${headings.join('\n')}\n\n`;
}
if (firstSentences.length > 0) {
summary += `## Key Points\n${firstSentences.map(s => `- ${s}`).join('\n')}\n\n`;
}
summary += `(Note: This is an automatically generated summary of a larger document with ${content.length} characters)`;
return summary;
}
/**
* Get a set of parent notes to provide hierarchical context
*/
async getParentContext(noteId: string, maxDepth = 3): Promise<string> {
// Note: getParentNotes has already been updated to use Becca
const parents = await this.getParentNotes(noteId, maxDepth);
if (!parents.length) return '';
let context = 'Here is the hierarchical context for the current note:\n\n';
// Create a hierarchical view of the parents using indentation
// to show the proper parent-child relationship
let indentLevel = 0;
for (let i = 0; i < parents.length; i++) {
const parent = parents[i];
const indent = ' '.repeat(indentLevel);
context += `${indent}- ${parent.title}\n`;
indentLevel++;
}
// Now add the current note with proper indentation
const note = becca.getNote(noteId);
if (note) {
const indent = ' '.repeat(indentLevel);
context += `${indent}- ${note.title} (current note)\n`;
}
return context + '\n';
}
/**
* Get child notes to provide additional context
*/
async getChildContext(noteId: string, maxChildren = 5): Promise<string> {
const note = becca.getNote(noteId);
if (!note) {
return '';
}
// Use Becca API to get child notes
const childNotes = note.getChildNotes();
if (!childNotes || childNotes.length === 0) {
return '';
}
let context = 'The current note has these child notes:\n\n';
// Limit to maxChildren
const childrenToShow = childNotes.slice(0, maxChildren);
for (const child of childrenToShow) {
context += `- ${child.title}\n`;
}
// If there are more children than we're showing, indicate that
if (childNotes.length > maxChildren) {
context += `\n(+ ${childNotes.length - maxChildren} more child notes)\n`;
}
return context + '\n';
}
/**
* Get notes linked to this note
*/
async getLinkedNotesContext(noteId: string, maxLinks = 5): Promise<string> {
const note = becca.getNote(noteId);
if (!note) {
return '';
}
// Use Becca API to get relations
const relations = note.getRelations();
if (!relations || relations.length === 0) {
return '';
}
// Get the target notes from relations
const linkedNotes = relations
.map(relation => relation.targetNote)
.filter(note => note !== null && note !== undefined);
if (linkedNotes.length === 0) {
return '';
}
let context = 'This note has relationships with these notes:\n\n';
// Limit to maxLinks
const notesToShow = linkedNotes.slice(0, maxLinks);
for (const linked of notesToShow) {
context += `- ${linked.title}\n`;
}
// If there are more linked notes than we're showing, indicate that
if (linkedNotes.length > maxLinks) {
context += `\n(+ ${linkedNotes.length - maxLinks} more linked notes)\n`;
}
return context + '\n';
}
/**
* Format the content of a note based on its type
* Enhanced with better handling for large and specialized content types
*/
private formatNoteContent(content: string, type: string, mime: string, title: string): string {
let formattedContent = `# ${title}\n\n`;
switch (type) {
case 'text':
// Remove HTML formatting for text notes
formattedContent += this.sanitizeHtml(content);
break;
case 'code':
// Improved code handling with language detection
const codeLanguage = this.detectCodeLanguage(content, mime);
// For large code files, extract structure rather than full content
if (content.length > 8000) {
formattedContent += this.extractCodeStructure(content, codeLanguage);
} else {
formattedContent += `\`\`\`${codeLanguage}\n${content}\n\`\`\``;
}
break;
case 'canvas':
if (mime === 'application/json') {
try {
// Parse JSON content
const jsonContent = JSON.parse(content);
// Extract text elements from canvas
if (jsonContent.elements && Array.isArray(jsonContent.elements)) {
const texts = jsonContent.elements
.filter((element: any) => element.type === 'text' && element.text)
.map((element: any) => element.text);
formattedContent += 'Canvas content:\n' + texts.join('\n');
} else {
formattedContent += '[Empty canvas]';
}
}
catch (e: any) {
formattedContent += `[Error parsing canvas content: ${e.message}]`;
}
} else {
formattedContent += '[Canvas content]';
}
break;
case 'mindMap':
if (mime === 'application/json') {
try {
// Parse JSON content
const jsonContent = JSON.parse(content);
// Extract node text from mind map
const extractMindMapNodes = (node: any): string[] => {
let texts: string[] = [];
if (node.text) {
texts.push(node.text);
}
if (node.children && Array.isArray(node.children)) {
for (const child of node.children) {
texts = texts.concat(extractMindMapNodes(child));
}
}
return texts;
};
if (jsonContent.root) {
formattedContent += 'Mind map content:\n' + extractMindMapNodes(jsonContent.root).join('\n');
} else {
formattedContent += '[Empty mind map]';
}
}
catch (e: any) {
formattedContent += `[Error parsing mind map content: ${e.message}]`;
}
} else {
formattedContent += '[Mind map content]';
}
break;
case 'relationMap':
if (mime === 'application/json') {
try {
// Parse JSON content
const jsonContent = JSON.parse(content);
// Extract relation map entities and connections
let result = 'Relation map content:\n';
if (jsonContent.notes && Array.isArray(jsonContent.notes)) {
result += 'Notes: ' + jsonContent.notes
.map((note: any) => note.title || note.name)
.filter(Boolean)
.join(', ') + '\n';
}
if (jsonContent.relations && Array.isArray(jsonContent.relations)) {
result += 'Relations: ' + jsonContent.relations
.map((rel: any) => {
const sourceNote = jsonContent.notes.find((n: any) => n.noteId === rel.sourceNoteId);
const targetNote = jsonContent.notes.find((n: any) => n.noteId === rel.targetNoteId);
const source = sourceNote ? (sourceNote.title || sourceNote.name) : 'unknown';
const target = targetNote ? (targetNote.title || targetNote.name) : 'unknown';
return `${source}${rel.name || ''}${target}`;
})
.join('; ');
}
formattedContent += result;
}
catch (e: any) {
formattedContent += `[Error parsing relation map content: ${e.message}]`;
}
} else {
formattedContent += '[Relation map content]';
}
break;
case 'geoMap':
if (mime === 'application/json') {
try {
// Parse JSON content
const jsonContent = JSON.parse(content);
let result = 'Geographic map content:\n';
if (jsonContent.markers && Array.isArray(jsonContent.markers)) {
if (jsonContent.markers.length > 0) {
result += jsonContent.markers
.map((marker: any) => {
return `Location: ${marker.title || ''} (${marker.lat}, ${marker.lng})${marker.description ? ' - ' + marker.description : ''}`;
})
.join('\n');
} else {
result += 'Empty geographic map';
}
} else {
result += 'Empty geographic map';
}
formattedContent += result;
}
catch (e: any) {
formattedContent += `[Error parsing geographic map content: ${e.message}]`;
}
} else {
formattedContent += '[Geographic map content]';
}
break;
case 'mermaid':
// Format mermaid diagrams as code blocks
formattedContent += '```mermaid\n' + content + '\n```';
break;
case 'image':
case 'file':
formattedContent += `[${type} attachment]`;
break;
default:
// For other notes, just use the content as is
formattedContent += this.sanitizeHtml(content);
}
return formattedContent;
}
/**
* Detect the programming language of code content
*
* @param content - The code content to analyze
* @param mime - MIME type (if available)
* @returns The detected language or empty string
*/
private detectCodeLanguage(content: string, mime: string): string {
// First check if mime type provides a hint
if (mime) {
const mimeMap: Record<string, string> = {
'text/x-python': 'python',
'text/javascript': 'javascript',
'application/javascript': 'javascript',
'text/typescript': 'typescript',
'application/typescript': 'typescript',
'text/x-java': 'java',
'text/html': 'html',
'text/css': 'css',
'text/x-c': 'c',
'text/x-c++': 'cpp',
'text/x-csharp': 'csharp',
'text/x-go': 'go',
'text/x-ruby': 'ruby',
'text/x-php': 'php',
'text/x-swift': 'swift',
'text/x-rust': 'rust',
'text/markdown': 'markdown',
'text/x-sql': 'sql',
'text/x-yaml': 'yaml',
'application/json': 'json',
'text/x-shell': 'bash'
};
for (const [mimePattern, language] of Object.entries(mimeMap)) {
if (mime.includes(mimePattern)) {
return language;
}
}
}
// Check for common language patterns in the content
const firstLines = content.split('\n', 20).join('\n');
const languagePatterns: Record<string, RegExp> = {
'python': /^(import\s+|from\s+\w+\s+import|def\s+\w+\s*\(|class\s+\w+\s*:)/m,
'javascript': /^(const\s+\w+\s*=|let\s+\w+\s*=|var\s+\w+\s*=|function\s+\w+\s*\(|import\s+.*from\s+)/m,
'typescript': /^(interface\s+\w+|type\s+\w+\s*=|class\s+\w+\s*{)/m,
'html': /^<!DOCTYPE html>|<html>|<head>|<body>/m,
'css': /^(\.\w+\s*{|\#\w+\s*{|@media|@import)/m,
'java': /^(public\s+class|import\s+java|package\s+)/m,
'cpp': /^(#include\s+<\w+>|namespace\s+\w+|void\s+\w+\s*\()/m,
'csharp': /^(using\s+System|namespace\s+\w+|public\s+class)/m,
'go': /^(package\s+\w+|import\s+\(|func\s+\w+\s*\()/m,
'ruby': /^(require\s+|class\s+\w+\s*<|def\s+\w+)/m,
'php': /^(<\?php|namespace\s+\w+|use\s+\w+)/m,
'sql': /^(SELECT|INSERT|UPDATE|DELETE|CREATE TABLE|ALTER TABLE)/im,
'bash': /^(#!\/bin\/sh|#!\/bin\/bash|function\s+\w+\s*\(\))/m,
'markdown': /^(#\s+|##\s+|###\s+|\*\s+|-\s+|>\s+)/m,
'json': /^({[\s\n]*"|[\s\n]*\[)/m,
'yaml': /^(---|\w+:\s+)/m
};
for (const [language, pattern] of Object.entries(languagePatterns)) {
if (pattern.test(firstLines)) {
return language;
}
}
// Default to empty string if we can't detect the language
return '';
}
/**
* Extract the structure of a code file rather than its full content
* Useful for providing high-level understanding of large code files
*
* @param content - The full code content
* @param language - The programming language
* @returns A structured representation of the code
*/
private extractCodeStructure(content: string, language: string): string {
const lines = content.split('\n');
const maxLines = 8000;
// If it's not that much over the limit, just include the whole thing
if (lines.length <= maxLines * 1.2) {
return `\`\`\`${language}\n${content}\n\`\`\``;
}
// For large files, extract important structural elements based on language
let extractedStructure = '';
let importSection = '';
let classDefinitions = [];
let functionDefinitions = [];
let otherImportantLines = [];
// Extract imports/includes, class/function definitions based on language
if (['javascript', 'typescript', 'python', 'java', 'csharp'].includes(language)) {
// Find imports
for (let i = 0; i < Math.min(100, lines.length); i++) {
if (lines[i].match(/^(import|from|using|require|#include|package)\s+/)) {
importSection += lines[i] + '\n';
}
}
// Find class definitions
for (let i = 0; i < lines.length; i++) {
if (lines[i].match(/^(class|interface|type)\s+\w+/)) {
const endBracketLine = this.findMatchingEnd(lines, i, language);
if (endBracketLine > i && endBracketLine <= i + 10) {
// Include small class definitions entirely
classDefinitions.push(lines.slice(i, endBracketLine + 1).join('\n'));
i = endBracketLine;
} else {
// For larger classes, just show the definition and methods
let className = lines[i];
classDefinitions.push(className);
// Look for methods in this class
for (let j = i + 1; j < Math.min(endBracketLine, lines.length); j++) {
if (lines[j].match(/^\s+(function|def|public|private|protected)\s+\w+/)) {
classDefinitions.push(' ' + lines[j].trim());
}
}
if (endBracketLine > 0 && endBracketLine < lines.length) {
i = endBracketLine;
}
}
}
}
// Find function definitions not inside classes
for (let i = 0; i < lines.length; i++) {
if (lines[i].match(/^(function|def|const\s+\w+\s*=\s*\(|let\s+\w+\s*=\s*\(|var\s+\w+\s*=\s*\()/)) {
functionDefinitions.push(lines[i]);
}
}
}
// Build the extracted structure
extractedStructure += `# Code Structure (${lines.length} lines total)\n\n`;
if (importSection) {
extractedStructure += "## Imports/Dependencies\n```" + language + "\n" + importSection + "```\n\n";
}
if (classDefinitions.length > 0) {
extractedStructure += "## Classes/Interfaces\n```" + language + "\n" + classDefinitions.join('\n\n') + "\n```\n\n";
}
if (functionDefinitions.length > 0) {
extractedStructure += "## Functions\n```" + language + "\n" + functionDefinitions.join('\n\n') + "\n```\n\n";
}
// Add beginning and end of the file for context
extractedStructure += "## Beginning of File\n```" + language + "\n" +
lines.slice(0, Math.min(50, lines.length)).join('\n') + "\n```\n\n";
if (lines.length > 100) {
extractedStructure += "## End of File\n```" + language + "\n" +
lines.slice(Math.max(0, lines.length - 50)).join('\n') + "\n```\n\n";
}
return extractedStructure;
}
/**
* Find the line number of the matching ending bracket/block
*
* @param lines - Array of code lines
* @param startLine - Starting line number
* @param language - Programming language
* @returns The line number of the matching end, or -1 if not found
*/
private findMatchingEnd(lines: string[], startLine: number, language: string): number {
let depth = 0;
let inClass = false;
// Different languages have different ways to define blocks
if (['javascript', 'typescript', 'java', 'csharp', 'cpp'].includes(language)) {
// Curly brace languages
for (let i = startLine; i < lines.length; i++) {
const line = lines[i];
// Count opening braces
for (const char of line) {
if (char === '{') depth++;
if (char === '}') {
depth--;
if (depth === 0 && inClass) return i;
}
}
// Check if this line contains the class declaration
if (i === startLine && line.includes('{')) {
inClass = true;
} else if (i === startLine) {
// If the first line doesn't have an opening brace, look at the next few lines
if (i + 1 < lines.length && lines[i + 1].includes('{')) {
inClass = true;
}
}
}
} else if (language === 'python') {
// Indentation-based language
const baseIndentation = lines[startLine].match(/^\s*/)?.[0].length || 0;
for (let i = startLine + 1; i < lines.length; i++) {
// Skip empty lines
if (lines[i].trim() === '') continue;
const currentIndentation = lines[i].match(/^\s*/)?.[0].length || 0;
// If we're back to the same or lower indentation level, we've reached the end
if (currentIndentation <= baseIndentation) {
return i - 1;
}
}
}
return -1;
}
/**
* Sanitize HTML content to plain text
*/
private sanitizeHtml(html: string): string {
if (!html) return '';
// Use sanitizeHtml to remove all HTML tags
let content = sanitizeHtml(html, {
allowedTags: [],
allowedAttributes: {},
textFilter: (text) => {
// Replace multiple newlines with a single one
return text.replace(/\n\s*\n/g, '\n\n');
}
});
// Additional cleanup for any remaining HTML entities
content = content
.replace(/&nbsp;/g, ' ')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&amp;/g, '&')
.replace(/&quot;/g, '"')
.replace(/&#39;/g, "'");
return content;
}
/**
* Get parent notes in the hierarchy
*/
private async getParentNotes(noteId: string, maxDepth: number): Promise<{noteId: string, title: string}[]> {
const parentNotes: {noteId: string, title: string}[] = [];
const startNote = becca.getNote(noteId);
if (!startNote) {
return parentNotes;
}
// Use non-null assertion as we checked above
let currentNote: any = startNote;
for (let i = 0; i < maxDepth; i++) {
// Get parent branches (should be just one in most cases)
if (!currentNote) break;
const parentBranches: any[] = currentNote.getParentBranches();
if (!parentBranches || parentBranches.length === 0) {
break;
}
// Use the first parent branch
const branch: any = parentBranches[0];
if (!branch) break;
const parentNote: any = branch.getParentNote();
if (!parentNote || parentNote.noteId === 'root') {
break;
}
parentNotes.unshift({
noteId: parentNote.noteId,
title: parentNote.title
});
currentNote = parentNote;
}
return parentNotes;
}
/**
* Get the full context for a note, including parent hierarchy, content, and children
*/
async getFullContext(noteId: string): Promise<string> {
const noteContent = await this.getNoteContent(noteId);
if (!noteContent) {
return 'Note not found';
}
const parentContext = await this.getParentContext(noteId);
const childContext = await this.getChildContext(noteId);
const linkedContext = await this.getLinkedNotesContext(noteId);
return [
parentContext,
noteContent,
childContext,
linkedContext
].filter(Boolean).join('\n\n');
}
/**
* Get semantically ranked context based on semantic similarity to a query
* This method delegates to the semantic context service for the actual ranking
*
* @param noteId - The ID of the current note
* @param query - The user's query to compare against
* @param maxResults - Maximum number of related notes to include
* @returns Context with the most semantically relevant related notes
*/
async getSemanticContext(noteId: string, query: string, maxResults = 5): Promise<string> {
try {
// This requires the semantic context service to be available
// We're using a dynamic import to avoid circular dependencies
const { default: aiServiceManager } = await import('./ai_service_manager.js');
const semanticContext = aiServiceManager.getInstance().getSemanticContextService();
if (!semanticContext) {
return this.getFullContext(noteId);
}
return await semanticContext.getSemanticContext(noteId, query, maxResults);
} catch (error) {
// Fall back to regular context if semantic ranking fails
console.error('Error in semantic context ranking:', error);
return this.getFullContext(noteId);
}
}
/**
* Get progressively loaded context based on depth level
* This provides different levels of context detail depending on the depth parameter
*
* @param noteId - The ID of the note to get context for
* @param depth - Depth level (1-4) determining how much context to include
* @returns Context appropriate for the requested depth
*/
async getProgressiveContext(noteId: string, depth = 1): Promise<string> {
try {
// This requires the semantic context service to be available
// We're using a dynamic import to avoid circular dependencies
const { default: aiServiceManager } = await import('./ai_service_manager.js');
const semanticContext = aiServiceManager.getInstance().getSemanticContextService();
if (!semanticContext) {
return this.getFullContext(noteId);
}
return await semanticContext.getProgressiveContext(noteId, depth);
} catch (error) {
// Fall back to regular context if progressive loading fails
console.error('Error in progressive context loading:', error);
return this.getFullContext(noteId);
}
}
/**
* Get smart context based on the query complexity
* This automatically selects the appropriate context depth and relevance
*
* @param noteId - The ID of the note to get context for
* @param query - The user's query for semantic relevance matching
* @returns The optimal context for answering the query
*/
async getSmartContext(noteId: string, query: string): Promise<string> {
try {
// This requires the semantic context service to be available
// We're using a dynamic import to avoid circular dependencies
const { default: aiServiceManager } = await import('./ai_service_manager.js');
const semanticContext = aiServiceManager.getInstance().getSemanticContextService();
if (!semanticContext) {
return this.getFullContext(noteId);
}
return await semanticContext.getSmartContext(noteId, query);
} catch (error) {
// Fall back to regular context if smart context fails
console.error('Error in smart context selection:', error);
return this.getFullContext(noteId);
}
}
}
// Shared singleton instance: all default importers reuse one ContextExtractor
// so they share any state it may accumulate.
const contextExtractor = new ContextExtractor();
export default contextExtractor;

View File

@ -412,7 +412,8 @@ export async function getNoteEmbeddingContext(noteId: string): Promise<NoteEmbed
try {
// Use the enhanced context extractor for improved content extraction
// We're using a dynamic import to avoid circular dependencies
const { default: contextExtractor } = await import('../../llm/context_extractor.js');
const { ContextExtractor } = await import('../../llm/context/index.js');
const contextExtractor = new ContextExtractor();
// Get the content using the enhanced formatNoteContent method in context extractor
const noteContent = await contextExtractor.getNoteContent(noteId);
@ -836,7 +837,8 @@ async function processNoteWithChunking(
): Promise<void> {
try {
// Get the context extractor dynamically to avoid circular dependencies
const { default: contextExtractor } = await import('../../llm/context_extractor.js');
const { ContextExtractor } = await import('../../llm/context/index.js');
const contextExtractor = new ContextExtractor();
// Get chunks of the note content
const chunks = await contextExtractor.getChunkedNoteContent(noteId);

View File

@ -1,4 +1,4 @@
import contextExtractor from './context_extractor.js';
import { ContextExtractor } from './context/index.js';
import * as vectorStore from './embeddings/vector_store.js';
import sql from '../sql.js';
import { cosineSimilarity } from './embeddings/vector_store.js';
@ -58,6 +58,9 @@ import options from '../options.js';
* knowledge bases when working with limited-context LLMs.
*/
class SemanticContextService {
// Create an instance of ContextExtractor for backward compatibility
private contextExtractor = new ContextExtractor();
/**
* Get the preferred embedding provider based on user settings
* Tries to use the most appropriate provider in this order:
@ -156,7 +159,7 @@ class SemanticContextService {
if (!noteEmbedding) {
// If note doesn't have an embedding yet, get content and generate one
const content = await contextExtractor.getNoteContent(note.noteId);
const content = await this.contextExtractor.getNoteContent(note.noteId);
if (content && provider) {
try {
noteEmbedding = await provider.generateEmbeddings(content);
@ -225,7 +228,7 @@ class SemanticContextService {
const mostRelevantNotes = rankedNotes.slice(0, maxResults);
const relevantContent = await Promise.all(
mostRelevantNotes.map(async note => {
const content = await contextExtractor.getNoteContent(note.noteId);
const content = await this.contextExtractor.getNoteContent(note.noteId);
if (!content) return null;
// Format with relevance score and title
@ -253,22 +256,22 @@ class SemanticContextService {
*/
async getProgressiveContext(noteId: string, depth = 1): Promise<string> {
// Start with the note content
const noteContent = await contextExtractor.getNoteContent(noteId);
const noteContent = await this.contextExtractor.getNoteContent(noteId);
if (!noteContent) return 'Note not found';
// If depth is 1, just return the note content
if (depth <= 1) return noteContent;
// Add parent context for depth >= 2
const parentContext = await contextExtractor.getParentContext(noteId);
const parentContext = await this.contextExtractor.getParentContext(noteId);
if (depth <= 2) return `${parentContext}\n\n${noteContent}`;
// Add child context for depth >= 3
const childContext = await contextExtractor.getChildContext(noteId);
const childContext = await this.contextExtractor.getChildContext(noteId);
if (depth <= 3) return `${parentContext}\n\n${noteContent}\n\n${childContext}`;
// Add linked notes for depth >= 4
const linkedContext = await contextExtractor.getLinkedNotesContext(noteId);
const linkedContext = await this.contextExtractor.getLinkedNotesContext(noteId);
return `${parentContext}\n\n${noteContent}\n\n${childContext}\n\n${linkedContext}`;
}