Mirror of https://github.com/TriliumNext/Notes.git (synced 2025-08-10 10:22:29 +08:00)

Commit 71b3b04c53 (parent 0985cec8d6): break up the huge context_extractor into smaller files
@@ -4,7 +4,7 @@ import { OpenAIService } from './providers/openai_service.js';
 import { AnthropicService } from './providers/anthropic_service.js';
 import { OllamaService } from './providers/ollama_service.js';
 import log from '../log.js';
-import contextExtractor from './context_extractor.js';
+import { ContextExtractor } from './context/index.js';
 import semanticContextService from './semantic_context_service.js';
 
 type ServiceProviders = 'openai' | 'anthropic' | 'ollama';
@@ -216,3 +216,6 @@ export default {
         return getInstance().getSemanticContextService();
     }
 };
+
+// Create an instance of ContextExtractor for backward compatibility
+const contextExtractor = new ContextExtractor();
@@ -1,7 +1,10 @@
 import type { Message, ChatCompletionOptions } from './ai_interface.js';
 import aiServiceManager from './ai_service_manager.js';
 import chatStorageService from './chat_storage_service.js';
-import contextExtractor from './context_extractor.js';
+import { ContextExtractor } from './context/index.js';
+
+// Create an instance of ContextExtractor for backward compatibility
+const contextExtractor = new ContextExtractor();
 
 export interface ChatSession {
     id: string;
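The pattern in both hunks is the same: the monolithic default export of './context_extractor.js' is replaced by the ContextExtractor class from './context/index.js', and a module-level instance is constructed under the old name so existing call sites keep working. A minimal sketch of a consumer that is unaffected by the refactor (the note ID is illustrative):

import { ContextExtractor } from './context/index.js';

// Same shape as the old default export, so downstream code is unchanged.
const contextExtractor = new ContextExtractor();

async function demo() {
    const content = await contextExtractor.getNoteContent('someNoteId');
    console.log(content ?? '(note not found)');
}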
src/services/llm/context/chunking.ts (new file, 288 lines)
@@ -0,0 +1,288 @@
/**
 * Contains functions for chunking content into smaller pieces for processing
 * These functions are used to properly prepare content for LLM context windows
 */

/**
 * Interface for chunked content
 */
export interface ContentChunk {
    content: string;
    prefix: string;
    noteId?: string;
    title?: string;
    path?: string;
    metadata?: Record<string, any>;
}

/**
 * Options for the chunking process
 */
export interface ChunkOptions {
    /**
     * Maximum size of each chunk in characters
     * Defaults to LLM context window size (typically around 2048)
     */
    maxChunkSize?: number;

    /**
     * How much chunks should overlap to maintain context
     */
    overlapSize?: number;

    /**
     * Whether to respect sentence and paragraph boundaries
     */
    respectBoundaries?: boolean;

    /**
     * Whether to add metadata to chunks
     */
    includeMetadata?: boolean;

    /**
     * Additional information to include in chunk metadata
     */
    metadata?: Record<string, any>;
}

/**
 * Default options for chunking
 */
const DEFAULT_CHUNK_OPTIONS: Required<ChunkOptions> = {
    maxChunkSize: 1500, // Characters per chunk
    overlapSize: 100,   // Overlap between chunks
    respectBoundaries: true,
    includeMetadata: true,
    metadata: {}
};

/**
 * Chunk content into smaller pieces
 * Used for processing large documents and preparing them for LLMs
 */
export function chunkContent(
    content: string,
    title: string = '',
    noteId: string = '',
    options: ChunkOptions = {}
): ContentChunk[] {
    // Merge provided options with defaults
    const config: Required<ChunkOptions> = { ...DEFAULT_CHUNK_OPTIONS, ...options };

    // If content is small enough, return as a single chunk
    if (content.length <= config.maxChunkSize) {
        return [{
            content,
            prefix: title,
            noteId,
            title,
            metadata: config.metadata
        }];
    }

    const chunks: ContentChunk[] = [];

    if (config.respectBoundaries) {
        // Try to split on paragraph boundaries first
        const paragraphs = content.split(/\n\s*\n/);

        let currentChunk = '';
        let currentPrefix = title ? title : '';

        for (const paragraph of paragraphs) {
            // If adding this paragraph would exceed max size, create a new chunk
            if (currentChunk.length + paragraph.length > config.maxChunkSize) {
                // If current chunk is not empty, add it to chunks
                if (currentChunk.length > 0) {
                    chunks.push({
                        content: currentChunk,
                        prefix: currentPrefix,
                        noteId,
                        title,
                        metadata: config.metadata
                    });
                }

                // Start a new chunk, use the overlap if possible
                if (config.overlapSize > 0 && currentChunk.length > 0) {
                    // For overlap, take the last N characters
                    const overlapText = currentChunk.slice(-config.overlapSize);
                    currentChunk = overlapText + paragraph;
                    currentPrefix = `${title} (continued)`;
                } else {
                    currentChunk = paragraph;
                    currentPrefix = `${title} (continued)`;
                }
            } else {
                // Add paragraph to current chunk
                if (currentChunk.length > 0) {
                    currentChunk += '\n\n';
                }
                currentChunk += paragraph;
            }
        }

        // Add the last chunk if it's not empty
        if (currentChunk.length > 0) {
            chunks.push({
                content: currentChunk,
                prefix: currentPrefix,
                noteId,
                title,
                metadata: config.metadata
            });
        }
    } else {
        // Simple chunking by character count
        let currentPosition = 0;

        while (currentPosition < content.length) {
            const chunkEnd = Math.min(currentPosition + config.maxChunkSize, content.length);

            const chunk = content.substring(currentPosition, chunkEnd);
            const prefix = currentPosition === 0 ? title : `${title} (continued)`;

            chunks.push({
                content: chunk,
                prefix,
                noteId,
                title,
                metadata: config.metadata
            });

            // Move position, considering overlap
            currentPosition = chunkEnd - (config.overlapSize || 0);

            // Prevent infinite loop if overlap is too large
            if (currentPosition <= 0 || currentPosition >= content.length) {
                break;
            }
        }
    }

    return chunks;
}

/**
 * Smarter chunking that tries to respect semantic boundaries like headers and sections
 */
export function semanticChunking(
    content: string,
    title: string = '',
    noteId: string = '',
    options: ChunkOptions = {}
): ContentChunk[] {
    // Merge provided options with defaults
    const config: Required<ChunkOptions> = { ...DEFAULT_CHUNK_OPTIONS, ...options };

    // If content is small enough, return as a single chunk
    if (content.length <= config.maxChunkSize) {
        return [{
            content,
            prefix: title,
            noteId,
            title,
            metadata: config.metadata
        }];
    }

    const chunks: ContentChunk[] = [];

    // Try to split on headers first
    const headerPattern = /#{1,6}\s+.+|<h[1-6][^>]*>.*?<\/h[1-6]>/g;
    const sections = [];

    let lastIndex = 0;
    let match;

    // First, find all headers and split content into sections
    while ((match = headerPattern.exec(content)) !== null) {
        if (match.index > lastIndex) {
            // Add the content before this header
            sections.push(content.substring(lastIndex, match.index));
        }

        // Start a new section with this header
        lastIndex = match.index;
    }

    // Add the last section
    if (lastIndex < content.length) {
        sections.push(content.substring(lastIndex));
    }

    // If no headers were found, fall back to regular chunking
    if (sections.length <= 1) {
        return chunkContent(content, title, noteId, options);
    }

    // Process each section
    let currentChunk = '';
    let currentPrefix = title;

    for (const section of sections) {
        // If adding this section would exceed max size, create a new chunk
        if (currentChunk.length + section.length > config.maxChunkSize) {
            // If this single section is too big, it needs to be chunked further
            if (section.length > config.maxChunkSize) {
                // First add the current chunk if not empty
                if (currentChunk.length > 0) {
                    chunks.push({
                        content: currentChunk,
                        prefix: currentPrefix,
                        noteId,
                        title,
                        metadata: config.metadata
                    });
                }

                // Chunk this section separately
                const sectionChunks = chunkContent(
                    section,
                    title,
                    noteId,
                    options
                );

                chunks.push(...sectionChunks);

                // Reset current chunk
                currentChunk = '';
                currentPrefix = `${title} (continued)`;
            } else {
                // Add current chunk to chunks
                chunks.push({
                    content: currentChunk,
                    prefix: currentPrefix,
                    noteId,
                    title,
                    metadata: config.metadata
                });

                // Start a new chunk with this section
                currentChunk = section;
                currentPrefix = `${title} (continued)`;
            }
        } else {
            // Add section to current chunk
            if (currentChunk.length > 0 && !currentChunk.endsWith('\n')) {
                currentChunk += '\n\n';
            }
            currentChunk += section;
        }
    }

    // Add the last chunk if it's not empty
    if (currentChunk.length > 0) {
        chunks.push({
            content: currentChunk,
            prefix: currentPrefix,
            noteId,
            title,
            metadata: config.metadata
        });
    }

    return chunks;
}
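A minimal usage sketch of the module above (values and the note ID are illustrative, not defaults mandated by the module): semanticChunking splits on markdown/HTML headers where it can and falls back to chunkContent otherwise.

import { semanticChunking, type ContentChunk } from './chunking.js';

// Hypothetical markdown note: headers become chunk boundaries where possible.
const markdown = '# Intro\n\nSome text...\n\n# Details\n\nMore text...';

const chunks: ContentChunk[] = semanticChunking(markdown, 'My Note', 'note123', {
    maxChunkSize: 500, // smaller than the 1500-char default, for illustration
    overlapSize: 50
});

for (const chunk of chunks) {
    // prefix is the title, or "<title> (continued)" for follow-up chunks
    console.log(chunk.prefix, chunk.content.length);
}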
src/services/llm/context/code_handlers.ts (new file, 433 lines)
@@ -0,0 +1,433 @@
/**
 * Helper functions for processing code notes, including language detection and structure extraction
 */

/**
 * Attempt to detect the programming language from code content or note attributes
 */
export function detectLanguage(content: string, mime: string): string {
    // First check MIME type for hints
    if (mime) {
        const mimeLower = mime.toLowerCase();

        // Map of mime types to language names
        const mimeMap: {[key: string]: string} = {
            'text/javascript': 'javascript',
            'application/javascript': 'javascript',
            'text/typescript': 'typescript',
            'application/typescript': 'typescript',
            'text/x-python': 'python',
            'text/x-java': 'java',
            'text/x-c': 'c',
            'text/x-c++': 'cpp',
            'text/x-csharp': 'csharp',
            'text/x-go': 'go',
            'text/x-ruby': 'ruby',
            'text/x-php': 'php',
            'text/x-rust': 'rust',
            'text/x-swift': 'swift',
            'text/x-kotlin': 'kotlin',
            'text/x-scala': 'scala',
            'text/x-perl': 'perl',
            'text/x-lua': 'lua',
            'text/x-r': 'r',
            'text/x-dart': 'dart',
            'text/html': 'html',
            'text/css': 'css',
            'application/json': 'json',
            'application/xml': 'xml',
            'text/markdown': 'markdown',
            'text/yaml': 'yaml',
            'text/x-sql': 'sql'
        };

        if (mimeMap[mimeLower]) {
            return mimeMap[mimeLower];
        }
    }

    // Check for common language patterns in the first few lines
    const firstLines = content.split('\n').slice(0, 10).join('\n');

    // Simple heuristics for common languages
    if (firstLines.includes('<?php')) return 'php';
    if (firstLines.includes('#!/usr/bin/python') || (firstLines.includes('import ') && firstLines.includes('def '))) return 'python';
    if (firstLines.includes('#!/bin/bash') || firstLines.includes('#!/usr/bin/bash')) return 'bash';
    if (firstLines.includes('#!/usr/bin/perl')) return 'perl';
    if (firstLines.includes('#!/usr/bin/ruby')) return 'ruby';
    if (firstLines.includes('package ') && firstLines.includes('import ') && firstLines.includes('public class ')) return 'java';
    if (firstLines.includes('using System;') && firstLines.includes('namespace ')) return 'csharp';
    if (firstLines.includes('package main') && firstLines.includes('import (') && firstLines.includes('func ')) return 'go';
    if (firstLines.includes('#include <') && (firstLines.includes('int main(') || firstLines.includes('void main('))) {
        if (firstLines.includes('std::')) return 'cpp';
        return 'c';
    }
    if (firstLines.includes('fn main()') && firstLines.includes('let ') && firstLines.includes('impl ')) return 'rust';
    if (firstLines.includes('<!DOCTYPE html>') || firstLines.includes('<html>')) return 'html';
    if (firstLines.includes('function ') && firstLines.includes('var ') && firstLines.includes('const ')) return 'javascript';
    if (firstLines.includes('interface ') && firstLines.includes('export class ')) return 'typescript';
    if (firstLines.includes('@Component') || firstLines.includes('import { Component }')) return 'typescript';

    // Default to 'text' if language can't be determined
    return 'text';
}

/**
 * Extract structure from code to create a summary
 */
export function extractCodeStructure(content: string, language: string): string {
    // Avoid processing very large code files
    if (content.length > 100000) {
        return "Code content too large for structure extraction";
    }

    let structure = "";

    try {
        switch (language.toLowerCase()) {
            case 'javascript':
            case 'typescript':
                structure = extractJsStructure(content);
                break;

            case 'python':
                structure = extractPythonStructure(content);
                break;

            case 'java':
            case 'csharp':
            case 'cpp':
                structure = extractClassBasedStructure(content);
                break;

            case 'go':
                structure = extractGoStructure(content);
                break;

            case 'rust':
                structure = extractRustStructure(content);
                break;

            case 'html':
                structure = extractHtmlStructure(content);
                break;

            default:
                // For other languages, just return a summary of the file size and a few lines
                const lines = content.split('\n');
                structure = `Code file with ${lines.length} lines.\n`;

                // Add first few non-empty lines that aren't comments
                const firstCodeLines = lines.filter(line =>
                    line.trim() !== '' &&
                    !line.trim().startsWith('//') &&
                    !line.trim().startsWith('#') &&
                    !line.trim().startsWith('*') &&
                    !line.trim().startsWith('<!--')
                ).slice(0, 5);

                if (firstCodeLines.length > 0) {
                    structure += "First few code lines:\n" + firstCodeLines.join('\n');
                }
        }
    } catch (e: any) {
        return `Error extracting code structure: ${e.message}`;
    }

    return structure;
}

/**
 * Extract structure from JavaScript/TypeScript code
 */
function extractJsStructure(content: string): string {
    const lines = content.split('\n');
    let structure = "";

    // Look for imports/requires
    const imports = lines.filter(line =>
        line.trim().startsWith('import ') ||
        line.includes('require(')
    ).slice(0, 10);

    if (imports.length > 0) {
        structure += "Imports:\n" + imports.join('\n') + '\n\n';
    }

    // Look for class declarations
    const classes = [];
    for (let i = 0; i < lines.length; i++) {
        const line = lines[i].trim();
        if (line.startsWith('class ') || line.includes(' class ')) {
            classes.push(line);
        }
    }

    if (classes.length > 0) {
        structure += "Classes:\n" + classes.join('\n') + '\n\n';
    }

    // Look for function declarations
    const functions = [];
    for (let i = 0; i < lines.length; i++) {
        const line = lines[i].trim();
        if (line.startsWith('function ') ||
            line.match(/^(const|let|var)\s+\w+\s*=\s*function/) ||
            line.match(/^(const|let|var)\s+\w+\s*=\s*\(/)) {
            functions.push(line);
        }
    }

    if (functions.length > 0) {
        structure += "Functions:\n" + functions.slice(0, 15).join('\n');
        if (functions.length > 15) {
            structure += `\n... and ${functions.length - 15} more functions`;
        }
        structure += '\n\n';
    }

    return structure;
}

/**
 * Extract structure from Python code
 */
function extractPythonStructure(content: string): string {
    const lines = content.split('\n');
    let structure = "";

    // Look for imports
    const imports = lines.filter(line =>
        line.trim().startsWith('import ') ||
        line.trim().startsWith('from ')
    ).slice(0, 10);

    if (imports.length > 0) {
        structure += "Imports:\n" + imports.join('\n') + '\n\n';
    }

    // Look for class declarations
    const classes = [];
    for (let i = 0; i < lines.length; i++) {
        const line = lines[i].trim();
        if (line.startsWith('class ')) {
            classes.push(line);
        }
    }

    if (classes.length > 0) {
        structure += "Classes:\n" + classes.join('\n') + '\n\n';
    }

    // Look for function declarations
    const functions = [];
    for (let i = 0; i < lines.length; i++) {
        const line = lines[i].trim();
        if (line.startsWith('def ')) {
            functions.push(line);
        }
    }

    if (functions.length > 0) {
        structure += "Functions:\n" + functions.slice(0, 15).join('\n');
        if (functions.length > 15) {
            structure += `\n... and ${functions.length - 15} more functions`;
        }
        structure += '\n\n';
    }

    return structure;
}

/**
 * Extract structure from class-based languages like Java, C#, C++
 */
function extractClassBasedStructure(content: string): string {
    const lines = content.split('\n');
    let structure = "";

    // Look for package/namespace declarations
    const packageLines = lines.filter(line =>
        line.trim().startsWith('package ') ||
        line.trim().startsWith('namespace ') ||
        line.trim().startsWith('using ')
    ).slice(0, 5);

    if (packageLines.length > 0) {
        structure += "Package/Imports:\n" + packageLines.join('\n') + '\n\n';
    }

    // Look for class declarations
    const classes = [];
    for (let i = 0; i < lines.length; i++) {
        const line = lines[i].trim();
        if (line.match(/^(public|private|protected)?\s*(class|interface|enum)\s+\w+/)) {
            classes.push(line);
        }
    }

    if (classes.length > 0) {
        structure += "Classes/Interfaces:\n" + classes.join('\n') + '\n\n';
    }

    // Look for method declarations
    const methods = [];
    for (let i = 0; i < lines.length; i++) {
        const line = lines[i].trim();
        if (line.match(/^(public|private|protected)?\s*(static)?\s*[\w<>[\]]+\s+\w+\s*\(/)) {
            methods.push(line);
        }
    }

    if (methods.length > 0) {
        structure += "Methods:\n" + methods.slice(0, 15).join('\n');
        if (methods.length > 15) {
            structure += `\n... and ${methods.length - 15} more methods`;
        }
        structure += '\n\n';
    }

    return structure;
}

/**
 * Extract structure from Go code
 */
function extractGoStructure(content: string): string {
    const lines = content.split('\n');
    let structure = "";

    // Look for package declarations
    const packageLines = lines.filter(line => line.trim().startsWith('package ')).slice(0, 1);

    if (packageLines.length > 0) {
        structure += "Package:\n" + packageLines.join('\n') + '\n\n';
    }

    // Look for imports
    const importStart = lines.findIndex(line => line.trim() === 'import (');
    if (importStart !== -1) {
        let importEnd = lines.findIndex((line, i) => i > importStart && line.trim() === ')');
        if (importEnd !== -1) {
            structure += "Imports:\n" + lines.slice(importStart, importEnd + 1).join('\n') + '\n\n';
        }
    }

    // Look for type declarations (structs, interfaces)
    const types = [];
    for (let i = 0; i < lines.length; i++) {
        const line = lines[i].trim();
        if (line.startsWith('type ') && (line.includes(' struct ') || line.includes(' interface '))) {
            types.push(line);
        }
    }

    if (types.length > 0) {
        structure += "Types:\n" + types.join('\n') + '\n\n';
    }

    // Look for function declarations
    const functions = [];
    for (let i = 0; i < lines.length; i++) {
        const line = lines[i].trim();
        if (line.startsWith('func ')) {
            functions.push(line);
        }
    }

    if (functions.length > 0) {
        structure += "Functions:\n" + functions.slice(0, 15).join('\n');
        if (functions.length > 15) {
            structure += `\n... and ${functions.length - 15} more functions`;
        }
        structure += '\n\n';
    }

    return structure;
}

/**
 * Extract structure from Rust code
 */
function extractRustStructure(content: string): string {
    const lines = content.split('\n');
    let structure = "";

    // Look for module declarations
    const moduleLines = lines.filter(line => line.trim().startsWith('mod ') || line.trim().startsWith('use ')).slice(0, 10);

    if (moduleLines.length > 0) {
        structure += "Modules/Imports:\n" + moduleLines.join('\n') + '\n\n';
    }

    // Look for struct/enum/trait declarations
    const types = [];
    for (let i = 0; i < lines.length; i++) {
        const line = lines[i].trim();
        if (line.startsWith('struct ') || line.startsWith('enum ') || line.startsWith('trait ')) {
            types.push(line);
        }
    }

    if (types.length > 0) {
        structure += "Types:\n" + types.join('\n') + '\n\n';
    }

    // Look for function/impl declarations
    const functions = [];
    const impls = [];

    for (let i = 0; i < lines.length; i++) {
        const line = lines[i].trim();
        if (line.startsWith('fn ')) {
            functions.push(line);
        }
        if (line.startsWith('impl ')) {
            impls.push(line);
        }
    }

    if (impls.length > 0) {
        structure += "Implementations:\n" + impls.join('\n') + '\n\n';
    }

    if (functions.length > 0) {
        structure += "Functions:\n" + functions.slice(0, 15).join('\n');
        if (functions.length > 15) {
            structure += `\n... and ${functions.length - 15} more functions`;
        }
        structure += '\n\n';
    }

    return structure;
}

/**
 * Extract structure from HTML
 */
function extractHtmlStructure(content: string): string {
    const lines = content.split('\n');

    // Extract title
    const titleMatch = content.match(/<title>(.*?)<\/title>/i);
    const title = titleMatch ? titleMatch[1] : "No title";

    // Count main elements
    const headings = content.match(/<h[1-6].*?>.*?<\/h[1-6]>/gi) || [];
    const divs = content.match(/<div.*?>/gi) || [];
    const scripts = content.match(/<script.*?>.*?<\/script>/gis) || [];
    const links = content.match(/<a.*?>.*?<\/a>/gi) || [];
    const images = content.match(/<img.*?>/gi) || [];

    // Extract some key elements
    const structure = `HTML Document: "${title}"
Document structure:
- Contains ${headings.length} headings
- Contains ${divs.length} div elements
- Contains ${scripts.length} script blocks
- Contains ${links.length} links
- Contains ${images.length} images
`;

    return structure;
}
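A quick usage sketch of the handlers above (the source string is illustrative): MIME wins when it is in the map; otherwise heuristics over the first ten lines decide, falling back to 'text'.

import { detectLanguage, extractCodeStructure } from './code_handlers.js';

const source = 'import os\n\ndef main():\n    pass\n';

const language = detectLanguage(source, 'text/x-python'); // 'python' via the MIME map
const summary = extractCodeStructure(source, language);   // imports + def overview

console.log(summary);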
src/services/llm/context/hierarchy.ts (new file, 243 lines)
@@ -0,0 +1,243 @@
import becca from '../../../becca/becca.js';
import { sanitizeHtmlContent } from './note_content.js';

/**
 * Get a list of parent notes for a given note
 */
export async function getParentNotes(noteId: string, maxParents: number = 5): Promise<{id: string, title: string}[]> {
    const note = becca.getNote(noteId);

    if (!note) {
        return [];
    }

    try {
        // Use Becca API to get parent branches and notes
        const parentBranches = note.getParentBranches();

        if (!parentBranches || parentBranches.length === 0) {
            return [];
        }

        // Map to get parent notes, limiting to maxParents
        const parentNotes = parentBranches
            .slice(0, maxParents)
            .map(branch => {
                if (!branch.parentNote) {
                    return null;
                }

                return {
                    id: branch.parentNote.noteId,
                    title: branch.parentNote.title
                };
            })
            .filter(note => note !== null) as {id: string, title: string}[];

        return parentNotes;
    } catch (error) {
        console.error(`Error getting parent notes for ${noteId}:`, error);
        return [];
    }
}

/**
 * Get hierarchical context of parent notes
 * This function builds a representation of the note hierarchy to provide context
 */
export async function getParentContext(
    noteId: string,
    maxDepth: number = 3,
    maxParents: number = 3,
    includeCurrentNote: boolean = true
): Promise<string> {
    // Note: getParentNotes has been updated to use Becca API
    const note = becca.getNote(noteId);

    if (!note) {
        return "";
    }

    const visited = new Set<string>();
    let context = "";

    // Helper function to build the hierarchical context recursively
    async function buildHierarchy(currentNoteId: string, depth: number, prefix: string = ""): Promise<void> {
        if (depth > maxDepth || visited.has(currentNoteId)) {
            return;
        }

        visited.add(currentNoteId);
        const parentNotes = await getParentNotes(currentNoteId, maxParents);

        for (const parent of parentNotes) {
            // Add parent with proper indentation
            context += `${prefix}- ${parent.title}\n`;

            // Recursively add parents of this parent with increased indentation
            await buildHierarchy(parent.id, depth + 1, prefix + "  ");
        }
    }

    // Build the hierarchy starting from the current note
    await buildHierarchy(noteId, 1);

    // Add the current note at the end with appropriate indentation
    if (includeCurrentNote) {
        // Determine the indentation level based on hierarchy depth
        let indentation = "";
        if (context) {
            // If we have parent context, add the current note with proper indentation
            indentation = "  ".repeat(1); // One level deeper than parents
            context += `${indentation}> ${note.title} (current note)\n`;
        } else {
            // If no parents, just add the current note
            context += `> ${note.title} (current note)\n`;
        }
    }

    if (!context) {
        return "No parent context available.";
    }

    return context;
}

/**
 * Get context from child notes
 */
export async function getChildContext(
    noteId: string,
    maxChildren: number = 10,
    includeContent: boolean = false
): Promise<string> {
    const note = becca.getNote(noteId);

    if (!note) {
        return "";
    }

    try {
        // Get child notes using Becca API
        const childNotes = note.getChildNotes();

        if (!childNotes || childNotes.length === 0) {
            return "No child notes.";
        }

        let context = `Child notes (${childNotes.length} total):\n`;

        // Limit the number of children included in context
        const limitedChildren = childNotes.slice(0, maxChildren);

        for (const childNote of limitedChildren) {
            context += `- ${childNote.title}\n`;

            // Optionally include a snippet of content
            if (includeContent) {
                try {
                    const content = String(await childNote.getContent() || "");

                    // Truncate and sanitize content
                    const truncatedContent = sanitizeHtmlContent(content)
                        .substring(0, 100)
                        .trim()
                        .replace(/\n/g, ' ');

                    if (truncatedContent) {
                        context += `  Summary: ${truncatedContent}${truncatedContent.length >= 100 ? '...' : ''}\n`;
                    }
                } catch (e) {
                    // Silently skip content errors
                }
            }
        }

        // Add note about truncation if needed
        if (childNotes.length > maxChildren) {
            context += `... and ${childNotes.length - maxChildren} more child notes not shown\n`;
        }

        return context;
    } catch (error) {
        console.error(`Error getting child context for ${noteId}:`, error);
        return "Error retrieving child notes.";
    }
}

/**
 * Get context from linked notes (relations)
 */
export async function getLinkedNotesContext(
    noteId: string,
    maxRelations: number = 10
): Promise<string> {
    const note = becca.getNote(noteId);

    if (!note) {
        return "";
    }

    try {
        // Get all relations using Becca API
        const relations = note.getRelations();

        if (!relations || relations.length === 0) {
            return "No linked notes.";
        }

        // Get incoming relations as well
        const incomingRelations = note.getTargetRelations();

        let context = "";

        // Handle outgoing relations
        if (relations.length > 0) {
            context += `Outgoing relations (${relations.length} total):\n`;

            // Limit the number of relations included in context
            const limitedRelations = relations.slice(0, maxRelations);

            for (const relation of limitedRelations) {
                const targetNote = becca.getNote(relation.value || "");
                if (targetNote) {
                    const relationName = relation.name || 'relates to';
                    context += `- ${relationName} → ${targetNote.title}\n`;
                }
            }

            // Add note about truncation if needed
            if (relations.length > maxRelations) {
                context += `... and ${relations.length - maxRelations} more outgoing relations not shown\n`;
            }
        }

        // Handle incoming relations
        if (incomingRelations && incomingRelations.length > 0) {
            if (context) context += "\n";

            context += `Incoming relations (${incomingRelations.length} total):\n`;

            // Limit the number of relations included in context
            const limitedIncoming = incomingRelations.slice(0, maxRelations);

            for (const relation of limitedIncoming) {
                const sourceNote = becca.getNote(relation.value || "");
                if (sourceNote) {
                    const relationName = relation.name || 'relates to';
                    context += `- ${sourceNote.title} → ${relationName}\n`;
                }
            }

            // Add note about truncation if needed
            if (incomingRelations.length > maxRelations) {
                context += `... and ${incomingRelations.length - maxRelations} more incoming relations not shown\n`;
            }
        }

        return context || "No linked notes.";
    } catch (error) {
        console.error(`Error getting linked notes context for ${noteId}:`, error);
        return "Error retrieving linked notes.";
    }
}
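A usage sketch for the hierarchy helpers (requires a live Becca cache, so it only runs inside the Trilium server; the note ID and limits are illustrative):

import { getParentContext, getChildContext, getLinkedNotesContext } from './hierarchy.js';

async function describeNeighborhood(noteId: string): Promise<string> {
    const parents = await getParentContext(noteId, 2 /* maxDepth */, 3 /* maxParents */);
    const children = await getChildContext(noteId, 5 /* maxChildren */, true /* include snippets */);
    const links = await getLinkedNotesContext(noteId, 5 /* maxRelations */);

    return [parents, children, links].join('\n');
}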
src/services/llm/context/index.ts (new file, 616 lines)
@@ -0,0 +1,616 @@
/**
 * Context extraction module for LLM features
 * Provides methods to extract relevant context from notes for LLM processing
 */

import becca from '../../../becca/becca.js';
import { getNoteContent, formatNoteContent, sanitizeHtmlContent } from './note_content.js';
import { detectLanguage, extractCodeStructure } from './code_handlers.js';
import { chunkContent, semanticChunking } from './chunking.js';
import type { ContentChunk, ChunkOptions } from './chunking.js';
import { summarizeContent, extractKeyPoints } from './summarization.js';
import { getParentNotes, getParentContext, getChildContext, getLinkedNotesContext } from './hierarchy.js';
import { getSemanticContext } from './semantic_context.js';

/**
 * Options for context extraction
 */
export interface ContextOptions {
    /**
     * Include parent context
     */
    includeParents?: boolean;

    /**
     * Include child notes in context
     */
    includeChildren?: boolean;

    /**
     * Include linked notes in context
     */
    includeLinks?: boolean;

    /**
     * Include semantically similar notes
     */
    includeSimilar?: boolean;

    /**
     * Include note content in context
     */
    includeContent?: boolean;

    /**
     * Maximum depth for parent hierarchy
     */
    maxParentDepth?: number;

    /**
     * Maximum number of children to include
     */
    maxChildren?: number;

    /**
     * Maximum number of linked notes to include
     */
    maxLinks?: number;

    /**
     * Maximum number of similar notes to include
     */
    maxSimilarNotes?: number;

    /**
     * Maximum content length
     */
    maxContentLength?: number;
}

/**
 * Default options for context extraction
 */
const DEFAULT_CONTEXT_OPTIONS: Required<ContextOptions> = {
    includeParents: true,
    includeChildren: true,
    includeLinks: true,
    includeSimilar: false,
    includeContent: true,
    maxParentDepth: 3,
    maxChildren: 10,
    maxLinks: 10,
    maxSimilarNotes: 5,
    maxContentLength: 2000
};

/**
 * Context Extractor class
 * Handles extraction of context from notes for LLM processing
 */
export class ContextExtractor {
    /**
     * Get content of a note
     */
    static async getNoteContent(noteId: string): Promise<string | null> {
        return getNoteContent(noteId);
    }

    /**
     * Get content of a note - instance method
     */
    async getNoteContent(noteId: string): Promise<string | null> {
        return ContextExtractor.getNoteContent(noteId);
    }

    /**
     * Format note content based on its type
     */
    static formatNoteContent(content: string, type: string, mime: string, title: string): string {
        return formatNoteContent(content, type, mime, title);
    }

    /**
     * Format note content based on its type - instance method
     */
    formatNoteContent(content: string, type: string, mime: string, title: string): string {
        return ContextExtractor.formatNoteContent(content, type, mime, title);
    }

    /**
     * Sanitize HTML content to plain text
     */
    static sanitizeHtmlContent(html: string): string {
        return sanitizeHtmlContent(html);
    }

    /**
     * Sanitize HTML content to plain text - instance method
     */
    sanitizeHtmlContent(html: string): string {
        return ContextExtractor.sanitizeHtmlContent(html);
    }

    /**
     * Detect programming language from content
     */
    static detectLanguage(content: string, mime: string): string {
        return detectLanguage(content, mime);
    }

    /**
     * Detect programming language from content - instance method
     */
    detectLanguage(content: string, mime: string): string {
        return ContextExtractor.detectLanguage(content, mime);
    }

    /**
     * Extract structure from code
     */
    static extractCodeStructure(content: string, language: string): string {
        return extractCodeStructure(content, language);
    }

    /**
     * Extract structure from code - instance method
     */
    extractCodeStructure(content: string, language: string): string {
        return ContextExtractor.extractCodeStructure(content, language);
    }

    /**
     * Chunk content into smaller pieces
     */
    static chunkContent(
        content: string,
        title: string = '',
        noteId: string = '',
        options: ChunkOptions = {}
    ): ContentChunk[] {
        return chunkContent(content, title, noteId, options);
    }

    /**
     * Chunk content into smaller pieces - instance method
     */
    chunkContent(
        content: string,
        title: string = '',
        noteId: string = '',
        options: ChunkOptions = {}
    ): ContentChunk[] {
        return ContextExtractor.chunkContent(content, title, noteId, options);
    }

    /**
     * Smarter chunking that respects semantic boundaries
     */
    static semanticChunking(
        content: string,
        title: string = '',
        noteId: string = '',
        options: ChunkOptions = {}
    ): ContentChunk[] {
        return semanticChunking(content, title, noteId, options);
    }

    /**
     * Smarter chunking that respects semantic boundaries - instance method
     */
    semanticChunking(
        content: string,
        title: string = '',
        noteId: string = '',
        options: ChunkOptions = {}
    ): ContentChunk[] {
        return ContextExtractor.semanticChunking(content, title, noteId, options);
    }

    /**
     * Summarize content
     */
    static summarizeContent(
        content: string,
        title: string = ''
    ): string {
        return summarizeContent(content, title);
    }

    /**
     * Summarize content - instance method
     */
    summarizeContent(
        content: string,
        title: string = ''
    ): string {
        return ContextExtractor.summarizeContent(content, title);
    }

    /**
     * Extract key points from content
     */
    static extractKeyPoints(
        content: string,
        maxPoints: number = 5
    ): string[] {
        return extractKeyPoints(content, maxPoints);
    }

    /**
     * Extract key points from content - instance method
     */
    extractKeyPoints(
        content: string,
        maxPoints: number = 5
    ): string[] {
        return ContextExtractor.extractKeyPoints(content, maxPoints);
    }

    /**
     * Get parent notes
     */
    static async getParentNotes(
        noteId: string,
        maxParents: number = 5
    ): Promise<{id: string, title: string}[]> {
        return getParentNotes(noteId, maxParents);
    }

    /**
     * Get parent notes - instance method
     */
    async getParentNotes(
        noteId: string,
        maxParents: number = 5
    ): Promise<{id: string, title: string}[]> {
        return ContextExtractor.getParentNotes(noteId, maxParents);
    }

    /**
     * Get hierarchical parent context
     */
    static async getParentContext(
        noteId: string,
        maxDepth: number = 3,
        maxParents: number = 3
    ): Promise<string> {
        return getParentContext(noteId, maxDepth, maxParents);
    }

    /**
     * Get hierarchical parent context - instance method
     */
    async getParentContext(
        noteId: string,
        maxDepth: number = 3,
        maxParents: number = 3
    ): Promise<string> {
        return ContextExtractor.getParentContext(noteId, maxDepth, maxParents);
    }

    /**
     * Get child context
     */
    static async getChildContext(
        noteId: string,
        maxChildren: number = 10,
        includeContent: boolean = false
    ): Promise<string> {
        return getChildContext(noteId, maxChildren, includeContent);
    }

    /**
     * Get child context - instance method
     */
    async getChildContext(
        noteId: string,
        maxChildren: number = 10,
        includeContent: boolean = false
    ): Promise<string> {
        return ContextExtractor.getChildContext(noteId, maxChildren, includeContent);
    }

    /**
     * Get linked notes context
     */
    static async getLinkedNotesContext(
        noteId: string,
        maxRelations: number = 10
    ): Promise<string> {
        return getLinkedNotesContext(noteId, maxRelations);
    }

    /**
     * Get linked notes context - instance method
     */
    async getLinkedNotesContext(
        noteId: string,
        maxRelations: number = 10
    ): Promise<string> {
        return ContextExtractor.getLinkedNotesContext(noteId, maxRelations);
    }

    /**
     * Get semantic context
     */
    static async getSemanticContext(
        noteId: string,
        maxSimilarNotesOrQuery: number | string = 5
    ): Promise<string> {
        // Handle both the new (number) and old (string query) parameter types
        if (typeof maxSimilarNotesOrQuery === 'string') {
            // Old API: The second parameter was a query string
            // For backward compatibility, we'll still accept this
            return getSemanticContext(noteId, { maxSimilarNotes: 5 });
        } else {
            // New API: The second parameter is maxSimilarNotes
            return getSemanticContext(noteId, { maxSimilarNotes: maxSimilarNotesOrQuery });
        }
    }

    /**
     * Get semantic context - instance method
     */
    async getSemanticContext(
        noteId: string,
        maxSimilarNotesOrQuery: number | string = 5
    ): Promise<string> {
        return ContextExtractor.getSemanticContext(noteId, maxSimilarNotesOrQuery);
    }

    /**
     * Extract full context for a note
     * This combines various context sources based on provided options
     */
    static async extractContext(
        noteId: string,
        options: ContextOptions = {}
    ): Promise<string> {
        const config: Required<ContextOptions> = { ...DEFAULT_CONTEXT_OPTIONS, ...options };
        const note = becca.getNote(noteId);

        if (!note) {
            return "Note not found.";
        }

        let context = `# Context for note: ${note.title}\n\n`;

        // Include parent context
        if (config.includeParents) {
            const parentContext = await ContextExtractor.getParentContext(
                noteId,
                config.maxParentDepth,
                3 // Default to 3 parents per level
            );

            if (parentContext) {
                context += `## Parent Hierarchy\n${parentContext}\n\n`;
            }
        }

        // Include note content
        if (config.includeContent) {
            const content = await ContextExtractor.getNoteContent(noteId);

            if (content) {
                // If content is too large, summarize it
                let contentSection = '';

                if (content.length > config.maxContentLength) {
                    contentSection = ContextExtractor.summarizeContent(content, note.title);
                    contentSection += "\n\n[Content summarized due to length]";
                } else {
                    contentSection = content;
                }

                context += `## Note Content\n${contentSection}\n\n`;
            }
        }

        // Include child context
        if (config.includeChildren) {
            const childContext = await ContextExtractor.getChildContext(
                noteId,
                config.maxChildren,
                false // Don't include child content by default
            );

            if (childContext && childContext !== "No child notes.") {
                context += `## Child Notes\n${childContext}\n\n`;
            }
        }

        // Include linked notes
        if (config.includeLinks) {
            const linkedContext = await ContextExtractor.getLinkedNotesContext(
                noteId,
                config.maxLinks
            );

            if (linkedContext && linkedContext !== "No linked notes.") {
                context += `## Linked Notes\n${linkedContext}\n\n`;
            }
        }

        // Include semantically similar notes
        if (config.includeSimilar) {
            const semanticContext = await ContextExtractor.getSemanticContext(
                noteId,
                config.maxSimilarNotes
            );

            if (semanticContext && !semanticContext.includes("No semantically similar notes found.")) {
                context += `## Similar Notes\n${semanticContext}\n\n`;
            }
        }

        return context;
    }

    /**
     * Extract full context for a note - instance method
     */
    async extractContext(
        noteId: string,
        options: ContextOptions = {}
    ): Promise<string> {
        return ContextExtractor.extractContext(noteId, options);
    }

    /**
     * Get progressively loaded context based on depth level
     * This provides different levels of context detail depending on the depth parameter
     *
     * @param noteId - The ID of the note to get context for
     * @param depth - Depth level (1-4) determining how much context to include
     * @returns Context appropriate for the requested depth
     */
    static async getProgressiveContext(noteId: string, depth = 1): Promise<string> {
        try {
            // This requires the semantic context service to be available
            // We're using a dynamic import to avoid circular dependencies
            const { default: aiServiceManager } = await import('../ai_service_manager.js');
            const semanticContext = aiServiceManager.getInstance().getSemanticContextService();

            if (!semanticContext) {
                return ContextExtractor.extractContext(noteId);
            }

            return await semanticContext.getProgressiveContext(noteId, depth);
        } catch (error) {
            // Fall back to regular context if progressive loading fails
            console.error('Error in progressive context loading:', error);
            return ContextExtractor.extractContext(noteId);
        }
    }

    /**
     * Get progressively loaded context based on depth level - instance method
     */
    async getProgressiveContext(noteId: string, depth = 1): Promise<string> {
        return ContextExtractor.getProgressiveContext(noteId, depth);
    }

    /**
     * Get smart context based on the query complexity
     * This automatically selects the appropriate context depth and relevance
     *
     * @param noteId - The ID of the note to get context for
     * @param query - The user's query for semantic relevance matching
     * @returns The optimal context for answering the query
     */
    static async getSmartContext(noteId: string, query: string): Promise<string> {
        try {
            // This requires the semantic context service to be available
            // We're using a dynamic import to avoid circular dependencies
            const { default: aiServiceManager } = await import('../ai_service_manager.js');
            const semanticContext = aiServiceManager.getInstance().getSemanticContextService();

            if (!semanticContext) {
                return ContextExtractor.extractContext(noteId);
            }

            return await semanticContext.getSmartContext(noteId, query);
        } catch (error) {
            // Fall back to regular context if smart context fails
            console.error('Error in smart context selection:', error);
            return ContextExtractor.extractContext(noteId);
        }
    }

    /**
     * Get smart context based on the query complexity - instance method
     */
    async getSmartContext(noteId: string, query: string): Promise<string> {
        return ContextExtractor.getSmartContext(noteId, query);
    }

    /**
     * Get the full context for a note, including parent hierarchy, content, and children
     * Legacy method for backwards compatibility
     */
    static async getFullContext(noteId: string): Promise<string> {
        // Use extractContext with default options
        return ContextExtractor.extractContext(noteId);
    }

    /**
     * Get the full context for a note - instance method
     */
    async getFullContext(noteId: string): Promise<string> {
        return ContextExtractor.getFullContext(noteId);
    }

    /**
     * Get note summary - for backward compatibility
     */
    static async getNoteSummary(noteId: string, maxLength = 5000): Promise<string> {
        const note = becca.getNote(noteId);
        if (!note) return '';

        const content = await getNoteContent(noteId);
        if (!content || content.length < maxLength) return content || '';

        // For larger content, generate a summary
        return summarizeContent(content, note.title);
    }

    /**
     * Get note summary - instance method
     */
    async getNoteSummary(noteId: string, maxLength = 5000): Promise<string> {
        return ContextExtractor.getNoteSummary(noteId, maxLength);
    }

    /**
     * Split a large note into smaller, semantically meaningful chunks
     * This is useful for handling large notes that exceed the context window of LLMs
     * For backward compatibility
     */
    static async getChunkedNoteContent(noteId: string, maxChunkSize = 2000): Promise<string[]> {
        const content = await getNoteContent(noteId);
        if (!content) return [];

        // Use the new chunking functionality
        const chunks = chunkContent(
            content,
            '',
            noteId,
            { maxChunkSize, respectBoundaries: true }
        );

        // Convert to the old API format which was an array of strings
        return chunks.map(chunk => chunk.content);
    }

    /**
     * Split a large note into smaller chunks - instance method
     */
    async getChunkedNoteContent(noteId: string, maxChunkSize = 2000): Promise<string[]> {
        return ContextExtractor.getChunkedNoteContent(noteId, maxChunkSize);
    }
}

// Export all modules
export {
    getNoteContent,
    formatNoteContent,
    sanitizeHtmlContent,
    detectLanguage,
    extractCodeStructure,
    chunkContent,
    semanticChunking,
    summarizeContent,
    extractKeyPoints,
    getParentNotes,
    getParentContext,
    getChildContext,
    getLinkedNotesContext,
    getSemanticContext
};

// Export types
export type {
    ContentChunk,
    ChunkOptions
};
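A usage sketch of the facade (option values are illustrative overrides of the defaults above; every method exists both statically and on instances, so either calling style works):

import { ContextExtractor } from './context/index.js';

const extractor = new ContextExtractor();

async function buildContext(noteId: string): Promise<string> {
    // Markdown-sectioned context string: parent hierarchy, content, children, links.
    return extractor.extractContext(noteId, {
        includeSimilar: true, // off by default
        maxChildren: 5,
        maxContentLength: 1000
    });
}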
src/services/llm/context/note_content.ts (new file, 223 lines)
@@ -0,0 +1,223 @@
|
||||
import sanitizeHtml from 'sanitize-html';
|
||||
import becca from '../../../becca/becca.js';
|
||||
|
||||
/**
|
||||
* Get the content of a note
|
||||
*/
|
||||
export async function getNoteContent(noteId: string): Promise<string | null> {
|
||||
// Use Becca API to get note data
|
||||
const note = becca.getNote(noteId);
|
||||
|
||||
if (!note) {
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
// Get content using Becca API
|
||||
const content = String(await note.getContent() || "");
|
||||
|
||||
return formatNoteContent(
|
||||
content,
|
||||
note.type,
|
||||
note.mime,
|
||||
note.title
|
||||
);
|
||||
} catch (error) {
|
||||
console.error(`Error getting content for note ${noteId}:`, error);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Format the content of a note based on its type
|
||||
* Enhanced with better handling for large and specialized content types
|
||||
*/
|
||||
export function formatNoteContent(content: string, type: string, mime: string, title: string): string {
|
||||
let formattedContent = `# ${title}\n\n`;
|
||||
|
||||
switch (type) {
|
||||
case 'text':
|
||||
// Remove HTML formatting for text notes
|
||||
formattedContent += sanitizeHtml(content);
|
||||
break;
|
||||
|
||||
case 'code':
|
||||
// For code, we'll handle this in code_handlers.ts
|
||||
// Just use basic formatting here
|
||||
formattedContent += '```\n' + content + '\n```';
|
||||
break;
|
||||
|
||||
case 'canvas':
|
||||
if (mime === 'application/json') {
|
||||
try {
|
||||
// Parse JSON content
|
||||
const jsonContent = JSON.parse(content);
|
||||
                    // Extract text elements from canvas
                    if (jsonContent.elements && Array.isArray(jsonContent.elements)) {
                        const texts = jsonContent.elements
                            .filter((element: any) => element.type === 'text' && element.text)
                            .map((element: any) => element.text);

                        formattedContent += 'Canvas content:\n' + texts.join('\n');
                    } else {
                        formattedContent += '[Empty canvas]';
                    }
                }
                catch (e: any) {
                    formattedContent += `[Error parsing canvas content: ${e.message}]`;
                }
            } else {
                formattedContent += '[Canvas content]';
            }
            break;

        case 'mindMap':
            if (mime === 'application/json') {
                try {
                    // Parse JSON content
                    const jsonContent = JSON.parse(content);

                    // Extract node text from mind map
                    const extractMindMapNodes = (node: any): string[] => {
                        let texts: string[] = [];
                        if (node.text) {
                            texts.push(node.text);
                        }
                        if (node.children && Array.isArray(node.children)) {
                            for (const child of node.children) {
                                texts = texts.concat(extractMindMapNodes(child));
                            }
                        }
                        return texts;
                    };

                    if (jsonContent.root) {
                        formattedContent += 'Mind map content:\n' + extractMindMapNodes(jsonContent.root).join('\n');
                    } else {
                        formattedContent += '[Empty mind map]';
                    }
                }
                catch (e: any) {
                    formattedContent += `[Error parsing mind map content: ${e.message}]`;
                }
            } else {
                formattedContent += '[Mind map content]';
            }
            break;

        case 'relationMap':
            if (mime === 'application/json') {
                try {
                    // Parse JSON content
                    const jsonContent = JSON.parse(content);

                    // Extract relation map entities and connections
                    let result = 'Relation map content:\n';

                    if (jsonContent.notes && Array.isArray(jsonContent.notes)) {
                        result += 'Notes: ' + jsonContent.notes
                            .map((note: any) => note.title || note.name)
                            .filter(Boolean)
                            .join(', ') + '\n';
                    }

                    if (jsonContent.relations && Array.isArray(jsonContent.relations)) {
                        result += 'Relations: ' + jsonContent.relations
                            .map((rel: any) => {
                                const sourceNote = jsonContent.notes.find((n: any) => n.noteId === rel.sourceNoteId);
                                const targetNote = jsonContent.notes.find((n: any) => n.noteId === rel.targetNoteId);
                                const source = sourceNote ? (sourceNote.title || sourceNote.name) : 'unknown';
                                const target = targetNote ? (targetNote.title || targetNote.name) : 'unknown';
                                return `${source} → ${rel.name || ''} → ${target}`;
                            })
                            .join('; ');
                    }

                    formattedContent += result;
                }
                catch (e: any) {
                    formattedContent += `[Error parsing relation map content: ${e.message}]`;
                }
            } else {
                formattedContent += '[Relation map content]';
            }
            break;

        case 'geoMap':
            if (mime === 'application/json') {
                try {
                    // Parse JSON content
                    const jsonContent = JSON.parse(content);

                    let result = 'Geographic map content:\n';

                    if (jsonContent.markers && Array.isArray(jsonContent.markers)) {
                        if (jsonContent.markers.length > 0) {
                            result += jsonContent.markers
                                .map((marker: any) => {
                                    return `Location: ${marker.title || ''} (${marker.lat}, ${marker.lng})${marker.description ? ' - ' + marker.description : ''}`;
                                })
                                .join('\n');
                        } else {
                            result += 'Empty geographic map';
                        }
                    } else {
                        result += 'Empty geographic map';
                    }

                    formattedContent += result;
                }
                catch (e: any) {
                    formattedContent += `[Error parsing geographic map content: ${e.message}]`;
                }
            } else {
                formattedContent += '[Geographic map content]';
            }
            break;

        case 'mermaid':
            // Format mermaid diagrams as code blocks
            formattedContent += '```mermaid\n' + content + '\n```';
            break;

        case 'image':
        case 'file':
            formattedContent += `[${type} attachment]`;
            break;

        default:
            // For other notes, just use the content as is
            formattedContent += sanitizeHtml(content);
    }

    return formattedContent;
}

/**
 * Sanitize HTML content to plain text
 */
export function sanitizeHtmlContent(html: string): string {
    if (!html) return '';

    // Use sanitizeHtml to remove all HTML tags
    let content = sanitizeHtml(html, {
        allowedTags: [],
        allowedAttributes: {},
        textFilter: (text) => {
            // Replace multiple newlines with a single one
            return text.replace(/\n\s*\n/g, '\n\n');
        }
    });

    // Additional cleanup for any remaining HTML entities
    content = content
        .replace(/&nbsp;/g, ' ')
        .replace(/&lt;/g, '<')
        .replace(/&gt;/g, '>')
        .replace(/&amp;/g, '&')
        .replace(/&quot;/g, '"')
        .replace(/&#39;/g, "'");

    return content;
}
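
// Usage sketch (hypothetical input, not taken from this commit's callers):
//   sanitizeHtmlContent('<p>Tags &amp; entities</p>')
//   // => "Tags & entities"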

225
src/services/llm/context/semantic_context.ts
Normal file
@ -0,0 +1,225 @@
/**
 * Contains functions for semantic context extraction
 * Uses more intelligent methods to determine relevant context
 */

import { sanitizeHtmlContent } from './note_content.js';
import becca from '../../../becca/becca.js';
import { getNoteContent } from './note_content.js';

/**
 * Options for semantic context extraction
 */
export interface SemanticContextOptions {
    /**
     * Maximum number of similar notes to include
     */
    maxSimilarNotes?: number;

    /**
     * Whether to include note content snippets
     */
    includeContent?: boolean;

    /**
     * Maximum length of content snippets
     */
    snippetLength?: number;

    /**
     * Minimum similarity score (0-1) to include a note
     */
    minSimilarity?: number;
}

/**
 * Default options for semantic context extraction
 */
const DEFAULT_SEMANTIC_CONTEXT_OPTIONS: Required<SemanticContextOptions> = {
    maxSimilarNotes: 5,
    includeContent: true,
    snippetLength: 200,
    minSimilarity: 0.7
};

/**
 * Retrieve semantically similar notes to provide context
 * This is a simplified version without vector store integration
 * Use vector_store for actual semantic search
 */
export async function getSemanticContext(
    noteId: string,
    options: SemanticContextOptions = {}
): Promise<string> {
    // Merge provided options with defaults
    const config: Required<SemanticContextOptions> = {
        ...DEFAULT_SEMANTIC_CONTEXT_OPTIONS,
        ...options
    };

    try {
        // Get the current note
        const note = becca.getNote(noteId);

        if (!note) {
            return "Note not found.";
        }

        // Get note content for comparison
        const noteContent = await getNoteContent(noteId);

        if (!noteContent) {
            return "No content available for similarity comparison.";
        }

        // Get potential related notes (simplified method)
        // In real implementation, this would use vector_store.similarity methods
        const relatedNotes = await findRelatedNotes(noteId, noteContent, config);

        // Format the semantic context result
        let context = `Semantically related notes to "${note.title}":\n\n`;

        if (relatedNotes.length === 0) {
            context += "No semantically similar notes found.";
            return context;
        }

        // Add each related note to the context
        for (const relatedNote of relatedNotes) {
            context += `## ${relatedNote.title}\n`;

            if (config.includeContent && relatedNote.snippet) {
                context += `${relatedNote.snippet}\n\n`;
            }
        }

        return context;
    } catch (error) {
        console.error(`Error getting semantic context for ${noteId}:`, error);
        return "Error retrieving semantic context.";
    }
}
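
// Example call (sketch; 'abc123' is a hypothetical note ID):
//   const context = await getSemanticContext('abc123', { maxSimilarNotes: 3, minSimilarity: 0.5 });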

/**
 * Find related notes based on simple heuristics
 * This is a placeholder for semantic search that would normally use vector embeddings
 */
async function findRelatedNotes(
    noteId: string,
    noteContent: string,
    options: Required<SemanticContextOptions>
): Promise<{ id: string, title: string, snippet: string | null, score: number }[]> {
    const results: { id: string, title: string, snippet: string | null, score: number }[] = [];
    const note = becca.getNote(noteId);

    if (!note) {
        return results;
    }

    // 1. Check siblings (notes with the same parent)
    const parentBranches = note.getParentBranches();
    const processedNotes = new Set<string>();
    processedNotes.add(noteId); // Don't include the current note

    // Process parent branches to find siblings
    for (const branch of parentBranches) {
        if (!branch.parentNote) {
            continue;
        }

        const parentNote = branch.parentNote;
        const siblingNotes = parentNote.getChildNotes().filter(n => n.noteId !== noteId);

        for (const siblingNote of siblingNotes) {
            if (processedNotes.has(siblingNote.noteId)) {
                continue;
            }

            processedNotes.add(siblingNote.noteId);

            const siblingContent = await getNoteContent(siblingNote.noteId);
            if (!siblingContent) {
                continue;
            }

            // Calculate a very simple similarity score
            const score = calculateSimpleTextSimilarity(noteContent, siblingContent);

            if (score >= options.minSimilarity) {
                results.push({
                    id: siblingNote.noteId,
                    title: siblingNote.title,
                    snippet: siblingContent.substring(0, options.snippetLength) + '...',
                    score
                });
            }
        }
    }

    // 2. Check notes connected by relations
    const relations = note.getRelations();
    for (const relation of relations) {
        const targetNoteId = relation.value;

        if (!targetNoteId || processedNotes.has(targetNoteId)) {
            continue;
        }

        processedNotes.add(targetNoteId);

        const targetNote = becca.getNote(targetNoteId);
        if (!targetNote) {
            continue;
        }

        const targetContent = await getNoteContent(targetNoteId);
        if (!targetContent) {
            continue;
        }

        // Relations are already semantically connected, so give them a boost
        const score = calculateSimpleTextSimilarity(noteContent, targetContent) + 0.2;

        results.push({
            id: targetNoteId,
            title: targetNote.title,
            snippet: targetContent.substring(0, options.snippetLength) + '...',
            score: Math.min(score, 1.0) // Cap at 1.0
        });
    }

    // Sort by similarity score (highest first) and limit
    return results
        .sort((a, b) => b.score - a.score)
        .slice(0, options.maxSimilarNotes);
}

/**
 * Calculate a simple text similarity based on shared words
 * This is a very basic implementation and should be replaced with actual embedding similarity
 */
function calculateSimpleTextSimilarity(text1: string, text2: string): number {
    // Clean and tokenize the texts
    const cleanText1 = sanitizeHtmlContent(text1).toLowerCase();
    const cleanText2 = sanitizeHtmlContent(text2).toLowerCase();

    // Get unique words (case insensitive)
    const words1 = new Set(cleanText1.split(/\W+/).filter(w => w.length > 3));
    const words2 = new Set(cleanText2.split(/\W+/).filter(w => w.length > 3));

    // No meaningful comparison possible if either text has no significant words
    if (words1.size === 0 || words2.size === 0) {
        return 0;
    }

    // Count shared words
    let sharedCount = 0;
    for (const word of words1) {
        if (words2.has(word)) {
            sharedCount++;
        }
    }

    // Jaccard similarity: intersection size / union size
    return sharedCount / (words1.size + words2.size - sharedCount);
}
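
// Worked example (hypothetical inputs): with words1 = {note, search, index}
// and words2 = {note, index, cache}, sharedCount is 2 and the union size is
// 3 + 3 - 2 = 4, so the similarity is 2 / 4 = 0.5.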

162
src/services/llm/context/summarization.ts
Normal file
@ -0,0 +1,162 @@
/**
 * Contains functions for generating summaries of note content
 * Used to provide concise context for LLM processing
 */

import { sanitizeHtmlContent } from './note_content.js';

/**
 * Options for summarization
 */
export interface SummarizationOptions {
    /**
     * Maximum length of the summary in characters
     */
    maxLength?: number;

    /**
     * Whether to include title in the summary
     */
    includeTitle?: boolean;

    /**
     * Minimum content length to trigger summarization
     */
    minContentLengthForSummarization?: number;
}

/**
 * Default summarization options
 */
const DEFAULT_SUMMARIZATION_OPTIONS: Required<SummarizationOptions> = {
    maxLength: 500,
    includeTitle: true,
    minContentLengthForSummarization: 1000
};

/**
 * Summarize note content
 * If the content is smaller than minContentLengthForSummarization, returns trimmed content
 * This is a local implementation that doesn't require API calls
 */
export function summarizeContent(
    content: string,
    title: string = '',
    options: SummarizationOptions = {}
): string {
    // Merge provided options with defaults
    const config: Required<SummarizationOptions> = {
        ...DEFAULT_SUMMARIZATION_OPTIONS,
        ...options
    };

    // Clean up the content
    const cleanedContent = sanitizeHtmlContent(content);

    // If content is small enough, no need to summarize
    if (cleanedContent.length < config.minContentLengthForSummarization) {
        // Just truncate if needed
        if (cleanedContent.length > config.maxLength) {
            return cleanedContent.substring(0, config.maxLength) + '...';
        }
        return cleanedContent;
    }

    // Use local summarization
    return generateLocalSummary(cleanedContent, config);
}

/**
 * Generate a simple summary locally without using LLM API
 */
function generateLocalSummary(content: string, options: Required<SummarizationOptions>): string {
    // Simple heuristic approach - extract first paragraph and some key sentences

    // First, try to get the first paragraph that has reasonable length
    const paragraphs = content.split(/\n\s*\n/);
    let summary = '';

    for (const paragraph of paragraphs) {
        if (paragraph.length > 30 && !paragraph.startsWith('#') && !paragraph.startsWith('!')) {
            summary = paragraph;
            break;
        }
    }

    // If no good paragraph found, use the first X characters
    if (!summary) {
        summary = content.substring(0, options.maxLength * 0.8);
    }

    // Truncate if too long
    if (summary.length > options.maxLength) {
        summary = summary.substring(0, options.maxLength) + '...';
    }

    return summary;
}

/**
 * Extract key points from content
 * Returns a bulleted list of key points
 * This is a local implementation that doesn't require API calls
 */
export function extractKeyPoints(
    content: string,
    maxPoints: number = 5
): string[] {
    // Clean up the content
    const cleanedContent = sanitizeHtmlContent(content);

    // Use local extraction
    return generateLocalKeyPoints(cleanedContent, maxPoints);
}

/**
 * Generate key points locally without using LLM API
 */
function generateLocalKeyPoints(content: string, maxPoints: number): string[] {
    // Simple approach - look for sentences that might contain key information
    const sentences = content
        .replace(/\n+/g, ' ')
        .split(/[.!?]/)
        .map(s => s.trim())
        .filter(s => s.length > 20);

    // Heuristics for important sentences - look for indicator phrases
    const importanceMarkers = [
        'important', 'key', 'significant', 'essential', 'critical',
        'main', 'primary', 'crucial', 'vital', 'fundamental',
        'in summary', 'to summarize', 'in conclusion', 'conclude',
        'therefore', 'thus', 'consequently', 'as a result'
    ];

    // Score sentences based on potential importance
    const scoredSentences = sentences.map(sentence => {
        let score = 0;

        // Sentences at the beginning or end are often important
        if (sentences.indexOf(sentence) < sentences.length * 0.1) score += 3;
        if (sentences.indexOf(sentence) > sentences.length * 0.9) score += 4;

        // Check for importance markers
        for (const marker of importanceMarkers) {
            if (sentence.toLowerCase().includes(marker)) {
                score += 2;
            }
        }

        // Prefer medium-length sentences
        if (sentence.length > 40 && sentence.length < 150) score += 2;

        return { sentence, score };
    });

    // Sort by score and take top N
    const topSentences = scoredSentences
        .sort((a, b) => b.score - a.score)
        .slice(0, maxPoints)
        .map(item => item.sentence + '.');

    return topSentences;
}
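
// Usage sketch (`longText` is a hypothetical string of note content):
//   const summary = summarizeContent(longText, 'Project plan', { maxLength: 300 });
//   const points = extractKeyPoints(longText, 3); // up to 3 high-scoring sentences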
@ -1,871 +0,0 @@
import sql from '../sql.js';
import sanitizeHtml from 'sanitize-html';
import becca from '../../becca/becca.js';

/**
 * Utility class for extracting context from notes to provide to AI models
 * Enhanced with advanced capabilities for handling large notes and specialized content
 */
export class ContextExtractor {
    /**
     * Get the content of a note
     */
    async getNoteContent(noteId: string): Promise<string | null> {
        // Use Becca API to get note data
        const note = becca.getNote(noteId);

        if (!note) {
            return null;
        }

        try {
            // Get content using Becca API
            const content = String(await note.getContent() || "");

            return this.formatNoteContent(
                content,
                note.type,
                note.mime,
                note.title
            );
        } catch (error) {
            console.error(`Error getting content for note ${noteId}:`, error);
            return null;
        }
    }

    /**
     * Split a large note into smaller, semantically meaningful chunks
     * This is useful for handling large notes that exceed the context window of LLMs
     *
     * @param noteId - The ID of the note to chunk
     * @param maxChunkSize - Maximum size of each chunk in characters
     * @returns Array of content chunks, or empty array if note not found
     */
    async getChunkedNoteContent(noteId: string, maxChunkSize = 2000): Promise<string[]> {
        const content = await this.getNoteContent(noteId);
        if (!content) return [];

        // Split into semantic chunks (paragraphs, sections, etc.)
        return this.splitContentIntoChunks(content, maxChunkSize);
    }

    /**
     * Split text content into semantically meaningful chunks based on natural boundaries
     * like paragraphs, headings, and code blocks
     *
     * @param content - The text content to split
     * @param maxChunkSize - Maximum size of each chunk in characters
     * @returns Array of content chunks
     */
    private splitContentIntoChunks(content: string, maxChunkSize: number): string[] {
        // Look for semantic boundaries (headings, blank lines, etc.)
        const headingPattern = /^(#+)\s+(.+)$/gm;
        const codeBlockPattern = /```[\s\S]+?```/gm;

        // Replace code blocks with placeholders to avoid splitting inside them
        const codeBlocks: string[] = [];
        let contentWithPlaceholders = content.replace(codeBlockPattern, (match) => {
            const placeholder = `__CODE_BLOCK_${codeBlocks.length}__`;
            codeBlocks.push(match);
            return placeholder;
        });

        // Split content at headings and paragraphs
        const sections: string[] = [];
        let currentSection = '';

        // First split by headings
        const lines = contentWithPlaceholders.split('\n');
        for (const line of lines) {
            const isHeading = headingPattern.test(line);
            headingPattern.lastIndex = 0; // Reset regex

            // If this is a heading and we already have content, start a new section
            if (isHeading && currentSection.trim().length > 0) {
                sections.push(currentSection.trim());
                currentSection = line;
            } else {
                currentSection += (currentSection ? '\n' : '') + line;
            }
        }

        // Add the last section if there's any content
        if (currentSection.trim().length > 0) {
            sections.push(currentSection.trim());
        }

        // Now combine smaller sections to respect maxChunkSize
        const chunks: string[] = [];
        let currentChunk = '';

        for (const section of sections) {
            // If adding this section exceeds maxChunkSize and we already have content,
            // finalize the current chunk and start a new one
            if ((currentChunk + section).length > maxChunkSize && currentChunk.length > 0) {
                chunks.push(currentChunk);
                currentChunk = section;
            } else {
                currentChunk += (currentChunk ? '\n\n' : '') + section;
            }
        }

        // Add the last chunk if there's any content
        if (currentChunk.length > 0) {
            chunks.push(currentChunk);
        }

        // Restore code blocks in all chunks
        return chunks.map(chunk => {
            return chunk.replace(/__CODE_BLOCK_(\d+)__/g, (_, index) => {
                return codeBlocks[parseInt(index)];
            });
        });
    }
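
    // Illustration (hypothetical input): a 5000-character markdown note with two
    // "## ..." headings and maxChunkSize = 2000 ends up as roughly three chunks,
    // each starting at a heading boundary, with fenced code blocks kept intact
    // via the placeholder round-trip above.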

    /**
     * Generate a summary of a note's content
     * Useful for providing a condensed version of very large notes
     *
     * @param noteId - The ID of the note to summarize
     * @param maxLength - Cut-off length to trigger summarization
     * @returns Summary of the note or the original content if small enough
     */
    async getNoteSummary(noteId: string, maxLength = 5000): Promise<string> {
        const content = await this.getNoteContent(noteId);
        if (!content || content.length < maxLength) return content || '';

        // For larger content, generate a summary
        return this.summarizeContent(content);
    }

    /**
     * Summarize content by extracting key information
     * This uses a heuristic approach to find important sentences and paragraphs
     *
     * @param content - The content to summarize
     * @returns A summarized version of the content
     */
    private summarizeContent(content: string): string {
        // Extract title/heading if present
        const titleMatch = content.match(/^# (.+)$/m);
        const title = titleMatch ? titleMatch[1] : 'Untitled Note';

        // Extract all headings for an outline
        const headings: string[] = [];
        const headingMatches = content.matchAll(/^(#+)\s+(.+)$/gm);
        for (const match of headingMatches) {
            const level = match[1].length;
            const text = match[2];
            headings.push(`${' '.repeat(level-1)}- ${text}`);
        }

        // Extract first sentence of each paragraph for a summary
        const paragraphs = content.split(/\n\s*\n/);
        const firstSentences = paragraphs
            .filter(p => p.trim().length > 0 && !p.trim().startsWith('#') && !p.trim().startsWith('```'))
            .map(p => {
                const sentenceMatch = p.match(/^[^.!?]+[.!?]/);
                return sentenceMatch ? sentenceMatch[0].trim() : p.substring(0, Math.min(150, p.length)).trim() + '...';
            })
            .slice(0, 5); // Limit to 5 sentences

        // Create the summary
        let summary = `# Summary of: ${title}\n\n`;

        if (headings.length > 0) {
            summary += `## Document Outline\n${headings.join('\n')}\n\n`;
        }

        if (firstSentences.length > 0) {
            summary += `## Key Points\n${firstSentences.map(s => `- ${s}`).join('\n')}\n\n`;
        }

        summary += `(Note: This is an automatically generated summary of a larger document with ${content.length} characters)`;

        return summary;
    }

    /**
     * Get a set of parent notes to provide hierarchical context
     */
    async getParentContext(noteId: string, maxDepth = 3): Promise<string> {
        // Note: getParentNotes has already been updated to use Becca
        const parents = await this.getParentNotes(noteId, maxDepth);
        if (!parents.length) return '';

        let context = 'Here is the hierarchical context for the current note:\n\n';

        // Create a hierarchical view of the parents using indentation
        // to show the proper parent-child relationship
        let indentLevel = 0;
        for (let i = 0; i < parents.length; i++) {
            const parent = parents[i];
            const indent = ' '.repeat(indentLevel);
            context += `${indent}- ${parent.title}\n`;
            indentLevel++;
        }

        // Now add the current note with proper indentation
        const note = becca.getNote(noteId);
        if (note) {
            const indent = ' '.repeat(indentLevel);
            context += `${indent}- ${note.title} (current note)\n`;
        }

        return context + '\n';
    }

    /**
     * Get child notes to provide additional context
     */
    async getChildContext(noteId: string, maxChildren = 5): Promise<string> {
        const note = becca.getNote(noteId);

        if (!note) {
            return '';
        }

        // Use Becca API to get child notes
        const childNotes = note.getChildNotes();

        if (!childNotes || childNotes.length === 0) {
            return '';
        }

        let context = 'The current note has these child notes:\n\n';

        // Limit to maxChildren
        const childrenToShow = childNotes.slice(0, maxChildren);

        for (const child of childrenToShow) {
            context += `- ${child.title}\n`;
        }

        // If there are more children than we're showing, indicate that
        if (childNotes.length > maxChildren) {
            context += `\n(+ ${childNotes.length - maxChildren} more child notes)\n`;
        }

        return context + '\n';
    }

    /**
     * Get notes linked to this note
     */
    async getLinkedNotesContext(noteId: string, maxLinks = 5): Promise<string> {
        const note = becca.getNote(noteId);

        if (!note) {
            return '';
        }

        // Use Becca API to get relations
        const relations = note.getRelations();

        if (!relations || relations.length === 0) {
            return '';
        }

        // Get the target notes from relations
        const linkedNotes = relations
            .map(relation => relation.targetNote)
            .filter(note => note !== null && note !== undefined);

        if (linkedNotes.length === 0) {
            return '';
        }

        let context = 'This note has relationships with these notes:\n\n';

        // Limit to maxLinks
        const notesToShow = linkedNotes.slice(0, maxLinks);

        for (const linked of notesToShow) {
            context += `- ${linked.title}\n`;
        }

        // If there are more linked notes than we're showing, indicate that
        if (linkedNotes.length > maxLinks) {
            context += `\n(+ ${linkedNotes.length - maxLinks} more linked notes)\n`;
        }

        return context + '\n';
    }

    /**
     * Format the content of a note based on its type
     * Enhanced with better handling for large and specialized content types
     */
    private formatNoteContent(content: string, type: string, mime: string, title: string): string {
        let formattedContent = `# ${title}\n\n`;

        switch (type) {
            case 'text':
                // Remove HTML formatting for text notes
                formattedContent += this.sanitizeHtml(content);
                break;

            case 'code':
                // Improved code handling with language detection
                const codeLanguage = this.detectCodeLanguage(content, mime);

                // For large code files, extract structure rather than full content
                if (content.length > 8000) {
                    formattedContent += this.extractCodeStructure(content, codeLanguage);
                } else {
                    formattedContent += `\`\`\`${codeLanguage}\n${content}\n\`\`\``;
                }
                break;

            case 'canvas':
                if (mime === 'application/json') {
                    try {
                        // Parse JSON content
                        const jsonContent = JSON.parse(content);

                        // Extract text elements from canvas
                        if (jsonContent.elements && Array.isArray(jsonContent.elements)) {
                            const texts = jsonContent.elements
                                .filter((element: any) => element.type === 'text' && element.text)
                                .map((element: any) => element.text);

                            formattedContent += 'Canvas content:\n' + texts.join('\n');
                        } else {
                            formattedContent += '[Empty canvas]';
                        }
                    }
                    catch (e: any) {
                        formattedContent += `[Error parsing canvas content: ${e.message}]`;
                    }
                } else {
                    formattedContent += '[Canvas content]';
                }
                break;

            case 'mindMap':
                if (mime === 'application/json') {
                    try {
                        // Parse JSON content
                        const jsonContent = JSON.parse(content);

                        // Extract node text from mind map
                        const extractMindMapNodes = (node: any): string[] => {
                            let texts: string[] = [];
                            if (node.text) {
                                texts.push(node.text);
                            }
                            if (node.children && Array.isArray(node.children)) {
                                for (const child of node.children) {
                                    texts = texts.concat(extractMindMapNodes(child));
                                }
                            }
                            return texts;
                        };

                        if (jsonContent.root) {
                            formattedContent += 'Mind map content:\n' + extractMindMapNodes(jsonContent.root).join('\n');
                        } else {
                            formattedContent += '[Empty mind map]';
                        }
                    }
                    catch (e: any) {
                        formattedContent += `[Error parsing mind map content: ${e.message}]`;
                    }
                } else {
                    formattedContent += '[Mind map content]';
                }
                break;

            case 'relationMap':
                if (mime === 'application/json') {
                    try {
                        // Parse JSON content
                        const jsonContent = JSON.parse(content);

                        // Extract relation map entities and connections
                        let result = 'Relation map content:\n';

                        if (jsonContent.notes && Array.isArray(jsonContent.notes)) {
                            result += 'Notes: ' + jsonContent.notes
                                .map((note: any) => note.title || note.name)
                                .filter(Boolean)
                                .join(', ') + '\n';
                        }

                        if (jsonContent.relations && Array.isArray(jsonContent.relations)) {
                            result += 'Relations: ' + jsonContent.relations
                                .map((rel: any) => {
                                    const sourceNote = jsonContent.notes.find((n: any) => n.noteId === rel.sourceNoteId);
                                    const targetNote = jsonContent.notes.find((n: any) => n.noteId === rel.targetNoteId);
                                    const source = sourceNote ? (sourceNote.title || sourceNote.name) : 'unknown';
                                    const target = targetNote ? (targetNote.title || targetNote.name) : 'unknown';
                                    return `${source} → ${rel.name || ''} → ${target}`;
                                })
                                .join('; ');
                        }

                        formattedContent += result;
                    }
                    catch (e: any) {
                        formattedContent += `[Error parsing relation map content: ${e.message}]`;
                    }
                } else {
                    formattedContent += '[Relation map content]';
                }
                break;

            case 'geoMap':
                if (mime === 'application/json') {
                    try {
                        // Parse JSON content
                        const jsonContent = JSON.parse(content);

                        let result = 'Geographic map content:\n';

                        if (jsonContent.markers && Array.isArray(jsonContent.markers)) {
                            if (jsonContent.markers.length > 0) {
                                result += jsonContent.markers
                                    .map((marker: any) => {
                                        return `Location: ${marker.title || ''} (${marker.lat}, ${marker.lng})${marker.description ? ' - ' + marker.description : ''}`;
                                    })
                                    .join('\n');
                            } else {
                                result += 'Empty geographic map';
                            }
                        } else {
                            result += 'Empty geographic map';
                        }

                        formattedContent += result;
                    }
                    catch (e: any) {
                        formattedContent += `[Error parsing geographic map content: ${e.message}]`;
                    }
                } else {
                    formattedContent += '[Geographic map content]';
                }
                break;

            case 'mermaid':
                // Format mermaid diagrams as code blocks
                formattedContent += '```mermaid\n' + content + '\n```';
                break;

            case 'image':
            case 'file':
                formattedContent += `[${type} attachment]`;
                break;

            default:
                // For other notes, just use the content as is
                formattedContent += this.sanitizeHtml(content);
        }

        return formattedContent;
    }

    /**
     * Detect the programming language of code content
     *
     * @param content - The code content to analyze
     * @param mime - MIME type (if available)
     * @returns The detected language or empty string
     */
    private detectCodeLanguage(content: string, mime: string): string {
        // First check if mime type provides a hint
        if (mime) {
            const mimeMap: Record<string, string> = {
                'text/x-python': 'python',
                'text/javascript': 'javascript',
                'application/javascript': 'javascript',
                'text/typescript': 'typescript',
                'application/typescript': 'typescript',
                'text/x-java': 'java',
                'text/html': 'html',
                'text/css': 'css',
                'text/x-c': 'c',
                'text/x-c++': 'cpp',
                'text/x-csharp': 'csharp',
                'text/x-go': 'go',
                'text/x-ruby': 'ruby',
                'text/x-php': 'php',
                'text/x-swift': 'swift',
                'text/x-rust': 'rust',
                'text/markdown': 'markdown',
                'text/x-sql': 'sql',
                'text/x-yaml': 'yaml',
                'application/json': 'json',
                'text/x-shell': 'bash'
            };

            for (const [mimePattern, language] of Object.entries(mimeMap)) {
                if (mime.includes(mimePattern)) {
                    return language;
                }
            }
        }

        // Check for common language patterns in the content
        const firstLines = content.split('\n', 20).join('\n');

        const languagePatterns: Record<string, RegExp> = {
            'python': /^(import\s+|from\s+\w+\s+import|def\s+\w+\s*\(|class\s+\w+\s*:)/m,
            'javascript': /^(const\s+\w+\s*=|let\s+\w+\s*=|var\s+\w+\s*=|function\s+\w+\s*\(|import\s+.*from\s+)/m,
            'typescript': /^(interface\s+\w+|type\s+\w+\s*=|class\s+\w+\s*{)/m,
            'html': /^<!DOCTYPE html>|<html>|<head>|<body>/m,
            'css': /^(\.\w+\s*{|\#\w+\s*{|@media|@import)/m,
            'java': /^(public\s+class|import\s+java|package\s+)/m,
            'cpp': /^(#include\s+<\w+>|namespace\s+\w+|void\s+\w+\s*\()/m,
            'csharp': /^(using\s+System|namespace\s+\w+|public\s+class)/m,
            'go': /^(package\s+\w+|import\s+\(|func\s+\w+\s*\()/m,
            'ruby': /^(require\s+|class\s+\w+\s*<|def\s+\w+)/m,
            'php': /^(<\?php|namespace\s+\w+|use\s+\w+)/m,
            'sql': /^(SELECT|INSERT|UPDATE|DELETE|CREATE TABLE|ALTER TABLE)/im,
            'bash': /^(#!\/bin\/sh|#!\/bin\/bash|function\s+\w+\s*\(\))/m,
            'markdown': /^(#\s+|##\s+|###\s+|\*\s+|-\s+|>\s+)/m,
            'json': /^({[\s\n]*"|[\s\n]*\[)/m,
            'yaml': /^(---|\w+:\s+)/m
        };

        for (const [language, pattern] of Object.entries(languagePatterns)) {
            if (pattern.test(firstLines)) {
                return language;
            }
        }

        // Default to empty string if we can't detect the language
        return '';
    }

    /**
     * Extract the structure of a code file rather than its full content
     * Useful for providing high-level understanding of large code files
     *
     * @param content - The full code content
     * @param language - The programming language
     * @returns A structured representation of the code
     */
    private extractCodeStructure(content: string, language: string): string {
        const lines = content.split('\n');
        const maxLines = 8000;

        // If it's not that much over the limit, just include the whole thing
        if (lines.length <= maxLines * 1.2) {
            return `\`\`\`${language}\n${content}\n\`\`\``;
        }

        // For large files, extract important structural elements based on language
        let extractedStructure = '';
        let importSection = '';
        let classDefinitions = [];
        let functionDefinitions = [];
        let otherImportantLines = [];

        // Extract imports/includes, class/function definitions based on language
        if (['javascript', 'typescript', 'python', 'java', 'csharp'].includes(language)) {
            // Find imports
            for (let i = 0; i < Math.min(100, lines.length); i++) {
                if (lines[i].match(/^(import|from|using|require|#include|package)\s+/)) {
                    importSection += lines[i] + '\n';
                }
            }

            // Find class definitions
            for (let i = 0; i < lines.length; i++) {
                if (lines[i].match(/^(class|interface|type)\s+\w+/)) {
                    const endBracketLine = this.findMatchingEnd(lines, i, language);
                    if (endBracketLine > i && endBracketLine <= i + 10) {
                        // Include small class definitions entirely
                        classDefinitions.push(lines.slice(i, endBracketLine + 1).join('\n'));
                        i = endBracketLine;
                    } else {
                        // For larger classes, just show the definition and methods
                        let className = lines[i];
                        classDefinitions.push(className);

                        // Look for methods in this class
                        for (let j = i + 1; j < Math.min(endBracketLine, lines.length); j++) {
                            if (lines[j].match(/^\s+(function|def|public|private|protected)\s+\w+/)) {
                                classDefinitions.push(' ' + lines[j].trim());
                            }
                        }

                        if (endBracketLine > 0 && endBracketLine < lines.length) {
                            i = endBracketLine;
                        }
                    }
                }
            }

            // Find function definitions not inside classes
            for (let i = 0; i < lines.length; i++) {
                if (lines[i].match(/^(function|def|const\s+\w+\s*=\s*\(|let\s+\w+\s*=\s*\(|var\s+\w+\s*=\s*\()/)) {
                    functionDefinitions.push(lines[i]);
                }
            }
        }

        // Build the extracted structure
        extractedStructure += `# Code Structure (${lines.length} lines total)\n\n`;

        if (importSection) {
            extractedStructure += "## Imports/Dependencies\n```" + language + "\n" + importSection + "```\n\n";
        }

        if (classDefinitions.length > 0) {
            extractedStructure += "## Classes/Interfaces\n```" + language + "\n" + classDefinitions.join('\n\n') + "\n```\n\n";
        }

        if (functionDefinitions.length > 0) {
            extractedStructure += "## Functions\n```" + language + "\n" + functionDefinitions.join('\n\n') + "\n```\n\n";
        }

        // Add beginning and end of the file for context
        extractedStructure += "## Beginning of File\n```" + language + "\n" +
            lines.slice(0, Math.min(50, lines.length)).join('\n') + "\n```\n\n";

        if (lines.length > 100) {
            extractedStructure += "## End of File\n```" + language + "\n" +
                lines.slice(Math.max(0, lines.length - 50)).join('\n') + "\n```\n\n";
        }

        return extractedStructure;
    }

    /**
     * Find the line number of the matching ending bracket/block
     *
     * @param lines - Array of code lines
     * @param startLine - Starting line number
     * @param language - Programming language
     * @returns The line number of the matching end, or -1 if not found
     */
    private findMatchingEnd(lines: string[], startLine: number, language: string): number {
        let depth = 0;
        let inClass = false;

        // Different languages have different ways to define blocks
        if (['javascript', 'typescript', 'java', 'csharp', 'cpp'].includes(language)) {
            // Curly brace languages
            for (let i = startLine; i < lines.length; i++) {
                const line = lines[i];
                // Count opening braces
                for (const char of line) {
                    if (char === '{') depth++;
                    if (char === '}') {
                        depth--;
                        if (depth === 0 && inClass) return i;
                    }
                }

                // Check if this line contains the class declaration
                if (i === startLine && line.includes('{')) {
                    inClass = true;
                } else if (i === startLine) {
                    // If the first line doesn't have an opening brace, look at the next few lines
                    if (i + 1 < lines.length && lines[i + 1].includes('{')) {
                        inClass = true;
                    }
                }
            }
        } else if (language === 'python') {
            // Indentation-based language
            const baseIndentation = lines[startLine].match(/^\s*/)?.[0].length || 0;

            for (let i = startLine + 1; i < lines.length; i++) {
                // Skip empty lines
                if (lines[i].trim() === '') continue;

                const currentIndentation = lines[i].match(/^\s*/)?.[0].length || 0;

                // If we're back to the same or lower indentation level, we've reached the end
                if (currentIndentation <= baseIndentation) {
                    return i - 1;
                }
            }
        }

        return -1;
    }

    /**
     * Sanitize HTML content to plain text
     */
    private sanitizeHtml(html: string): string {
        if (!html) return '';

        // Use sanitizeHtml to remove all HTML tags
        let content = sanitizeHtml(html, {
            allowedTags: [],
            allowedAttributes: {},
            textFilter: (text) => {
                // Replace multiple newlines with a single one
                return text.replace(/\n\s*\n/g, '\n\n');
            }
        });

        // Additional cleanup for any remaining HTML entities
        content = content
            .replace(/&nbsp;/g, ' ')
            .replace(/&lt;/g, '<')
            .replace(/&gt;/g, '>')
            .replace(/&amp;/g, '&')
            .replace(/&quot;/g, '"')
            .replace(/&#39;/g, "'");

        return content;
    }

    /**
     * Get parent notes in the hierarchy
     */
    private async getParentNotes(noteId: string, maxDepth: number): Promise<{noteId: string, title: string}[]> {
        const parentNotes: {noteId: string, title: string}[] = [];
        const startNote = becca.getNote(noteId);

        if (!startNote) {
            return parentNotes;
        }

        // Use non-null assertion as we checked above
        let currentNote: any = startNote;

        for (let i = 0; i < maxDepth; i++) {
            // Get parent branches (should be just one in most cases)
            if (!currentNote) break;

            const parentBranches: any[] = currentNote.getParentBranches();

            if (!parentBranches || parentBranches.length === 0) {
                break;
            }

            // Use the first parent branch
            const branch: any = parentBranches[0];
            if (!branch) break;

            const parentNote: any = branch.getParentNote();

            if (!parentNote || parentNote.noteId === 'root') {
                break;
            }

            parentNotes.unshift({
                noteId: parentNote.noteId,
                title: parentNote.title
            });

            currentNote = parentNote;
        }

        return parentNotes;
    }

    /**
     * Get the full context for a note, including parent hierarchy, content, and children
     */
    async getFullContext(noteId: string): Promise<string> {
        const noteContent = await this.getNoteContent(noteId);
        if (!noteContent) {
            return 'Note not found';
        }

        const parentContext = await this.getParentContext(noteId);
        const childContext = await this.getChildContext(noteId);
        const linkedContext = await this.getLinkedNotesContext(noteId);

        return [
            parentContext,
            noteContent,
            childContext,
            linkedContext
        ].filter(Boolean).join('\n\n');
    }

    /**
     * Get semantically ranked context based on semantic similarity to a query
     * This method delegates to the semantic context service for the actual ranking
     *
     * @param noteId - The ID of the current note
     * @param query - The user's query to compare against
     * @param maxResults - Maximum number of related notes to include
     * @returns Context with the most semantically relevant related notes
     */
    async getSemanticContext(noteId: string, query: string, maxResults = 5): Promise<string> {
        try {
            // This requires the semantic context service to be available
            // We're using a dynamic import to avoid circular dependencies
            const { default: aiServiceManager } = await import('./ai_service_manager.js');
            const semanticContext = aiServiceManager.getInstance().getSemanticContextService();

            if (!semanticContext) {
                return this.getFullContext(noteId);
            }

            return await semanticContext.getSemanticContext(noteId, query, maxResults);
        } catch (error) {
            // Fall back to regular context if semantic ranking fails
            console.error('Error in semantic context ranking:', error);
            return this.getFullContext(noteId);
        }
    }

    /**
     * Get progressively loaded context based on depth level
     * This provides different levels of context detail depending on the depth parameter
     *
     * @param noteId - The ID of the note to get context for
     * @param depth - Depth level (1-4) determining how much context to include
     * @returns Context appropriate for the requested depth
     */
    async getProgressiveContext(noteId: string, depth = 1): Promise<string> {
        try {
            // This requires the semantic context service to be available
            // We're using a dynamic import to avoid circular dependencies
            const { default: aiServiceManager } = await import('./ai_service_manager.js');
            const semanticContext = aiServiceManager.getInstance().getSemanticContextService();

            if (!semanticContext) {
                return this.getFullContext(noteId);
            }

            return await semanticContext.getProgressiveContext(noteId, depth);
        } catch (error) {
            // Fall back to regular context if progressive loading fails
            console.error('Error in progressive context loading:', error);
            return this.getFullContext(noteId);
        }
    }

    /**
     * Get smart context based on the query complexity
     * This automatically selects the appropriate context depth and relevance
     *
     * @param noteId - The ID of the note to get context for
     * @param query - The user's query for semantic relevance matching
     * @returns The optimal context for answering the query
     */
    async getSmartContext(noteId: string, query: string): Promise<string> {
        try {
            // This requires the semantic context service to be available
            // We're using a dynamic import to avoid circular dependencies
            const { default: aiServiceManager } = await import('./ai_service_manager.js');
            const semanticContext = aiServiceManager.getInstance().getSemanticContextService();

            if (!semanticContext) {
                return this.getFullContext(noteId);
            }

            return await semanticContext.getSmartContext(noteId, query);
        } catch (error) {
            // Fall back to regular context if smart context fails
            console.error('Error in smart context selection:', error);
            return this.getFullContext(noteId);
        }
    }
}

// Singleton instance
const contextExtractor = new ContextExtractor();
export default contextExtractor;

@ -412,7 +412,8 @@ export async function getNoteEmbeddingContext(noteId: string): Promise<NoteEmbed
    try {
        // Use the enhanced context extractor for improved content extraction
        // We're using a dynamic import to avoid circular dependencies
        const { default: contextExtractor } = await import('../../llm/context_extractor.js');
        const { ContextExtractor } = await import('../../llm/context/index.js');
        const contextExtractor = new ContextExtractor();

        // Get the content using the enhanced formatNoteContent method in context extractor
        const noteContent = await contextExtractor.getNoteContent(noteId);
@ -836,7 +837,8 @@ async function processNoteWithChunking(
): Promise<void> {
    try {
        // Get the context extractor dynamically to avoid circular dependencies
        const { default: contextExtractor } = await import('../../llm/context_extractor.js');
        const { ContextExtractor } = await import('../../llm/context/index.js');
        const contextExtractor = new ContextExtractor();

        // Get chunks of the note content
        const chunks = await contextExtractor.getChunkedNoteContent(noteId);
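
// Sketch of the follow-on step inside processNoteWithChunking (hypothetical loop;
// `provider` stands for an embedding provider as used elsewhere in this service):
//   for (const chunk of chunks) {
//       const embedding = await provider.generateEmbeddings(chunk);
//   }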
@ -1,4 +1,4 @@
import contextExtractor from './context_extractor.js';
import { ContextExtractor } from './context/index.js';
import * as vectorStore from './embeddings/vector_store.js';
import sql from '../sql.js';
import { cosineSimilarity } from './embeddings/vector_store.js';
@ -58,6 +58,9 @@ import options from '../options.js';
 * knowledge bases when working with limited-context LLMs.
 */
class SemanticContextService {
    // Create an instance of ContextExtractor for backward compatibility
    private contextExtractor = new ContextExtractor();

    /**
     * Get the preferred embedding provider based on user settings
     * Tries to use the most appropriate provider in this order:
@ -156,7 +159,7 @@ class SemanticContextService {

        if (!noteEmbedding) {
            // If note doesn't have an embedding yet, get content and generate one
            const content = await contextExtractor.getNoteContent(note.noteId);
            const content = await this.contextExtractor.getNoteContent(note.noteId);
            if (content && provider) {
                try {
                    noteEmbedding = await provider.generateEmbeddings(content);
@ -225,7 +228,7 @@ class SemanticContextService {
        const mostRelevantNotes = rankedNotes.slice(0, maxResults);
        const relevantContent = await Promise.all(
            mostRelevantNotes.map(async note => {
                const content = await contextExtractor.getNoteContent(note.noteId);
                const content = await this.contextExtractor.getNoteContent(note.noteId);
                if (!content) return null;

                // Format with relevance score and title
@ -253,22 +256,22 @@ class SemanticContextService {
     */
    async getProgressiveContext(noteId: string, depth = 1): Promise<string> {
        // Start with the note content
        const noteContent = await contextExtractor.getNoteContent(noteId);
        const noteContent = await this.contextExtractor.getNoteContent(noteId);
        if (!noteContent) return 'Note not found';

        // If depth is 1, just return the note content
        if (depth <= 1) return noteContent;

        // Add parent context for depth >= 2
        const parentContext = await contextExtractor.getParentContext(noteId);
        const parentContext = await this.contextExtractor.getParentContext(noteId);
        if (depth <= 2) return `${parentContext}\n\n${noteContent}`;

        // Add child context for depth >= 3
        const childContext = await contextExtractor.getChildContext(noteId);
        const childContext = await this.contextExtractor.getChildContext(noteId);
        if (depth <= 3) return `${parentContext}\n\n${noteContent}\n\n${childContext}`;

        // Add linked notes for depth >= 4
        const linkedContext = await contextExtractor.getLinkedNotesContext(noteId);
        const linkedContext = await this.contextExtractor.getLinkedNotesContext(noteId);
        return `${parentContext}\n\n${noteContent}\n\n${childContext}\n\n${linkedContext}`;
    }
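
// Depth levels in practice (sketch, assuming the service's exported singleton):
//   depth 1 -> note body only; depth 2 -> + parent hierarchy;
//   depth 3 -> + child list; depth 4 -> + linked notes, e.g.
//   const context = await semanticContextService.getProgressiveContext(noteId, 2);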