From 71b3b04c536ba8a548fbd3363850a907fbde87e0 Mon Sep 17 00:00:00 2001 From: perf3ct Date: Tue, 11 Mar 2025 18:39:59 +0000 Subject: [PATCH] break up the huge context_extractor into smaller files --- src/services/llm/ai_service_manager.ts | 5 +- src/services/llm/chat_service.ts | 5 +- src/services/llm/context/chunking.ts | 288 ++++++ src/services/llm/context/code_handlers.ts | 433 +++++++++ src/services/llm/context/hierarchy.ts | 243 ++++++ src/services/llm/context/index.ts | 616 +++++++++++++ src/services/llm/context/note_content.ts | 223 +++++ src/services/llm/context/semantic_context.ts | 225 +++++ src/services/llm/context/summarization.ts | 162 ++++ src/services/llm/context_extractor.ts | 871 ------------------- src/services/llm/embeddings/vector_store.ts | 6 +- src/services/llm/semantic_context_service.ts | 17 +- 12 files changed, 2212 insertions(+), 882 deletions(-) create mode 100644 src/services/llm/context/chunking.ts create mode 100644 src/services/llm/context/code_handlers.ts create mode 100644 src/services/llm/context/hierarchy.ts create mode 100644 src/services/llm/context/index.ts create mode 100644 src/services/llm/context/note_content.ts create mode 100644 src/services/llm/context/semantic_context.ts create mode 100644 src/services/llm/context/summarization.ts delete mode 100644 src/services/llm/context_extractor.ts diff --git a/src/services/llm/ai_service_manager.ts b/src/services/llm/ai_service_manager.ts index a5fd11c87..6e88a1fb7 100644 --- a/src/services/llm/ai_service_manager.ts +++ b/src/services/llm/ai_service_manager.ts @@ -4,7 +4,7 @@ import { OpenAIService } from './providers/openai_service.js'; import { AnthropicService } from './providers/anthropic_service.js'; import { OllamaService } from './providers/ollama_service.js'; import log from '../log.js'; -import contextExtractor from './context_extractor.js'; +import { ContextExtractor } from './context/index.js'; import semanticContextService from './semantic_context_service.js'; type ServiceProviders = 'openai' | 'anthropic' | 'ollama'; @@ -216,3 +216,6 @@ export default { return getInstance().getSemanticContextService(); } }; + +// Create an instance of ContextExtractor for backward compatibility +const contextExtractor = new ContextExtractor(); diff --git a/src/services/llm/chat_service.ts b/src/services/llm/chat_service.ts index 416ca4131..1278c5f5d 100644 --- a/src/services/llm/chat_service.ts +++ b/src/services/llm/chat_service.ts @@ -1,7 +1,10 @@ import type { Message, ChatCompletionOptions } from './ai_interface.js'; import aiServiceManager from './ai_service_manager.js'; import chatStorageService from './chat_storage_service.js'; -import contextExtractor from './context_extractor.js'; +import { ContextExtractor } from './context/index.js'; + +// Create an instance of ContextExtractor for backward compatibility +const contextExtractor = new ContextExtractor(); export interface ChatSession { id: string; diff --git a/src/services/llm/context/chunking.ts b/src/services/llm/context/chunking.ts new file mode 100644 index 000000000..76e0f3d13 --- /dev/null +++ b/src/services/llm/context/chunking.ts @@ -0,0 +1,288 @@ +/** + * Contains functions for chunking content into smaller pieces for processing + * These functions are used to properly prepare content for LLM context windows + */ + +/** + * Interface for chunked content + */ +export interface ContentChunk { + content: string; + prefix: string; + noteId?: string; + title?: string; + path?: string; + metadata?: Record; +} + +/** + * Options for the chunking process + */ +export interface ChunkOptions { + /** + * Maximum size of each chunk in characters + * Defaults to LLM context window size (typically around 2048) + */ + maxChunkSize?: number; + + /** + * How much chunks should overlap to maintain context + */ + overlapSize?: number; + + /** + * Whether to respect sentence and paragraph boundaries + */ + respectBoundaries?: boolean; + + /** + * Whether to add metadata to chunks + */ + includeMetadata?: boolean; + + /** + * Additional information to include in chunk metadata + */ + metadata?: Record; +} + +/** + * Default options for chunking + */ +const DEFAULT_CHUNK_OPTIONS: Required = { + maxChunkSize: 1500, // Characters per chunk + overlapSize: 100, // Overlap between chunks + respectBoundaries: true, + includeMetadata: true, + metadata: {} +}; + +/** + * Chunk content into smaller pieces + * Used for processing large documents and preparing them for LLMs + */ +export function chunkContent( + content: string, + title: string = '', + noteId: string = '', + options: ChunkOptions = {} +): ContentChunk[] { + // Merge provided options with defaults + const config: Required = { ...DEFAULT_CHUNK_OPTIONS, ...options }; + + // If content is small enough, return as a single chunk + if (content.length <= config.maxChunkSize) { + return [{ + content, + prefix: title, + noteId, + title, + metadata: config.metadata + }]; + } + + const chunks: ContentChunk[] = []; + + if (config.respectBoundaries) { + // Try to split on paragraph boundaries first + const paragraphs = content.split(/\n\s*\n/); + + let currentChunk = ''; + let currentPrefix = title ? title : ''; + + for (const paragraph of paragraphs) { + // If adding this paragraph would exceed max size, create a new chunk + if (currentChunk.length + paragraph.length > config.maxChunkSize) { + // If current chunk is not empty, add it to chunks + if (currentChunk.length > 0) { + chunks.push({ + content: currentChunk, + prefix: currentPrefix, + noteId, + title, + metadata: config.metadata + }); + } + + // Start a new chunk, use the overlap if possible + if (config.overlapSize > 0 && currentChunk.length > 0) { + // For overlap, take the last N characters + const overlapText = currentChunk.slice(-config.overlapSize); + currentChunk = overlapText + paragraph; + currentPrefix = `${title} (continued)`; + } else { + currentChunk = paragraph; + currentPrefix = `${title} (continued)`; + } + } else { + // Add paragraph to current chunk + if (currentChunk.length > 0) { + currentChunk += '\n\n'; + } + currentChunk += paragraph; + } + } + + // Add the last chunk if it's not empty + if (currentChunk.length > 0) { + chunks.push({ + content: currentChunk, + prefix: currentPrefix, + noteId, + title, + metadata: config.metadata + }); + } + } else { + // Simple chunking by character count + let currentPosition = 0; + + while (currentPosition < content.length) { + const chunkEnd = Math.min(currentPosition + config.maxChunkSize, content.length); + + const chunk = content.substring(currentPosition, chunkEnd); + const prefix = currentPosition === 0 ? title : `${title} (continued)`; + + chunks.push({ + content: chunk, + prefix, + noteId, + title, + metadata: config.metadata + }); + + // Move position, considering overlap + currentPosition = chunkEnd - (config.overlapSize || 0); + + // Prevent infinite loop if overlap is too large + if (currentPosition <= 0 || currentPosition >= content.length) { + break; + } + } + } + + return chunks; +} + +/** + * Smarter chunking that tries to respect semantic boundaries like headers and sections + */ +export function semanticChunking( + content: string, + title: string = '', + noteId: string = '', + options: ChunkOptions = {} +): ContentChunk[] { + // Merge provided options with defaults + const config: Required = { ...DEFAULT_CHUNK_OPTIONS, ...options }; + + // If content is small enough, return as a single chunk + if (content.length <= config.maxChunkSize) { + return [{ + content, + prefix: title, + noteId, + title, + metadata: config.metadata + }]; + } + + const chunks: ContentChunk[] = []; + + // Try to split on headers first + const headerPattern = /#{1,6}\s+.+|]*>.*?<\/h[1-6]>/g; + const sections = []; + + let lastIndex = 0; + let match; + + // First, find all headers and split content into sections + while ((match = headerPattern.exec(content)) !== null) { + if (match.index > lastIndex) { + // Add the content before this header + sections.push(content.substring(lastIndex, match.index)); + } + + // Start a new section with this header + lastIndex = match.index; + } + + // Add the last section + if (lastIndex < content.length) { + sections.push(content.substring(lastIndex)); + } + + // If no headers were found, fall back to regular chunking + if (sections.length <= 1) { + return chunkContent(content, title, noteId, options); + } + + // Process each section + let currentChunk = ''; + let currentPrefix = title; + + for (const section of sections) { + // If adding this section would exceed max size, create a new chunk + if (currentChunk.length + section.length > config.maxChunkSize) { + // If this single section is too big, it needs to be chunked further + if (section.length > config.maxChunkSize) { + // First add the current chunk if not empty + if (currentChunk.length > 0) { + chunks.push({ + content: currentChunk, + prefix: currentPrefix, + noteId, + title, + metadata: config.metadata + }); + } + + // Chunk this section separately + const sectionChunks = chunkContent( + section, + title, + noteId, + options + ); + + chunks.push(...sectionChunks); + + // Reset current chunk + currentChunk = ''; + currentPrefix = `${title} (continued)`; + } else { + // Add current chunk to chunks + chunks.push({ + content: currentChunk, + prefix: currentPrefix, + noteId, + title, + metadata: config.metadata + }); + + // Start a new chunk with this section + currentChunk = section; + currentPrefix = `${title} (continued)`; + } + } else { + // Add section to current chunk + if (currentChunk.length > 0 && !currentChunk.endsWith('\n')) { + currentChunk += '\n\n'; + } + currentChunk += section; + } + } + + // Add the last chunk if it's not empty + if (currentChunk.length > 0) { + chunks.push({ + content: currentChunk, + prefix: currentPrefix, + noteId, + title, + metadata: config.metadata + }); + } + + return chunks; +} diff --git a/src/services/llm/context/code_handlers.ts b/src/services/llm/context/code_handlers.ts new file mode 100644 index 000000000..22a16b22b --- /dev/null +++ b/src/services/llm/context/code_handlers.ts @@ -0,0 +1,433 @@ +/** + * Helper functions for processing code notes, including language detection and structure extraction + */ + +/** + * Attempt to detect the programming language from code content or note attributes + */ +export function detectLanguage(content: string, mime: string): string { + // First check MIME type for hints + if (mime) { + const mimeLower = mime.toLowerCase(); + + // Map of mime types to language names + const mimeMap: {[key: string]: string} = { + 'text/javascript': 'javascript', + 'application/javascript': 'javascript', + 'text/typescript': 'typescript', + 'application/typescript': 'typescript', + 'text/x-python': 'python', + 'text/x-java': 'java', + 'text/x-c': 'c', + 'text/x-c++': 'cpp', + 'text/x-csharp': 'csharp', + 'text/x-go': 'go', + 'text/x-ruby': 'ruby', + 'text/x-php': 'php', + 'text/x-rust': 'rust', + 'text/x-swift': 'swift', + 'text/x-kotlin': 'kotlin', + 'text/x-scala': 'scala', + 'text/x-perl': 'perl', + 'text/x-lua': 'lua', + 'text/x-r': 'r', + 'text/x-dart': 'dart', + 'text/html': 'html', + 'text/css': 'css', + 'application/json': 'json', + 'application/xml': 'xml', + 'text/markdown': 'markdown', + 'text/yaml': 'yaml', + 'text/x-sql': 'sql' + }; + + if (mimeMap[mimeLower]) { + return mimeMap[mimeLower]; + } + } + + // Check for common language patterns in the first few lines + const firstLines = content.split('\n').slice(0, 10).join('\n'); + + // Simple heuristics for common languages + if (firstLines.includes('') || firstLines.includes('')) return 'html'; + if (firstLines.includes('function ') && firstLines.includes('var ') && firstLines.includes('const ')) return 'javascript'; + if (firstLines.includes('interface ') && firstLines.includes('export class ')) return 'typescript'; + if (firstLines.includes('@Component') || firstLines.includes('import { Component }')) return 'typescript'; + + // Default to 'text' if language can't be determined + return 'text'; +} + +/** + * Extract structure from code to create a summary + */ +export function extractCodeStructure(content: string, language: string): string { + // Avoid processing very large code files + if (content.length > 100000) { + return "Code content too large for structure extraction"; + } + + let structure = ""; + + try { + switch (language.toLowerCase()) { + case 'javascript': + case 'typescript': + structure = extractJsStructure(content); + break; + + case 'python': + structure = extractPythonStructure(content); + break; + + case 'java': + case 'csharp': + case 'cpp': + structure = extractClassBasedStructure(content); + break; + + case 'go': + structure = extractGoStructure(content); + break; + + case 'rust': + structure = extractRustStructure(content); + break; + + case 'html': + structure = extractHtmlStructure(content); + break; + + default: + // For other languages, just return a summary of the file size and a few lines + const lines = content.split('\n'); + structure = `Code file with ${lines.length} lines.\n`; + + // Add first few non-empty lines that aren't comments + const firstCodeLines = lines.filter(line => + line.trim() !== '' && + !line.trim().startsWith('//') && + !line.trim().startsWith('#') && + !line.trim().startsWith('*') && + !line.trim().startsWith('