2025-03-11 18:39:59 +00:00
|
|
|
/**
|
|
|
|
* Contains functions for chunking content into smaller pieces for processing
|
|
|
|
* These functions are used to properly prepare content for LLM context windows
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
 * One piece of chunked content, together with the details of the note it was cut from.
 */
export interface ContentChunk {
    /** Text carried by this chunk. */
    content: string;

    /**
     * Label for the chunk. The chunking functions in this file set it to the
     * title, or to the title followed by " (continued)".
     */
    prefix: string;

    /** Identifier of the note the content belongs to, when known. */
    noteId?: string;

    /** Title of the note the content belongs to, when known. */
    title?: string;

    /** Optional path value; the chunking functions in this file do not set it. */
    path?: string;

    /** Free-form extra data attached to the chunk. */
    metadata?: Record<string, any>;
}
|
|
|
|
|
|
|
|
/**
 * Settings that control how content is split into chunks.
 * Every field is optional; omitted fields fall back to the defaults built by
 * `getDefaultChunkOptions`.
 */
export interface ChunkOptions {
    /**
     * Upper bound, in characters, for the size of a chunk.
     * The default is taken from `LLM_CONSTANTS.CHUNKING.DEFAULT_SIZE`.
     */
    maxChunkSize?: number;

    /**
     * Number of characters from the end of one chunk to repeat at the start of
     * the next, so context carries across chunk borders.
     * The default is taken from `LLM_CONSTANTS.CHUNKING.DEFAULT_OVERLAP`.
     */
    overlapSize?: number;

    /**
     * When true, content is split on paragraph breaks (blank lines) instead of
     * at fixed character offsets. Defaults to true.
     */
    respectBoundaries?: boolean;

    /**
     * Whether to add metadata to chunks. Defaults to true.
     * NOTE(review): `chunkContent` and `semanticChunking` attach `metadata`
     * without consulting this flag - confirm whether that is intended.
     */
    includeMetadata?: boolean;

    /** Extra data to attach to every produced chunk. Defaults to an empty object. */
    metadata?: Record<string, any>;
}
|
|
|
|
|
|
|
|
/**
 * Build the fully-populated option set used when a caller leaves chunking
 * options unspecified.
 *
 * @returns A new options object on every call: sizes come from
 *          `LLM_CONSTANTS.CHUNKING`, both flags are enabled and `metadata`
 *          is an empty object.
 */
async function getDefaultChunkOptions(): Promise<Required<ChunkOptions>> {
    // The constants module is loaded lazily so that importing this file does
    // not take part in a circular dependency.
    const llmModule = await import('../../../routes/api/llm.js');
    const chunkingConstants = llmModule.LLM_CONSTANTS.CHUNKING;

    const defaults: Required<ChunkOptions> = {
        maxChunkSize: chunkingConstants.DEFAULT_SIZE,
        overlapSize: chunkingConstants.DEFAULT_OVERLAP,
        respectBoundaries: true,
        includeMetadata: true,
        metadata: {}
    };

    return defaults;
}
|
2025-03-11 18:39:59 +00:00
|
|
|
|
|
|
|
/**
 * Split content into chunks of at most `maxChunkSize` characters.
 * Used for processing large documents and preparing them for LLMs.
 *
 * Content that already fits is returned as a single chunk. Otherwise the text
 * is split either on paragraph breaks (`respectBoundaries`, the default) or at
 * fixed character offsets. The first chunk is labelled with `title`; every
 * later chunk is labelled `"<title> (continued)"`.
 *
 * `metadata` is attached to every chunk; the `includeMetadata` flag is not
 * consulted here.
 *
 * @param content - Text to split.
 * @param title - Note title, used for the chunk prefix and `title` field.
 * @param noteId - Identifier copied onto every chunk.
 * @param options - Overrides for the defaults from `getDefaultChunkOptions`.
 *                  Fields that are missing or `undefined` keep their default.
 * @returns The chunks in document order.
 */
export async function chunkContent(
    content: string,
    title: string = '',
    noteId: string = '',
    options: ChunkOptions = {}
): Promise<ContentChunk[]> {
    const defaultOptions = await getDefaultChunkOptions();

    // Merge field by field: with a plain object spread an explicit `undefined`
    // in `options` would erase the default and poison the size arithmetic.
    const config: Required<ChunkOptions> = {
        maxChunkSize: options.maxChunkSize ?? defaultOptions.maxChunkSize,
        overlapSize: options.overlapSize ?? defaultOptions.overlapSize,
        respectBoundaries: options.respectBoundaries ?? defaultOptions.respectBoundaries,
        includeMetadata: options.includeMetadata ?? defaultOptions.includeMetadata,
        metadata: options.metadata ?? defaultOptions.metadata
    };

    // The prefix is derived from the position in the result, so the first
    // chunk is never labelled "(continued)".
    const toChunk = (text: string, index: number): ContentChunk => ({
        content: text,
        prefix: index === 0 ? title : `${title} (continued)`,
        noteId,
        title,
        metadata: config.metadata
    });

    // Without a usable positive limit there is nothing to split against, so
    // the content is returned whole instead of risking a non-terminating loop.
    const maxSize = Math.floor(config.maxChunkSize);
    if (!(maxSize > 0) || content.length <= maxSize) {
        return [toChunk(content, 0)];
    }

    // Keep the overlap strictly below the chunk size so every window advances.
    const overlap = Math.min(Math.max(Math.floor(config.overlapSize) || 0, 0), maxSize - 1);

    const pieces = config.respectBoundaries
        ? chunkByParagraphs(content, maxSize, overlap)
        : chunkByLength(content, maxSize, overlap);

    return pieces.map(toChunk);
}

/**
 * Cut text into fixed-size windows where consecutive windows share `overlap`
 * characters. Expects `maxSize >= 1` and `0 <= overlap < maxSize`.
 */
function chunkByLength(text: string, maxSize: number, overlap: number): string[] {
    const step = maxSize - overlap;
    const pieces: string[] = [];

    for (let start = 0; start < text.length; start += step) {
        const end = Math.min(start + maxSize, text.length);
        pieces.push(text.substring(start, end));

        // Once a window reaches the end of the text we are done; stepping back
        // by the overlap again would re-emit the same tail forever.
        if (end === text.length) {
            break;
        }
    }

    return pieces;
}

/**
 * Pack paragraphs (separated by blank lines) into pieces no longer than
 * `maxSize`, joining paragraphs with a blank line. Expects `maxSize >= 1` and
 * `0 <= overlap < maxSize`.
 */
function chunkByParagraphs(text: string, maxSize: number, overlap: number): string[] {
    const separator = '\n\n';
    const pieces: string[] = [];
    let current = '';

    for (const paragraph of text.split(/\n\s*\n/)) {
        // Leading or trailing blank lines produce empty entries that would
        // only add dangling separators.
        if (paragraph.length === 0) {
            continue;
        }

        const candidate = current.length > 0 ? current + separator + paragraph : paragraph;
        if (candidate.length <= maxSize) {
            current = candidate;
            continue;
        }

        // The paragraph does not fit: close the running piece and remember its
        // tail so the next piece can start with some shared context.
        let carry = '';
        if (current.length > 0) {
            pieces.push(current);
            carry = overlap > 0 ? current.slice(-overlap) : '';
            current = '';
        }

        if (paragraph.length > maxSize) {
            // A single paragraph larger than the limit is cut by length; its
            // last window stays open so following paragraphs can join it.
            const windows = chunkByLength(paragraph, maxSize, overlap);
            current = windows.pop() ?? '';
            pieces.push(...windows);
            continue;
        }

        // Use only as much of the overlap as still fits next to the paragraph.
        const room = maxSize - paragraph.length - separator.length;
        const lead = room > 0 ? carry.slice(-room) : '';
        current = lead.length > 0 ? lead + separator + paragraph : paragraph;
    }

    if (current.length > 0) {
        pieces.push(current);
    }

    return pieces;
}
|
|
|
|
|
|
|
|
/**
 * Smarter chunking that tries to respect semantic boundaries like headers and sections.
 *
 * The content is cut in front of every Markdown ATX header (`#` to `######`
 * at the start of a line) and every HTML `<h1>`-`<h6>` element. Consecutive
 * sections are packed into chunks of at most `maxChunkSize` characters. A
 * section that is too large on its own is delegated to `chunkContent`, and
 * content without any header is handed to `chunkContent` entirely.
 *
 * The first chunk is labelled with `title`; every later chunk is labelled
 * `"<title> (continued)"`. `metadata` is attached to every chunk; the
 * `includeMetadata` flag is not consulted here.
 *
 * @param content - Text to split.
 * @param title - Note title, used for the chunk prefix and `title` field.
 * @param noteId - Identifier copied onto every chunk.
 * @param options - Overrides for the defaults from `getDefaultChunkOptions`.
 *                  Fields that are missing or `undefined` keep their default.
 * @returns The chunks in document order.
 */
export async function semanticChunking(
    content: string,
    title: string = '',
    noteId: string = '',
    options: ChunkOptions = {}
): Promise<ContentChunk[]> {
    const defaultOptions = await getDefaultChunkOptions();

    // Merge field by field: with a plain object spread an explicit `undefined`
    // in `options` would erase the default and poison the size comparisons.
    const config: Required<ChunkOptions> = {
        maxChunkSize: options.maxChunkSize ?? defaultOptions.maxChunkSize,
        overlapSize: options.overlapSize ?? defaultOptions.overlapSize,
        respectBoundaries: options.respectBoundaries ?? defaultOptions.respectBoundaries,
        includeMetadata: options.includeMetadata ?? defaultOptions.includeMetadata,
        metadata: options.metadata ?? defaultOptions.metadata
    };

    // The prefix is derived from the position in the result so that only the
    // very first chunk carries the bare title.
    const toChunk = (text: string, index: number): ContentChunk => ({
        content: text,
        prefix: index === 0 ? title : `${title} (continued)`,
        noteId,
        title,
        metadata: config.metadata
    });

    // Without a usable positive limit there is nothing to split against.
    const maxSize = Math.floor(config.maxChunkSize);
    if (!(maxSize > 0) || content.length <= maxSize) {
        return [toChunk(content, 0)];
    }

    // Markdown headers only count at the start of a line (up to three spaces
    // of indentation), so a '#' in running text such as "C# code" does not
    // split a sentence. HTML headers may appear anywhere.
    const headerPattern = /^ {0,3}#{1,6}[ \t]+.+|<h[1-6][^>]*>.*?<\/h[1-6]>/gm;
    const sections: string[] = [];
    let lastIndex = 0;
    let match: RegExpExecArray | null;

    // Each section starts at a header and runs up to the next one; text in
    // front of the first header forms a section of its own.
    while ((match = headerPattern.exec(content)) !== null) {
        if (match.index > lastIndex) {
            sections.push(content.substring(lastIndex, match.index));
        }
        lastIndex = match.index;
    }
    if (lastIndex < content.length) {
        sections.push(content.substring(lastIndex));
    }

    // No header structure to exploit: fall back to regular chunking.
    if (sections.length <= 1) {
        return await chunkContent(content, title, noteId, config);
    }

    const pieces: string[] = [];
    let current = '';

    for (const section of sections) {
        // A blank line is inserted only when the running text does not already
        // end with a newline; its length counts towards the limit.
        const separator = current.length > 0 && !current.endsWith('\n') ? '\n\n' : '';
        if (current.length + separator.length + section.length <= maxSize) {
            current += separator + section;
            continue;
        }

        // The section does not fit: close the running chunk first.
        if (current.length > 0) {
            pieces.push(current);
            current = '';
        }

        if (section.length > maxSize) {
            // Too large even on its own - split it further. Awaited in order
            // so the pieces stay in document order.
            const sectionChunks = await chunkContent(section, title, noteId, config);
            for (const sectionChunk of sectionChunks) {
                pieces.push(sectionChunk.content);
            }
        } else {
            current = section;
        }
    }

    if (current.length > 0) {
        pieces.push(current);
    }

    return pieces.map(toChunk);
}
|