import { NormalizationStatus } from './embeddings_interface.js';
import type { NoteEmbeddingContext } from './embeddings_interface.js';
import log from "../../log.js";
import { LLM_CONSTANTS } from "../constants/provider_constants.js";
import options from "../../options.js";
import { isBatchSizeError as checkBatchSizeError } from '../interfaces/error_interfaces.js';
import type { EmbeddingModelInfo } from '../interfaces/embedding_interfaces.js';

/**
 * Configuration accepted by {@link BaseEmbeddingProvider}.
 */
export interface EmbeddingConfig {
    model: string;
    dimension: number;
    type: 'float32' | 'float64';
    apiKey?: string;
    baseUrl?: string;
    batchSize?: number;
    contextWidth?: number;
    normalizationStatus?: NormalizationStatus;
}

/**
 * Extract a printable message from an arbitrary thrown value.
 *
 * Anything can be thrown in JavaScript, so this never dereferences the value
 * without checking it first. Falls back to 'Unknown error' when no message
 * can be found.
 */
function describeError(error: unknown): string {
    if (typeof error === 'string' && error.length > 0) {
        return error;
    }

    if (typeof error === 'object' && error !== null) {
        const message = (error as { message?: unknown }).message;
        if (message) {
            return String(message);
        }
    }

    return 'Unknown error';
}

/**
 * Return the thrown value unchanged when it already is an Error, otherwise
 * wrap its description in a new Error so callers can always rethrow it.
 */
function toError(error: unknown): Error {
    return error instanceof Error ? error : new Error(describeError(error));
}

/**
 * Base class for embedding providers that implements common functionality
 */
export abstract class BaseEmbeddingProvider {
    protected model: string;
    protected dimension: number;
    protected type: 'float32' | 'float64';
    protected maxBatchSize: number = 100;
    protected apiKey?: string;
    protected baseUrl: string;
    protected name: string = 'base';
    protected modelInfoCache = new Map<string, EmbeddingModelInfo>();
    protected config: EmbeddingConfig;

    constructor(config: EmbeddingConfig) {
        this.model = config.model;
        this.dimension = config.dimension;
        this.type = config.type;
        this.apiKey = config.apiKey;
        this.baseUrl = config.baseUrl || '';
        this.config = config;

        // If batch size is specified, use it as maxBatchSize
        if (config.batchSize) {
            this.maxBatchSize = config.batchSize;
        }
    }

    /**
     * Get a shallow copy of the configuration this provider was built with
     */
    getConfig(): EmbeddingConfig {
        return { ...this.config };
    }

    /**
     * Get the normalization status of this provider
     * Default implementation returns the status from config if available,
     * otherwise returns UNKNOWN status
     */
    getNormalizationStatus(): NormalizationStatus {
        return this.config.normalizationStatus || NormalizationStatus.UNKNOWN;
    }

    /**
     * Get the embedding dimension from the configuration
     */
    getDimension(): number {
        return this.config.dimension;
    }

    /**
     * Hook for provider start-up work
     */
    async initialize(): Promise<void> {
        // Default implementation does nothing
        return;
    }

    /**
     * Generate embeddings for a single text
     */
    // NOTE(review): return type reconstructed as Float32Array to match
    // processBatchWithRetries below - confirm against the concrete providers.
    abstract generateEmbeddings(text: string): Promise<Float32Array>;

    /**
     * Get the appropriate batch size for this provider
     * Override in provider implementations if needed
     *
     * Resolution order: the 'embeddingBatchSize' option (when it parses to a
     * positive number), then the configured batchSize, then the constant
     * registered under this provider's upper-cased name, then the default
     * constant.
     */
    protected async getBatchSize(): Promise<number> {
        // Try to get the user-configured batch size
        let configuredBatchSize: number | null = null;

        try {
            const batchSizeStr = await options.getOption('embeddingBatchSize');
            if (batchSizeStr) {
                configuredBatchSize = parseInt(batchSizeStr, 10);
            }
        } catch (error) {
            log.error(`Error getting batch size from options: ${error}`);
        }

        // If user has configured a specific batch size, use that
        if (configuredBatchSize && !isNaN(configuredBatchSize) && configuredBatchSize > 0) {
            return configuredBatchSize;
        }

        // Otherwise use the provider-specific default from constants
        return this.config.batchSize ||
            LLM_CONSTANTS.BATCH_SIZE[this.name.toUpperCase() as keyof typeof LLM_CONSTANTS.BATCH_SIZE] ||
            LLM_CONSTANTS.BATCH_SIZE.DEFAULT;
    }

    /**
     * Process a batch of texts with adaptive handling
     * This method will try to process the batch and reduce batch size if encountering errors
     *
     * Any failure while the batch size is above 1 halves the batch size and
     * retries from the same position. A failure at batch size 1 is logged and
     * that single item is skipped, so the returned array can be shorter than
     * `items` and is then not index-aligned with it.
     *
     * @param items the items to process, in order
     * @param processFn processes one batch and resolves with its results
     * @param isBatchSizeError tells batch-size failures apart from other failures (only affects the warning text)
     * @returns the results of every batch that succeeded, in processing order
     * @throws the last error seen when no results were produced and at least one item was skipped
     */
    protected async processWithAdaptiveBatch<T, R>(
        items: T[],
        processFn: (batch: T[]) => Promise<R[]>,
        isBatchSizeError: (error: unknown) => boolean
    ): Promise<R[]> {
        const results: R[] = [];
        const failures: { index: number, error: string }[] = [];
        let lastError: Error | null = null;

        // A batch size below 1 (or NaN) would produce empty slices and the
        // loop would never advance, so clamp to a usable positive integer.
        const initialBatchSize = await this.getBatchSize();
        let currentBatchSize = Number.isFinite(initialBatchSize) && initialBatchSize >= 1
            ? Math.floor(initialBatchSize)
            : 1;

        // Process items in batches
        for (let i = 0; i < items.length;) {
            const batch = items.slice(i, i + currentBatchSize);

            try {
                // Process the current batch
                const batchResults = await processFn(batch);
                results.push(...batchResults);
                i += batch.length;
            } catch (error) {
                const errorMessage = describeError(error);
                lastError = toError(error);

                if (currentBatchSize === 1) {
                    // If we're already at batch size 1, we can't reduce further, so log the error and skip this item
                    log.error(`Error processing item at index ${i} with batch size 1: ${errorMessage}`);
                    failures.push({ index: i, error: errorMessage });
                    i++; // Move to the next item
                    continue;
                }

                // Reduce batch size and retry from the same position
                const newBatchSize = Math.max(1, Math.floor(currentBatchSize / 2));
                if (isBatchSizeError(error)) {
                    console.warn(`Batch size error detected, reducing batch size from ${currentBatchSize} to ${newBatchSize}: ${errorMessage}`);
                } else {
                    // For other errors, retry with a smaller batch size as a precaution
                    console.warn(`Error processing batch, reducing batch size from ${currentBatchSize} to ${newBatchSize} as a precaution: ${errorMessage}`);
                }
                currentBatchSize = newBatchSize;
            }
        }

        // If all items failed and we have a last error, throw it
        if (results.length === 0 && failures.length > 0 && lastError) {
            throw lastError;
        }

        // If some items failed but others succeeded, log the summary
        if (failures.length > 0) {
            console.warn(`Processed ${results.length} items successfully, but ${failures.length} items failed`);
        }

        return results;
    }

    /**
     * Detect if an error is related to batch size limits
     * Override in provider-specific implementations
     */
    protected isBatchSizeError(error: unknown): boolean {
        return checkBatchSizeError(error);
    }

    /**
     * Generate embeddings for multiple texts
     * Default implementation calls generateEmbeddings once per text, running
     * the calls of each batch concurrently
     *
     * @throws Error prefixed with the provider name when batch processing fails
     */
    async generateBatchEmbeddings(texts: string[]): Promise<Float32Array[]> {
        if (texts.length === 0) {
            return [];
        }

        try {
            return await this.processWithAdaptiveBatch(
                texts,
                async (batch) => {
                    const batchResults = await Promise.all(
                        batch.map(text => this.generateEmbeddings(text))
                    );
                    return batchResults;
                },
                this.isBatchSizeError.bind(this)
            );
        } catch (error) {
            const errorMessage = describeError(error);
            log.error(`Batch embedding error for provider ${this.name}: ${errorMessage}`);
            throw new Error(`${this.name} batch embedding error: ${errorMessage}`);
        }
    }

    /**
     * Generate embeddings for a note with its context
     *
     * The embedded text is the note title and content joined by a space,
     * with empty parts left out.
     */
    async generateNoteEmbeddings(context: NoteEmbeddingContext): Promise<Float32Array> {
        const text = [context.title || "", context.content || ""].filter(Boolean).join(" ");
        return this.generateEmbeddings(text);
    }

    /**
     * Generate embeddings for multiple notes with their contexts
     *
     * @throws Error prefixed with the provider name when batch processing fails
     */
    async generateBatchNoteEmbeddings(contexts: NoteEmbeddingContext[]): Promise<Float32Array[]> {
        if (contexts.length === 0) {
            return [];
        }

        try {
            return await this.processWithAdaptiveBatch(
                contexts,
                async (batch) => {
                    const batchResults = await Promise.all(
                        batch.map(context => this.generateNoteEmbeddings(context))
                    );
                    return batchResults;
                },
                this.isBatchSizeError.bind(this)
            );
        } catch (error) {
            const errorMessage = describeError(error);
            log.error(`Batch note embedding error for provider ${this.name}: ${errorMessage}`);
            throw new Error(`${this.name} batch note embedding error: ${errorMessage}`);
        }
    }

    /**
     * Cleans and normalizes text for embeddings by removing excessive whitespace
     */
    private cleanText(text: string): string {
        return text.replace(/\s+/g, ' ').trim();
    }

    /**
     * Group cleaned titles by relation name, keeping first-seen order.
     *
     * A Map is used instead of a plain object so that relation names such as
     * "constructor" or "toString" cannot collide with inherited properties.
     */
    private groupTitlesByRelation<TLink>(
        links: TLink[],
        relationOf: (link: TLink) => string,
        titleOf: (link: TLink) => string
    ): Map<string, string[]> {
        const grouped = new Map<string, string[]>();

        for (const link of links) {
            const relationName = relationOf(link);
            const cleanedTitle = this.cleanText(titleOf(link));
            const existing = grouped.get(relationName);

            if (existing) {
                existing.push(cleanedTitle);
            } else {
                grouped.set(relationName, [cleanedTitle]);
            }
        }

        return grouped;
    }

    /**
     * Generates a rich text representation of a note's context for embedding
     *
     * The output is a sequence of "Label: value" lines: title, type and MIME,
     * a prose relationship summary, dates, attributes, labels, the structured
     * relationship lists, attachments and finally the content.
     */
    protected generateNoteContextText(context: NoteEmbeddingContext): string {
        // Build a relationship-focused summary first
        const relationshipSummary: string[] = [];

        // Summarize the note's place in the hierarchy
        if (context.parentTitles.length > 0) {
            relationshipSummary.push(`This note is a child of: ${context.parentTitles.map(t => this.cleanText(t)).join(', ')}.`);
        }

        if (context.childTitles.length > 0) {
            relationshipSummary.push(`This note has children: ${context.childTitles.map(t => this.cleanText(t)).join(', ')}.`);
        }

        // Emphasize relationships with other notes
        if (context.relatedNotes && context.relatedNotes.length > 0) {
            // Group by relation type for better understanding
            const relationsByType = this.groupTitlesByRelation(
                context.relatedNotes,
                rel => rel.relationName,
                rel => rel.targetTitle
            );

            for (const [relType, targets] of relationsByType) {
                relationshipSummary.push(`This note has ${relType} relationship with: ${targets.join(', ')}.`);
            }
        }

        // Emphasize backlinks for bidirectional relationships
        if (context.backlinks && context.backlinks.length > 0) {
            // Group by relation type
            const backlinksByType = this.groupTitlesByRelation(
                context.backlinks,
                link => link.relationName,
                link => link.sourceTitle
            );

            for (const [relType, sources] of backlinksByType) {
                relationshipSummary.push(`This note is ${relType} of: ${sources.join(', ')}.`);
            }
        }

        // Emphasize templates/inheritance
        if (context.templateTitles && context.templateTitles.length > 0) {
            relationshipSummary.push(`This note inherits from: ${context.templateTitles.map(t => this.cleanText(t)).join(', ')}.`);
        }

        // Start with core note information
        let result =
            `Title: ${this.cleanText(context.title)}\n` +
            `Type: ${context.type}\n` +
            `MIME: ${context.mime}\n`;

        // Add the relationship summary at the beginning for emphasis
        if (relationshipSummary.length > 0) {
            result += `Relationships: ${relationshipSummary.join(' ')}\n`;
        }

        // Continue with dates
        result +=
            `Created: ${context.dateCreated}\n` +
            `Modified: ${context.dateModified}\n`;

        // Add attributes in a concise format
        if (context.attributes.length > 0) {
            result += 'Attributes: ';
            const attributeTexts = context.attributes.map(attr =>
                `${attr.type}:${attr.name}=${this.cleanText(attr.value)}`
            );
            result += attributeTexts.join('; ') + '\n';
        }

        // Add important label values concisely
        if (context.labelValues && Object.keys(context.labelValues).length > 0) {
            result += 'Labels: ';
            const labelTexts = Object.entries(context.labelValues).map(([name, value]) =>
                `${name}=${this.cleanText(value)}`
            );
            result += labelTexts.join('; ') + '\n';
        }

        // Parents, children, templates, relations, and backlinks are now handled in the relationship summary
        // But we'll include them again in a more structured format for organization
        if (context.parentTitles.length > 0) {
            result += `Parents: ${context.parentTitles.map(t => this.cleanText(t)).join('; ')}\n`;
        }

        if (context.childTitles.length > 0) {
            result += `Children: ${context.childTitles.map(t => this.cleanText(t)).join('; ')}\n`;
        }

        if (context.templateTitles && context.templateTitles.length > 0) {
            result += `Templates: ${context.templateTitles.map(t => this.cleanText(t)).join('; ')}\n`;
        }

        if (context.relatedNotes && context.relatedNotes.length > 0) {
            result += 'Related: ';
            const relatedTexts = context.relatedNotes.map(rel =>
                `${rel.relationName}→${this.cleanText(rel.targetTitle)}`
            );
            result += relatedTexts.join('; ') + '\n';
        }

        if (context.backlinks && context.backlinks.length > 0) {
            result += 'Referenced By: ';
            const backlinkTexts = context.backlinks.map(link =>
                `${this.cleanText(link.sourceTitle)}→${link.relationName}`
            );
            result += backlinkTexts.join('; ') + '\n';
        }

        // Add attachments concisely
        if (context.attachments.length > 0) {
            result += 'Attachments: ';
            const attachmentTexts = context.attachments.map(att =>
                `${this.cleanText(att.title)}(${att.mime})`
            );
            result += attachmentTexts.join('; ') + '\n';
        }

        // Add content (already cleaned in getNoteEmbeddingContext)
        result += `Content: ${context.content}`;

        return result;
    }

    /**
     * Process a batch of items with automatic retries and batch size adjustment
     *
     * Thin wrapper around processWithAdaptiveBatch that fixes the result type
     * to Float32Array and defaults the batch-size-error predicate to this
     * provider's own isBatchSizeError.
     */
    protected async processBatchWithRetries<T>(
        items: T[],
        processFn: (batch: T[]) => Promise<Float32Array[]>,
        isBatchSizeError: (error: unknown) => boolean = this.isBatchSizeError.bind(this)
    ): Promise<Float32Array[]> {
        return this.processWithAdaptiveBatch<T, Float32Array>(items, processFn, isBatchSizeError);
    }
}