From f05fe3f72bd186840ae1cfc662b0dbce32dddedd Mon Sep 17 00:00:00 2001 From: perf3ct Date: Tue, 18 Mar 2025 21:09:19 +0000 Subject: [PATCH] set up embedding normalization --- .../llm/embeddings/base_embeddings.ts | 10 + .../llm/embeddings/embeddings_interface.ts | 49 ++ src/services/llm/embeddings/providers.ts | 9 + .../llm/embeddings/providers/ollama.ts | 24 +- .../llm/embeddings/providers/openai.ts | 24 +- .../llm/embeddings/providers/voyage.ts | 64 +- src/services/llm/embeddings/storage.ts | 140 ++- src/services/llm/embeddings/vector_utils.ts | 797 +++++++++++++++++- src/services/llm/index_service.ts | 39 +- 9 files changed, 1079 insertions(+), 77 deletions(-) diff --git a/src/services/llm/embeddings/base_embeddings.ts b/src/services/llm/embeddings/base_embeddings.ts index 4f1f09b60..3e1c81143 100644 --- a/src/services/llm/embeddings/base_embeddings.ts +++ b/src/services/llm/embeddings/base_embeddings.ts @@ -1,4 +1,5 @@ import type { EmbeddingProvider, EmbeddingConfig, NoteEmbeddingContext } from './embeddings_interface.js'; +import { NormalizationStatus } from './embeddings_interface.js'; import log from "../../log.js"; import { LLM_CONSTANTS } from "../../../routes/api/llm.js"; import options from "../../options.js"; @@ -23,6 +24,15 @@ export abstract class BaseEmbeddingProvider implements EmbeddingProvider { return { ...this.config }; } + /** + * Get the normalization status of this provider + * Default implementation returns the status from config if available, + * otherwise returns UNKNOWN status + */ + getNormalizationStatus(): NormalizationStatus { + return this.config.normalizationStatus || NormalizationStatus.UNKNOWN; + } + getDimension(): number { return this.config.dimension; } diff --git a/src/services/llm/embeddings/embeddings_interface.ts b/src/services/llm/embeddings/embeddings_interface.ts index b5e512b6a..2731aef0d 100644 --- a/src/services/llm/embeddings/embeddings_interface.ts +++ b/src/services/llm/embeddings/embeddings_interface.ts @@ -42,6 +42,35 @@ export interface NoteEmbeddingContext { export interface EmbeddingModelInfo { dimension: number; contextWindow: number; + /** + * Whether the model guarantees normalized vectors (unit length) + */ + guaranteesNormalization: boolean; +} + +/** + * Normalization status of a provider's embeddings + */ +export enum NormalizationStatus { + /** + * Provider guarantees all embeddings are normalized to unit vectors + */ + GUARANTEED = 'guaranteed', + + /** + * Provider does not guarantee normalization, but embeddings are usually normalized + */ + USUALLY = 'usually', + + /** + * Provider does not guarantee normalization, embeddings must be normalized before use + */ + NEVER = 'never', + + /** + * Normalization status is unknown and should be checked at runtime + */ + UNKNOWN = 'unknown' } /** @@ -51,7 +80,16 @@ export interface EmbeddingConfig { model: string; dimension: number; type: 'float32' | 'float64'; + /** + * Whether embeddings should be normalized before use + * If true, normalization will always be applied + * If false, normalization depends on provider's status + */ normalize?: boolean; + /** + * The normalization status of this provider + */ + normalizationStatus?: NormalizationStatus; batchSize?: number; contextWindowSize?: number; apiKey?: string; @@ -65,6 +103,17 @@ export interface EmbeddingProvider { name: string; getConfig(): EmbeddingConfig; + /** + * Returns information about the normalization status of this provider + */ + getNormalizationStatus(): NormalizationStatus; + + /** + * Verify that embeddings are 
properly normalized + * @returns true if embeddings are properly normalized + */ + verifyNormalization?(sample?: Float32Array): Promise<boolean>; + /** * Generate embeddings for a single piece of text */ diff --git a/src/services/llm/embeddings/providers.ts b/src/services/llm/embeddings/providers.ts index b74adfb80..cb6bc6bac 100644 --- a/src/services/llm/embeddings/providers.ts +++ b/src/services/llm/embeddings/providers.ts @@ -4,6 +4,7 @@ import sql from "../../sql.js"; import dateUtils from "../../date_utils.js"; import { randomString } from "../../utils.js"; import type { EmbeddingProvider, EmbeddingConfig } from "./embeddings_interface.js"; +import { NormalizationStatus } from "./embeddings_interface.js"; import { OpenAIEmbeddingProvider } from "./providers/openai.js"; import { OllamaEmbeddingProvider } from "./providers/ollama.js"; import { VoyageEmbeddingProvider } from "./providers/voyage.js"; @@ -25,6 +26,14 @@ class SimpleLocalEmbeddingProvider implements EmbeddingProvider { return this.config; } + /** + * Returns the normalization status of the local provider + * Local provider does not guarantee normalization + */ + getNormalizationStatus(): NormalizationStatus { + return NormalizationStatus.NEVER; // Simple embedding does not normalize vectors + } + async generateEmbeddings(text: string): Promise<Float32Array> { // Create deterministic embeddings based on text content const result = new Float32Array(this.config.dimension || 384); diff --git a/src/services/llm/embeddings/providers/ollama.ts b/src/services/llm/embeddings/providers/ollama.ts index ba125b068..ca2c73291 100644 --- a/src/services/llm/embeddings/providers/ollama.ts +++ b/src/services/llm/embeddings/providers/ollama.ts @@ -2,6 +2,7 @@ import axios from "axios"; import log from "../../../log.js"; import { BaseEmbeddingProvider } from "../base_embeddings.js"; import type { EmbeddingConfig, EmbeddingModelInfo } from "../embeddings_interface.js"; +import { NormalizationStatus } from "../embeddings_interface.js"; import { LLM_CONSTANTS } from "../../../../routes/api/llm.js"; /** @@ -63,7 +64,8 @@ export class OllamaEmbeddingProvider extends BaseEmbeddingProvider { return { dimension: embeddingDimension || 0, // We'll detect this separately if not provided - contextWindow: contextWindow + contextWindow: contextWindow, + guaranteesNormalization: false // Ollama models don't guarantee normalized embeddings }; } } catch (error: any) { @@ -113,7 +115,11 @@ export class OllamaEmbeddingProvider extends BaseEmbeddingProvider { const contextWindow = (LLM_CONSTANTS.OLLAMA_MODEL_CONTEXT_WINDOWS as Record<string, number>)[baseModelName] || (LLM_CONSTANTS.OLLAMA_MODEL_CONTEXT_WINDOWS as Record<string, number>).default; - const modelInfo: EmbeddingModelInfo = { dimension, contextWindow }; + const modelInfo: EmbeddingModelInfo = { + dimension, + contextWindow, + guaranteesNormalization: false // Ollama models don't guarantee normalized embeddings + }; this.modelInfoCache.set(modelName, modelInfo); this.config.dimension = dimension; @@ -131,7 +137,11 @@ export class OllamaEmbeddingProvider extends BaseEmbeddingProvider { log.info(`Using default parameters for model ${modelName}: dimension ${dimension}, context ${contextWindow}`); - const modelInfo: EmbeddingModelInfo = { dimension, contextWindow }; + const modelInfo: EmbeddingModelInfo = { + dimension, + contextWindow, + guaranteesNormalization: false // Ollama models don't guarantee normalized embeddings + }; this.modelInfoCache.set(modelName, modelInfo); this.config.dimension = dimension; @@ -302,4 +312,12 @@ export class 
OllamaEmbeddingProvider extends BaseEmbeddingProvider { throw new Error(`Ollama batch embedding error: ${errorMessage}`); } } + + /** + * Returns the normalization status for Ollama embeddings + * Ollama embeddings are not guaranteed to be normalized + */ + getNormalizationStatus(): NormalizationStatus { + return NormalizationStatus.NEVER; // Be conservative and always normalize + } } diff --git a/src/services/llm/embeddings/providers/openai.ts b/src/services/llm/embeddings/providers/openai.ts index 116fd5c9b..6e99c297f 100644 --- a/src/services/llm/embeddings/providers/openai.ts +++ b/src/services/llm/embeddings/providers/openai.ts @@ -2,6 +2,7 @@ import axios from "axios"; import log from "../../../log.js"; import { BaseEmbeddingProvider } from "../base_embeddings.js"; import type { EmbeddingConfig, EmbeddingModelInfo } from "../embeddings_interface.js"; +import { NormalizationStatus } from "../embeddings_interface.js"; import { LLM_CONSTANTS } from "../../../../routes/api/llm.js"; /** @@ -105,7 +106,8 @@ export class OpenAIEmbeddingProvider extends BaseEmbeddingProvider { return { dimension, - contextWindow + contextWindow, + guaranteesNormalization: true // OpenAI embeddings are normalized to unit length }; } } catch (error: any) { @@ -141,7 +143,11 @@ export class OpenAIEmbeddingProvider extends BaseEmbeddingProvider { // Use default context window let contextWindow = LLM_CONSTANTS.CONTEXT_WINDOW.OPENAI; - const modelInfo: EmbeddingModelInfo = { dimension, contextWindow }; + const modelInfo: EmbeddingModelInfo = { + dimension, + contextWindow, + guaranteesNormalization: true // OpenAI embeddings are normalized to unit length + }; this.modelInfoCache.set(modelName, modelInfo); this.config.dimension = dimension; @@ -154,7 +160,11 @@ export class OpenAIEmbeddingProvider extends BaseEmbeddingProvider { log.info(`Using default parameters for OpenAI model ${modelName}: dimension ${dimension}, context ${contextWindow}`); - const modelInfo: EmbeddingModelInfo = { dimension, contextWindow }; + const modelInfo: EmbeddingModelInfo = { + dimension, + contextWindow, + guaranteesNormalization: true // OpenAI embeddings are normalized to unit length + }; this.modelInfoCache.set(modelName, modelInfo); this.config.dimension = dimension; @@ -288,4 +298,12 @@ export class OpenAIEmbeddingProvider extends BaseEmbeddingProvider { throw new Error(`OpenAI batch embedding error: ${errorMessage}`); } } + + /** + * Returns the normalization status for OpenAI embeddings + * OpenAI embeddings are guaranteed to be normalized to unit length + */ + getNormalizationStatus(): NormalizationStatus { + return NormalizationStatus.GUARANTEED; + } } diff --git a/src/services/llm/embeddings/providers/voyage.ts b/src/services/llm/embeddings/providers/voyage.ts index 97748b075..9b22822d4 100644 --- a/src/services/llm/embeddings/providers/voyage.ts +++ b/src/services/llm/embeddings/providers/voyage.ts @@ -2,6 +2,7 @@ import axios from "axios"; import log from "../../../log.js"; import { BaseEmbeddingProvider } from "../base_embeddings.js"; import type { EmbeddingConfig, EmbeddingModelInfo } from "../embeddings_interface.js"; +import { NormalizationStatus } from "../embeddings_interface.js"; import { LLM_CONSTANTS } from "../../../../routes/api/llm.js"; // Voyage model context window sizes - as of current API version @@ -68,7 +69,8 @@ export class VoyageEmbeddingProvider extends BaseEmbeddingProvider { return { dimension, - contextWindow + contextWindow, + guaranteesNormalization: true // Voyage embeddings are typically 
normalized }; } catch (error) { log.info(`Could not determine capabilities for Voyage AI model ${modelName}: ${error}`); @@ -96,7 +98,8 @@ export class VoyageEmbeddingProvider extends BaseEmbeddingProvider { // Use known dimension const modelInfo: EmbeddingModelInfo = { dimension: knownDimension, - contextWindow + contextWindow, + guaranteesNormalization: true // Voyage embeddings are typically normalized }; this.modelInfoCache.set(modelName, modelInfo); @@ -109,28 +112,41 @@ export class VoyageEmbeddingProvider extends BaseEmbeddingProvider { const testEmbedding = await this.generateEmbeddings("Test"); const dimension = testEmbedding.length; - const modelInfo: EmbeddingModelInfo = { - dimension, - contextWindow - }; - - this.modelInfoCache.set(modelName, modelInfo); - this.config.dimension = dimension; - - log.info(`Detected Voyage AI model ${modelName} with dimension ${dimension} (context: ${contextWindow})`); - return modelInfo; + // Set model info based on the model name, detected dimension, and reasonable defaults + if (modelName.includes('voyage-2')) { + return { + dimension: dimension || 1024, + contextWindow: 4096, + guaranteesNormalization: true // Voyage-2 embeddings are normalized + }; + } else if (modelName.includes('voyage-lite-02')) { + return { + dimension: dimension || 768, + contextWindow: 4096, + guaranteesNormalization: true // Voyage-lite embeddings are normalized + }; + } else { + // Default for other Voyage models + return { + dimension: dimension || 1024, + contextWindow: 4096, + guaranteesNormalization: true // Assuming all Voyage embeddings are normalized + }; + } } } catch (error: any) { - // If detection fails, use defaults - const dimension = 1024; // Default for Voyage models + log.info(`Could not fetch model info from Voyage AI API: ${error.message}. 
Using defaults.`); - log.info(`Using default parameters for Voyage AI model ${modelName}: dimension ${dimension}, context ${contextWindow}`); + // Use default parameters if everything else fails + const defaultModelInfo: EmbeddingModelInfo = { + dimension: 1024, // Default for Voyage models + contextWindow: 8192, + guaranteesNormalization: true // Voyage embeddings are typically normalized + }; - const modelInfo: EmbeddingModelInfo = { dimension, contextWindow }; - this.modelInfoCache.set(modelName, modelInfo); - this.config.dimension = dimension; - - return modelInfo; + this.modelInfoCache.set(modelName, defaultModelInfo); + this.config.dimension = defaultModelInfo.dimension; + return defaultModelInfo; } } @@ -251,4 +267,12 @@ export class VoyageEmbeddingProvider extends BaseEmbeddingProvider { throw new Error(`Voyage AI batch embedding error: ${errorMessage}`); } } + + /** + * Returns the normalization status for Voyage embeddings + * Voyage embeddings are generally normalized by the API + */ + getNormalizationStatus(): NormalizationStatus { + return NormalizationStatus.GUARANTEED; + } } diff --git a/src/services/llm/embeddings/storage.ts b/src/services/llm/embeddings/storage.ts index 4e7cc2f14..44ae01db1 100644 --- a/src/services/llm/embeddings/storage.ts +++ b/src/services/llm/embeddings/storage.ts @@ -112,6 +112,26 @@ export async function getEmbeddingForNote(noteId: string, providerId: string, mo }; } +// Create an interface that represents the embedding row from the database +interface EmbeddingRow { + embedId: string; + noteId: string; + providerId: string; + modelId: string; + dimension: number; + embedding: Buffer; + title?: string; + type?: string; + mime?: string; + isDeleted?: number; +} + +// Interface for enhanced embedding with query model information +interface EnhancedEmbeddingRow extends EmbeddingRow { + queryProviderId: string; + queryModelId: string; +} + /** * Finds similar notes based on vector similarity */ @@ -122,7 +142,7 @@ export async function findSimilarNotes( limit = 10, threshold?: number, // Made optional to use constants useFallback = true // Whether to try other providers if no embeddings found -): Promise<{noteId: string, similarity: number}[]> { +): Promise<{noteId: string, similarity: number, contentType?: string}[]> { // Import constants dynamically to avoid circular dependencies const llmModule = await import('../../../routes/api/llm.js'); // Use a default threshold of 0.65 if not provided @@ -138,11 +158,30 @@ export async function findSimilarNotes( FROM note_embeddings ne JOIN notes n ON ne.noteId = n.noteId WHERE ne.providerId = ? AND ne.modelId = ? 
AND n.isDeleted = 0 - `, [providerId, modelId]); + `, [providerId, modelId]) as EmbeddingRow[]; if (embeddings && embeddings.length > 0) { log.info(`Found ${embeddings.length} embeddings for provider ${providerId}, model ${modelId}`); - return await processEmbeddings(embedding, embeddings, actualThreshold, limit); + + // Add query model information to each embedding for cross-model comparison + const enhancedEmbeddings: EnhancedEmbeddingRow[] = embeddings.map(e => { + return { + embedId: e.embedId, + noteId: e.noteId, + providerId: e.providerId, + modelId: e.modelId, + dimension: e.dimension, + embedding: e.embedding, + title: e.title, + type: e.type, + mime: e.mime, + isDeleted: e.isDeleted, + queryProviderId: providerId, + queryModelId: modelId + }; + }); + + return await processEmbeddings(embedding, enhancedEmbeddings, actualThreshold, limit); } // If no embeddings found and fallback is allowed, try other providers @@ -195,10 +234,28 @@ export async function findSimilarNotes( FROM note_embeddings ne JOIN notes n ON ne.noteId = n.noteId WHERE ne.providerId = ? AND ne.modelId = ? AND n.isDeleted = 0 - `, [bestAlternative.providerId, bestAlternative.modelId]); + `, [bestAlternative.providerId, bestAlternative.modelId]) as EmbeddingRow[]; if (alternativeEmbeddings && alternativeEmbeddings.length > 0) { - return await processEmbeddings(embedding, alternativeEmbeddings, actualThreshold, limit); + // Add query model information to each embedding for cross-model comparison + const enhancedEmbeddings: EnhancedEmbeddingRow[] = alternativeEmbeddings.map(e => { + return { + embedId: e.embedId, + noteId: e.noteId, + providerId: e.providerId, + modelId: e.modelId, + dimension: e.dimension, + embedding: e.embedding, + title: e.title, + type: e.type, + mime: e.mime, + isDeleted: e.isDeleted, + queryProviderId: providerId, + queryModelId: modelId + }; + }); + + return await processEmbeddings(embedding, enhancedEmbeddings, actualThreshold, limit); } } } else { @@ -256,24 +313,71 @@ export async function findSimilarNotes( // Helper function to process embeddings and calculate similarities async function processEmbeddings(queryEmbedding: Float32Array, embeddings: any[], threshold: number, limit: number) { - const { enhancedCosineSimilarity, bufferToEmbedding } = await import('./vector_utils.js'); + const { + enhancedCosineSimilarity, + bufferToEmbedding, + ContentType, + PerformanceProfile, + detectContentType, + vectorDebugConfig + } = await import('./vector_utils.js'); + + // Enable debug logging temporarily for testing content-aware adaptation + const originalDebugEnabled = vectorDebugConfig.enabled; + const originalLogLevel = vectorDebugConfig.logLevel; + vectorDebugConfig.enabled = true; + vectorDebugConfig.logLevel = 'debug'; + vectorDebugConfig.recordStats = true; + const similarities = []; - for (const e of embeddings) { - const embVector = bufferToEmbedding(e.embedding, e.dimension); - const similarity = enhancedCosineSimilarity(queryEmbedding, embVector); + try { + for (const e of embeddings) { + const embVector = bufferToEmbedding(e.embedding, e.dimension); - if (similarity >= threshold) { - similarities.push({ - noteId: e.noteId, - similarity: similarity - }); + // Detect content type from mime type if available + let contentType = ContentType.GENERAL_TEXT; + if (e.mime) { + contentType = detectContentType(e.mime); + console.log(`Note ID: ${e.noteId}, Mime: ${e.mime}, Detected content type: ${contentType}`); + } + + // Select performance profile based on embedding size and use case + // For 
most similarity searches, BALANCED is a good default + const performanceProfile = PerformanceProfile.BALANCED; + + // Determine if this is cross-model comparison + const isCrossModel = e.providerId !== e.queryProviderId || e.modelId !== e.queryModelId; + + // Calculate similarity with content-aware parameters + const similarity = enhancedCosineSimilarity( + queryEmbedding, + embVector, + true, // normalize vectors to ensure consistent comparison + e.queryModelId, // source model ID + e.modelId, // target model ID + contentType, // content-specific padding strategy + performanceProfile + ); + + if (similarity >= threshold) { + similarities.push({ + noteId: e.noteId, + similarity: similarity, + contentType: contentType.toString() + }); + } } - } - return similarities - .sort((a, b) => b.similarity - a.similarity) - .slice(0, limit); + return similarities + .sort((a, b) => b.similarity - a.similarity) + .slice(0, limit); + } finally { + // Restore original debug settings + vectorDebugConfig.enabled = originalDebugEnabled; + vectorDebugConfig.logLevel = originalLogLevel; + vectorDebugConfig.recordStats = false; + } } /** diff --git a/src/services/llm/embeddings/vector_utils.ts b/src/services/llm/embeddings/vector_utils.ts index fccfed24a..c28c4681f 100644 --- a/src/services/llm/embeddings/vector_utils.ts +++ b/src/services/llm/embeddings/vector_utils.ts @@ -1,34 +1,144 @@ /** * Computes the cosine similarity between two vectors * If dimensions don't match, automatically adapts using the enhanced approach + * @param normalize Optional flag to normalize vectors before comparison (default: false) + * @param sourceModel Optional identifier for the source model + * @param targetModel Optional identifier for the target model + * @param contentType Optional content type for strategy selection + * @param performanceProfile Optional performance profile */ -export function cosineSimilarity(a: Float32Array, b: Float32Array): number { +export function cosineSimilarity( + a: Float32Array, + b: Float32Array, + normalize: boolean = false, + sourceModel?: string, + targetModel?: string, + contentType?: ContentType, + performanceProfile?: PerformanceProfile +): number { // Use the enhanced approach that preserves more information - return enhancedCosineSimilarity(a, b); + return enhancedCosineSimilarity(a, b, normalize, sourceModel, targetModel, contentType, performanceProfile); } /** * Enhanced cosine similarity that adaptively handles different dimensions * Instead of truncating larger embeddings, it pads smaller ones to preserve information + * @param normalize Optional flag to normalize vectors before comparison (default: false) + * @param sourceModel Optional identifier for the source model + * @param targetModel Optional identifier for the target model + * @param contentType Optional content type for strategy selection + * @param performanceProfile Optional performance profile */ -export function enhancedCosineSimilarity(a: Float32Array, b: Float32Array): number { +export function enhancedCosineSimilarity( + a: Float32Array, + b: Float32Array, + normalize: boolean = false, + sourceModel?: string, + targetModel?: string, + contentType?: ContentType, + performanceProfile?: PerformanceProfile +): number { + // If normalization is requested, normalize vectors first + if (normalize) { + a = normalizeVector(a); + b = normalizeVector(b); + } + // If dimensions match, use standard calculation if (a.length === b.length) { return standardCosineSimilarity(a, b); } - // Always adapt smaller embedding to larger one 
to preserve maximum information + // Log dimension adaptation + debugLog(`Dimension mismatch: ${a.length} vs ${b.length}. Adapting dimensions...`, 'info'); + + // Determine if models are different + const isCrossModelComparison = sourceModel !== targetModel && + sourceModel !== undefined && + targetModel !== undefined; + + // Context for strategy selection + const context: StrategySelectionContext = { + contentType: contentType || ContentType.GENERAL_TEXT, + performanceProfile: performanceProfile || PerformanceProfile.BALANCED, + sourceDimension: a.length, + targetDimension: b.length, + sourceModel, + targetModel, + isCrossModelComparison + }; + + // Select the optimal strategy based on context + let adaptOptions: AdaptationOptions; + if (a.length > b.length) { // Pad b to match a's dimensions - const adaptedB = adaptEmbeddingDimensions(b, a.length); + debugLog(`Adapting embedding B (${b.length}D) to match A (${a.length}D)`, 'debug'); + + // Get optimal strategy + adaptOptions = selectOptimalPaddingStrategy(context); + const adaptedB = adaptEmbeddingDimensions(b, a.length, adaptOptions); + + // Record stats + recordAdaptationStats({ + operation: 'dimension_adaptation', + sourceModel: targetModel, + targetModel: sourceModel, + sourceDimension: b.length, + targetDimension: a.length, + strategy: adaptOptions.strategy + }); + return standardCosineSimilarity(a, adaptedB); } else { // Pad a to match b's dimensions - const adaptedA = adaptEmbeddingDimensions(a, b.length); + debugLog(`Adapting embedding A (${a.length}D) to match B (${b.length}D)`, 'debug'); + + // Get optimal strategy + adaptOptions = selectOptimalPaddingStrategy(context); + const adaptedA = adaptEmbeddingDimensions(a, b.length, adaptOptions); + + // Record stats + recordAdaptationStats({ + operation: 'dimension_adaptation', + sourceModel: sourceModel, + targetModel: targetModel, + sourceDimension: a.length, + targetDimension: b.length, + strategy: adaptOptions.strategy + }); + return standardCosineSimilarity(adaptedA, b); } } +/** + * Normalizes a vector to unit length + * @param vector The vector to normalize + * @returns A new normalized vector + */ +export function normalizeVector(vector: Float32Array): Float32Array { + let magnitude = 0; + for (let i = 0; i < vector.length; i++) { + magnitude += vector[i] * vector[i]; + } + + magnitude = Math.sqrt(magnitude); + + // If vector is already normalized or is a zero vector, return a copy + if (magnitude === 0 || Math.abs(magnitude - 1.0) < 1e-6) { + return new Float32Array(vector); + } + + // Create a new normalized vector + const normalized = new Float32Array(vector.length); + for (let i = 0; i < vector.length; i++) { + normalized[i] = vector[i] / magnitude; + } + + return normalized; +} + /** * Standard cosine similarity for same-dimension vectors */ @@ -75,28 +185,111 @@ export function selectOptimalEmbedding(embeddings: Array<{ } /** - * Adapts an embedding to match target dimensions - * Uses a simple truncation (if source is larger) or zero-padding (if source is smaller) + * Padding strategy options for dimension adaptation + */ +export enum PaddingStrategy { + ZERO = 'zero', // Simple zero padding (default) + MEAN = 'mean', // Padding with mean value of source embedding + GAUSSIAN = 'gaussian', // Padding with Gaussian noise based on source statistics + MIRROR = 'mirror' // Mirroring existing values for padding +} + +/** + * Configuration for embedding adaptation + */ +export interface AdaptationOptions { + strategy: PaddingStrategy; + seed?: number; // Seed for random number 
generation (gaussian) + variance?: number; // Variance for gaussian noise (default: 0.01) + normalize?: boolean; // Whether to normalize after adaptation +} + +/** + * Adapts an embedding to match target dimensions with configurable strategies * * @param sourceEmbedding The original embedding * @param targetDimension The desired dimension + * @param options Configuration options for the adaptation * @returns A new embedding with the target dimensions */ -export function adaptEmbeddingDimensions(sourceEmbedding: Float32Array, targetDimension: number): Float32Array { +export function adaptEmbeddingDimensions( + sourceEmbedding: Float32Array, + targetDimension: number, + options: AdaptationOptions = { strategy: PaddingStrategy.ZERO, normalize: true } +): Float32Array { const sourceDimension = sourceEmbedding.length; - // If dimensions already match, return the original + // If dimensions already match, return a copy of the original if (sourceDimension === targetDimension) { - return sourceEmbedding; + return new Float32Array(sourceEmbedding); } // Create a new embedding with target dimensions const adaptedEmbedding = new Float32Array(targetDimension); if (sourceDimension < targetDimension) { - // If source is smaller, copy all values and pad with zeros + // Copy all source values first adaptedEmbedding.set(sourceEmbedding); - // Rest of the array is already initialized to zeros + + // Apply the selected padding strategy + switch (options.strategy) { + case PaddingStrategy.ZERO: + // Zero padding is already done by default + break; + + case PaddingStrategy.MEAN: + // Calculate mean of source embedding + let sum = 0; + for (let i = 0; i < sourceDimension; i++) { + sum += sourceEmbedding[i]; + } + const mean = sum / sourceDimension; + + // Fill remaining dimensions with mean value + for (let i = sourceDimension; i < targetDimension; i++) { + adaptedEmbedding[i] = mean; + } + break; + + case PaddingStrategy.GAUSSIAN: + // Calculate mean and standard deviation of source embedding + let meanSum = 0; + for (let i = 0; i < sourceDimension; i++) { + meanSum += sourceEmbedding[i]; + } + const meanValue = meanSum / sourceDimension; + + let varianceSum = 0; + for (let i = 0; i < sourceDimension; i++) { + varianceSum += Math.pow(sourceEmbedding[i] - meanValue, 2); + } + const variance = options.variance ?? 
Math.min(0.01, varianceSum / sourceDimension); + const stdDev = Math.sqrt(variance); + + // Fill remaining dimensions with Gaussian noise + for (let i = sourceDimension; i < targetDimension; i++) { + // Box-Muller transform for Gaussian distribution + const u1 = Math.random(); + const u2 = Math.random(); + const z0 = Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math.PI * u2); + + adaptedEmbedding[i] = meanValue + stdDev * z0; + } + break; + + case PaddingStrategy.MIRROR: + // Mirror existing values for padding + for (let i = sourceDimension; i < targetDimension; i++) { + // Cycle through source values in reverse order + const mirrorIndex = sourceDimension - 1 - ((i - sourceDimension) % sourceDimension); + adaptedEmbedding[i] = sourceEmbedding[mirrorIndex]; + } + break; + + default: + // Default to zero padding + break; + } } else { // If source is larger, truncate to target dimension for (let i = 0; i < targetDimension; i++) { @@ -104,17 +297,9 @@ export function adaptEmbeddingDimensions(sourceEmbedding: Float32Array, targetDi } } - // Normalize the adapted embedding to maintain unit length - let magnitude = 0; - for (let i = 0; i < targetDimension; i++) { - magnitude += adaptedEmbedding[i] * adaptedEmbedding[i]; - } - - magnitude = Math.sqrt(magnitude); - if (magnitude > 0) { - for (let i = 0; i < targetDimension; i++) { - adaptedEmbedding[i] /= magnitude; - } + // Normalize the adapted embedding if requested + if (options.normalize) { + return normalizeVector(adaptedEmbedding); } return adaptedEmbedding; @@ -133,3 +318,567 @@ export function embeddingToBuffer(embedding: Float32Array): Buffer { export function bufferToEmbedding(buffer: Buffer, dimension: number): Float32Array { return new Float32Array(buffer.buffer, buffer.byteOffset, dimension); } + +/** + * Similarity metric options + */ +export enum SimilarityMetric { + COSINE = 'cosine', // Standard cosine similarity + DOT_PRODUCT = 'dot_product', // Simple dot product (assumes normalized vectors) + HYBRID = 'hybrid', // Dot product + cosine hybrid + DIM_AWARE = 'dimension_aware', // Dimension-aware similarity that factors in dimension differences + ENSEMBLE = 'ensemble' // Combined score from multiple metrics +} + +/** + * Configuration for similarity calculation + */ +export interface SimilarityOptions { + metric: SimilarityMetric; + normalize?: boolean; + ensembleWeights?: {[key in SimilarityMetric]?: number}; + dimensionPenalty?: number; // Penalty factor for dimension differences (0 to 1) + sourceModel?: string; // Source model identifier + targetModel?: string; // Target model identifier + contentType?: ContentType; // Type of content being compared + performanceProfile?: PerformanceProfile; // Performance requirements +} + +/** + * Computes similarity between two vectors using the specified metric + * @param a First vector + * @param b Second vector + * @param options Similarity calculation options + */ +export function computeSimilarity( + a: Float32Array, + b: Float32Array, + options: SimilarityOptions = { metric: SimilarityMetric.COSINE } +): number { + // Apply normalization if requested + const normalize = options.normalize ?? false; + + switch (options.metric) { + case SimilarityMetric.COSINE: + return cosineSimilarity( + a, b, normalize, + options.sourceModel, options.targetModel, + options.contentType, options.performanceProfile + ); + + case SimilarityMetric.DOT_PRODUCT: + // Dot product assumes normalized vectors for proper similarity measurement + const aNorm = normalize ? 
normalizeVector(a) : a; + const bNorm = normalize ? normalizeVector(b) : b; + return computeDotProduct(aNorm, bNorm, options); + + case SimilarityMetric.HYBRID: + // Hybrid approach combines dot product with cosine similarity + // More robust against small perturbations while maintaining angle sensitivity + return hybridSimilarity(a, b, normalize, options); + + case SimilarityMetric.DIM_AWARE: + // Dimension-aware similarity that factors in dimension differences + return dimensionAwareSimilarity( + a, b, normalize, + options.dimensionPenalty ?? 0.1, + options.contentType, + options.performanceProfile + ); + + case SimilarityMetric.ENSEMBLE: + // Ensemble scoring combines multiple metrics with weights + return ensembleSimilarity(a, b, options); + + default: + // Default to cosine similarity + return cosineSimilarity( + a, b, normalize, + options.sourceModel, options.targetModel, + options.contentType, options.performanceProfile + ); + } +} + +/** + * Computes dot product between two vectors + */ +export function computeDotProduct( + a: Float32Array, + b: Float32Array, + options?: Pick +): number { + // Adapt dimensions if needed + if (a.length !== b.length) { + // Create context for strategy selection if dimensions don't match + if (options) { + const context: StrategySelectionContext = { + contentType: options.contentType || ContentType.GENERAL_TEXT, + performanceProfile: options.performanceProfile || PerformanceProfile.BALANCED, + sourceDimension: a.length, + targetDimension: b.length, + sourceModel: options.sourceModel, + targetModel: options.targetModel, + isCrossModelComparison: options.sourceModel !== options.targetModel && + options.sourceModel !== undefined && + options.targetModel !== undefined + }; + + if (a.length > b.length) { + const adaptOptions = selectOptimalPaddingStrategy(context); + b = adaptEmbeddingDimensions(b, a.length, adaptOptions); + } else { + const adaptOptions = selectOptimalPaddingStrategy(context); + a = adaptEmbeddingDimensions(a, b.length, adaptOptions); + } + } else { + // Default behavior without options + if (a.length > b.length) { + b = adaptEmbeddingDimensions(b, a.length); + } else { + a = adaptEmbeddingDimensions(a, b.length); + } + } + } + + let dotProduct = 0; + for (let i = 0; i < a.length; i++) { + dotProduct += a[i] * b[i]; + } + + return dotProduct; +} + +/** + * Hybrid similarity combines dot product and cosine similarity + * Provides robustness against small perturbations while maintaining angle sensitivity + */ +export function hybridSimilarity( + a: Float32Array, + b: Float32Array, + normalize: boolean = false, + options?: Pick +): number { + // Get cosine similarity with full options + const cosine = cosineSimilarity( + a, b, normalize, + options?.sourceModel, options?.targetModel, + options?.contentType, options?.performanceProfile + ); + + // For dot product, we should always normalize + const aNorm = normalize ? a : normalizeVector(a); + const bNorm = normalize ? 
b : normalizeVector(b); + + // If dimensions don't match, adapt with optimal strategy + let adaptedA = aNorm; + let adaptedB = bNorm; + + if (aNorm.length !== bNorm.length) { + // Use optimal padding strategy + if (options) { + const context: StrategySelectionContext = { + contentType: options.contentType || ContentType.GENERAL_TEXT, + performanceProfile: options.performanceProfile || PerformanceProfile.BALANCED, + sourceDimension: aNorm.length, + targetDimension: bNorm.length, + sourceModel: options.sourceModel, + targetModel: options.targetModel, + isCrossModelComparison: options.sourceModel !== options.targetModel && + options.sourceModel !== undefined && + options.targetModel !== undefined + }; + + if (aNorm.length < bNorm.length) { + const adaptOptions = selectOptimalPaddingStrategy(context); + adaptedA = adaptEmbeddingDimensions(aNorm, bNorm.length, adaptOptions); + } else { + const adaptOptions = selectOptimalPaddingStrategy(context); + adaptedB = adaptEmbeddingDimensions(bNorm, aNorm.length, adaptOptions); + } + } else { + // Default behavior + adaptedA = aNorm.length < bNorm.length ? adaptEmbeddingDimensions(aNorm, bNorm.length) : aNorm; + adaptedB = bNorm.length < aNorm.length ? adaptEmbeddingDimensions(bNorm, aNorm.length) : bNorm; + } + } + + // Compute dot product (should be similar to cosine for normalized vectors) + const dot = computeDotProduct(adaptedA, adaptedB, options); + + // Return weighted average - giving more weight to cosine + return 0.7 * cosine + 0.3 * dot; +} + +/** + * Dimension-aware similarity that factors in dimension differences + * @param dimensionPenalty Penalty factor for dimension differences (0 to 1) + */ +export function dimensionAwareSimilarity( + a: Float32Array, + b: Float32Array, + normalize: boolean = false, + dimensionPenalty: number = 0.1, + contentType?: ContentType, + performanceProfile?: PerformanceProfile +): number { + // Basic cosine similarity with content type information + const cosine = cosineSimilarity(a, b, normalize, undefined, undefined, contentType, performanceProfile); + + // If dimensions match, return standard cosine + if (a.length === b.length) { + return cosine; + } + + // Calculate dimension penalty + // This penalizes vectors with very different dimensions + const dimRatio = Math.min(a.length, b.length) / Math.max(a.length, b.length); + const penalty = 1 - dimensionPenalty * (1 - dimRatio); + + // Apply penalty to similarity score + return cosine * penalty; +} + +/** + * Ensemble similarity combines multiple metrics with weights + */ +export function ensembleSimilarity( + a: Float32Array, + b: Float32Array, + options: SimilarityOptions +): number { + // Default weights if not provided + const weights = options.ensembleWeights ?? { + [SimilarityMetric.COSINE]: 0.6, + [SimilarityMetric.HYBRID]: 0.3, + [SimilarityMetric.DIM_AWARE]: 0.1 + }; + + let totalWeight = 0; + let weightedSum = 0; + + // Compute each metric and apply weight + for (const [metricStr, weight] of Object.entries(weights)) { + const metric = metricStr as SimilarityMetric; + if (weight && weight > 0) { + // Skip the ensemble itself to avoid recursion + if (metric !== SimilarityMetric.ENSEMBLE) { + const similarity = computeSimilarity(a, b, { + metric, + normalize: options.normalize + }); + + weightedSum += similarity * weight; + totalWeight += weight; + } + } + } + + // Normalize by total weight + return totalWeight > 0 ? 
weightedSum / totalWeight : cosineSimilarity(a, b, options.normalize); +} + +/** + * Debug configuration for vector operations + */ +export interface DebugConfig { + enabled: boolean; + logLevel: 'info' | 'debug' | 'warning' | 'error'; + recordStats: boolean; +} + +/** + * Global debug configuration, can be modified at runtime + */ +export const vectorDebugConfig: DebugConfig = { + enabled: false, + logLevel: 'info', + recordStats: false +}; + +/** + * Statistics collected during vector operations + */ +export interface AdaptationStats { + timestamp: number; + operation: string; + sourceModel?: string; + targetModel?: string; + sourceDimension: number; + targetDimension: number; + strategy: string; + similarity?: number; +} + +// Collection of adaptation statistics for quality auditing +export const adaptationStats: AdaptationStats[] = []; + +/** + * Log a message if debugging is enabled + */ +function debugLog( + message: string, + level: 'info' | 'debug' | 'warning' | 'error' = 'info' +): void { + if (vectorDebugConfig.enabled) { + const levelOrder = { 'debug': 0, 'info': 1, 'warning': 2, 'error': 3 }; + + if (levelOrder[level] >= levelOrder[vectorDebugConfig.logLevel]) { + const prefix = `[VectorUtils:${level.toUpperCase()}]`; + + switch (level) { + case 'error': + console.error(prefix, message); + break; + case 'warning': + console.warn(prefix, message); + break; + case 'debug': + console.debug(prefix, message); + break; + default: + console.log(prefix, message); + } + } + } +} + +/** + * Record adaptation statistics if enabled + */ +function recordAdaptationStats(stats: Omit): void { + if (vectorDebugConfig.enabled && vectorDebugConfig.recordStats) { + adaptationStats.push({ + ...stats, + timestamp: Date.now() + }); + + // Keep only the last 1000 stats to prevent memory issues + if (adaptationStats.length > 1000) { + adaptationStats.shift(); + } + } +} + +/** + * Content types for embedding adaptation strategy selection + */ +export enum ContentType { + GENERAL_TEXT = 'general_text', + CODE = 'code', + STRUCTURED_DATA = 'structured_data', + MATHEMATICAL = 'mathematical', + MIXED = 'mixed' +} + +/** + * Performance profile for selecting adaptation strategy + */ +export enum PerformanceProfile { + MAXIMUM_QUALITY = 'maximum_quality', // Prioritize similarity quality over speed + BALANCED = 'balanced', // Balance quality and performance + MAXIMUM_SPEED = 'maximum_speed' // Prioritize speed over quality +} + +/** + * Context for selecting the optimal padding strategy + */ +export interface StrategySelectionContext { + contentType?: ContentType; // Type of content being compared + performanceProfile?: PerformanceProfile; // Performance requirements + sourceDimension: number; // Source embedding dimension + targetDimension: number; // Target embedding dimension + sourceModel?: string; // Source model identifier + targetModel?: string; // Target model identifier + isHighPrecisionRequired?: boolean; // Whether high precision is needed + isCrossModelComparison?: boolean; // Whether comparing across different models + dimensionRatio?: number; // Custom dimension ratio threshold +} + +/** + * Selects the optimal padding strategy based on content type and performance considerations + * @param context Selection context parameters + * @returns The most appropriate padding strategy and options + */ +export function selectOptimalPaddingStrategy( + context: StrategySelectionContext +): AdaptationOptions { + const { + contentType = ContentType.GENERAL_TEXT, + performanceProfile = 
PerformanceProfile.BALANCED, + sourceDimension, + targetDimension, + isHighPrecisionRequired = false, + isCrossModelComparison = false + } = context; + + // Calculate dimension ratio + const dimRatio = Math.min(sourceDimension, targetDimension) / + Math.max(sourceDimension, targetDimension); + + // Default options + const options: AdaptationOptions = { + strategy: PaddingStrategy.ZERO, + normalize: true + }; + + // Significant dimension difference detection + const hasSignificantDimDifference = dimRatio < (context.dimensionRatio || 0.5); + + // Select strategy based on content type + switch (contentType) { + case ContentType.CODE: + // Code benefits from structural patterns + options.strategy = PaddingStrategy.MIRROR; + break; + + case ContentType.STRUCTURED_DATA: + // Structured data works well with mean-value padding + options.strategy = PaddingStrategy.MEAN; + break; + + case ContentType.MATHEMATICAL: + // Mathematical content benefits from gaussian noise to maintain statistical properties + options.strategy = PaddingStrategy.GAUSSIAN; + options.variance = 0.005; // Lower variance for mathematical precision + break; + + case ContentType.MIXED: + // For mixed content, choose based on performance profile + if (performanceProfile === PerformanceProfile.MAXIMUM_QUALITY) { + options.strategy = PaddingStrategy.GAUSSIAN; + } else if (performanceProfile === PerformanceProfile.MAXIMUM_SPEED) { + options.strategy = PaddingStrategy.ZERO; + } else { + options.strategy = PaddingStrategy.MEAN; + } + break; + + case ContentType.GENERAL_TEXT: + default: + // For general text, base decision on other factors + if (isHighPrecisionRequired) { + options.strategy = PaddingStrategy.GAUSSIAN; + } else if (isCrossModelComparison) { + options.strategy = PaddingStrategy.MEAN; + } else { + options.strategy = PaddingStrategy.ZERO; + } + break; + } + + // Override based on performance profile if we have significant dimension differences + if (hasSignificantDimDifference) { + // For extreme dimension differences, specialized handling + if (performanceProfile === PerformanceProfile.MAXIMUM_QUALITY) { + // For quality, use gaussian noise for better statistical matching + options.strategy = PaddingStrategy.GAUSSIAN; + // Adjust variance based on dimension ratio + options.variance = Math.min(0.01, 0.02 * dimRatio); + + // Log the significant dimension adaptation + debugLog(`Significant dimension difference detected: ${sourceDimension} vs ${targetDimension}. ` + + `Ratio: ${dimRatio.toFixed(2)}. Using Gaussian strategy.`, 'warning'); + } else if (performanceProfile === PerformanceProfile.MAXIMUM_SPEED) { + // For speed, stick with zero padding + options.strategy = PaddingStrategy.ZERO; + } + } + + // Always use zero padding for trivial dimension differences + // (e.g. 
1536 vs 1537) for performance reasons + if (Math.abs(sourceDimension - targetDimension) <= 5) { + options.strategy = PaddingStrategy.ZERO; + } + + // Log the selected strategy + debugLog(`Selected padding strategy: ${options.strategy} for ` + + `content type: ${contentType}, performance profile: ${performanceProfile}`, 'debug'); + + return options; +} + +/** + * Helper function to determine content type from note context + * @param context The note context information + * @returns The detected content type + */ +export function detectContentType(mime: string, content?: string): ContentType { + // Detect based on mime type + if (mime.includes('code') || + mime.includes('javascript') || + mime.includes('typescript') || + mime.includes('python') || + mime.includes('java') || + mime.includes('c++') || + mime.includes('json')) { + return ContentType.CODE; + } + + if (mime.includes('xml') || + mime.includes('csv') || + mime.includes('sql') || + mime.endsWith('+json')) { + return ContentType.STRUCTURED_DATA; + } + + if (mime.includes('latex') || + mime.includes('mathml') || + mime.includes('tex')) { + return ContentType.MATHEMATICAL; + } + + // If we have content, we can do deeper analysis + if (content) { + // Detect code by looking for common patterns + const codePatterns = [ + /function\s+\w+\s*\(.*\)\s*{/, // JavaScript/TypeScript function + /def\s+\w+\s*\(.*\):/, // Python function + /class\s+\w+(\s+extends\s+\w+)?(\s+implements\s+\w+)?\s*{/, // Java/TypeScript class + /import\s+.*\s+from\s+['"]/, // JS/TS import + /^\s*```\w+/m // Markdown code block + ]; + + if (codePatterns.some(pattern => pattern.test(content))) { + return ContentType.CODE; + } + + // Detect structured data + const structuredPatterns = [ + /^\s*[{\[]/, // JSON-like start + /^\s*<\?xml/, // XML declaration + /^\s*<[a-z]+>/i, // HTML/XML tag + /^\s*(\w+,)+\w+$/m, // CSV-like + /CREATE\s+TABLE|SELECT\s+.*\s+FROM/i // SQL + ]; + + if (structuredPatterns.some(pattern => pattern.test(content))) { + return ContentType.STRUCTURED_DATA; + } + + // Detect mathematical content + const mathPatterns = [ + /\$\$.*\$\$/s, // LaTeX block + /\\begin{equation}/, // LaTeX equation environment + /\\sum|\\int|\\frac|\\sqrt/, // Common LaTeX math commands + ]; + + if (mathPatterns.some(pattern => pattern.test(content))) { + return ContentType.MATHEMATICAL; + } + + // Check for mixed content + const hasMixedContent = + (codePatterns.some(pattern => pattern.test(content)) && + content.split(/\s+/).length > 100) || // Code and substantial text + (content.includes('```') && + content.replace(/```.*?```/gs, '').length > 200); // Markdown with code blocks and text + + if (hasMixedContent) { + return ContentType.MIXED; + } + } + + // Default to general text + return ContentType.GENERAL_TEXT; +} diff --git a/src/services/llm/index_service.ts b/src/services/llm/index_service.ts index ef6a4f64e..740e4d48c 100644 --- a/src/services/llm/index_service.ts +++ b/src/services/llm/index_service.ts @@ -565,7 +565,7 @@ class IndexService { } // Get Note IDs to search, optionally filtered by branch - let similarNotes = []; + let similarNotes: { noteId: string; title: string; similarity: number; contentType?: string }[] = []; // Check if we need to restrict search to a specific branch if (contextNoteId) { @@ -593,6 +593,9 @@ class IndexService { // Get embeddings for all notes in the branch const config = provider.getConfig(); + // Import the ContentType detection from vector utils + const { ContentType, detectContentType, cosineSimilarity } = await 
import('./embeddings/vector_utils.js'); + for (const noteId of branchNoteIds) { const noteEmbedding = await vectorStore.getEmbeddingForNote( noteId, @@ -601,14 +604,29 @@ class IndexService { ); if (noteEmbedding) { - const similarity = vectorStore.cosineSimilarity(embedding, noteEmbedding.embedding); - if (similarity >= this.defaultSimilarityThreshold) { - const note = becca.getNote(noteId); - if (note) { + // Get the note to determine its content type + const note = becca.getNote(noteId); + if (note) { + // Detect content type from mime type + const contentType = detectContentType(note.mime, ''); + + // Use content-aware similarity calculation + const similarity = cosineSimilarity( + embedding, + noteEmbedding.embedding, + true, // normalize + config.model, // source model + noteEmbedding.providerId, // target model (use providerId) + contentType, // content type for padding strategy + undefined // use default BALANCED performance profile + ); + + if (similarity >= this.defaultSimilarityThreshold) { similarNotes.push({ noteId, title: note.title, - similarity + similarity, + contentType: contentType.toString() }); } } @@ -622,7 +640,7 @@ class IndexService { } else { // Search across all notes const config = provider.getConfig(); - similarNotes = await vectorStore.findSimilarNotes( + const results = await vectorStore.findSimilarNotes( embedding, provider.name, config.model, @@ -631,14 +649,17 @@ class IndexService { ); // Enhance results with note titles - return similarNotes.map(result => { + similarNotes = results.map(result => { const note = becca.getNote(result.noteId); return { noteId: result.noteId, title: note ? note.title : 'Unknown Note', - similarity: result.similarity + similarity: result.similarity, + contentType: result.contentType }; }); + + return similarNotes; } } catch (error: any) { log.error(`Error finding similar notes: ${error.message || "Unknown error"}`);
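
A minimal consumer-side sketch (not part of the patch) of the normalization contract introduced above: deciding whether a freshly generated embedding still needs normalizing before storage or search. Only NormalizationStatus and getNormalizationStatus() come from the patch; the isUnitLength helper, its 1e-6 tolerance, and the import path (assuming a caller in the same embeddings directory) are illustrative assumptions.

import { NormalizationStatus, type EmbeddingProvider } from "./embeddings_interface.js";

// Hypothetical helper: true if the vector's magnitude is already ~1.
function isUnitLength(vector: Float32Array, tolerance = 1e-6): boolean {
    let sumOfSquares = 0;
    for (let i = 0; i < vector.length; i++) {
        sumOfSquares += vector[i] * vector[i];
    }
    return Math.abs(Math.sqrt(sumOfSquares) - 1.0) < tolerance;
}

// Decide whether to normalize before storing or comparing an embedding.
function needsNormalization(provider: EmbeddingProvider, sample: Float32Array): boolean {
    switch (provider.getNormalizationStatus()) {
        case NormalizationStatus.GUARANTEED:
            return false; // e.g. the OpenAI and Voyage providers in this patch
        case NormalizationStatus.NEVER:
            return true;  // e.g. the Ollama and simple local providers in this patch
        default:
            // USUALLY / UNKNOWN: fall back to an empirical check on a sample vector.
            return !isUnitLength(sample);
    }
}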
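
The padding-strategy machinery in vector_utils.ts can be read as a two-step pipeline: select AdaptationOptions for the comparison context, then adapt the smaller embedding. A rough usage sketch follows; the 768-to-1536 dimensions and the constant-valued source vector are made-up inputs, and the import path again assumes a caller inside the embeddings directory.

import {
    adaptEmbeddingDimensions,
    selectOptimalPaddingStrategy,
    ContentType,
    PerformanceProfile,
    type StrategySelectionContext
} from "./vector_utils.js";

// Illustrative: pad a 768-dimension embedding up to 1536 dimensions.
const source = new Float32Array(768).fill(0.03);

const context: StrategySelectionContext = {
    contentType: ContentType.CODE,                  // MIRROR padding is selected for code content
    performanceProfile: PerformanceProfile.BALANCED,
    sourceDimension: source.length,
    targetDimension: 1536,
    isCrossModelComparison: true
};

const options = selectOptimalPaddingStrategy(context);
const adapted = adaptEmbeddingDimensions(source, 1536, options);
// adapted.length === 1536; because the selector leaves options.normalize at true,
// the padded vector is re-normalized to unit length after adaptation.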
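
Finally, a hedged sketch of how the content-aware scoring pieces fit together from a caller's point of view, mirroring what processEmbeddings in storage.ts now does but going through the generic computeSimilarity entry point; the row shape and the choice of the ENSEMBLE metric are assumptions for illustration, not part of the patch.

import {
    bufferToEmbedding,
    computeSimilarity,
    detectContentType,
    PerformanceProfile,
    SimilarityMetric
} from "./vector_utils.js";

// Score a query embedding against one stored embedding row (shape as used in storage.ts).
function scoreAgainstRow(
    queryEmbedding: Float32Array,
    queryModelId: string,
    row: { embedding: Buffer; dimension: number; modelId: string; mime?: string }
): number {
    const storedVector = bufferToEmbedding(row.embedding, row.dimension);
    const contentType = row.mime ? detectContentType(row.mime) : undefined;

    return computeSimilarity(queryEmbedding, storedVector, {
        metric: SimilarityMetric.ENSEMBLE,       // blends cosine, hybrid and dimension-aware scores
        normalize: true,                         // safe when the provider's status is NEVER or UNKNOWN
        sourceModel: queryModelId,
        targetModel: row.modelId,
        contentType,                             // drives the padding strategy on dimension mismatch
        performanceProfile: PerformanceProfile.BALANCED
    });
}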