mirror of https://github.com/TriliumNext/Notes.git (synced 2025-11-04 15:11:31 +08:00)

set up embedding normalization

parent 08f7f1962b
commit f05fe3f72b
@@ -1,4 +1,5 @@
import type { EmbeddingProvider, EmbeddingConfig, NoteEmbeddingContext } from './embeddings_interface.js';
import { NormalizationStatus } from './embeddings_interface.js';
import log from "../../log.js";
import { LLM_CONSTANTS } from "../../../routes/api/llm.js";
import options from "../../options.js";

@@ -23,6 +24,15 @@ export abstract class BaseEmbeddingProvider implements EmbeddingProvider {
        return { ...this.config };
    }

    /**
     * Get the normalization status of this provider
     * Default implementation returns the status from config if available,
     * otherwise returns UNKNOWN status
     */
    getNormalizationStatus(): NormalizationStatus {
        return this.config.normalizationStatus || NormalizationStatus.UNKNOWN;
    }

    getDimension(): number {
        return this.config.dimension;
    }
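Illustrative note (not part of this commit): a provider that declares normalizationStatus in its config gets the right answer from the inherited default above for free. A minimal sketch, assuming the import path matches this file's location and that EmbeddingConfig has no required fields beyond those visible in the hunks below:

import { NormalizationStatus } from './embeddings_interface.js';
import type { EmbeddingConfig } from './embeddings_interface.js';

// Hypothetical config for a provider whose API already returns unit vectors.
const exampleConfig: EmbeddingConfig = {
    model: "example-embedding-model",    // hypothetical model name
    dimension: 768,
    type: 'float32',
    normalizationStatus: NormalizationStatus.GUARANTEED
};

// The same fallback the base class applies in getNormalizationStatus():
const status = exampleConfig.normalizationStatus || NormalizationStatus.UNKNOWN;
console.log(status); // "guaranteed"; leaving the field unset would yield "unknown"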
@@ -42,6 +42,35 @@ export interface NoteEmbeddingContext {
export interface EmbeddingModelInfo {
    dimension: number;
    contextWindow: number;
    /**
     * Whether the model guarantees normalized vectors (unit length)
     */
    guaranteesNormalization: boolean;
}

/**
 * Normalization status of a provider's embeddings
 */
export enum NormalizationStatus {
    /**
     * Provider guarantees all embeddings are normalized to unit vectors
     */
    GUARANTEED = 'guaranteed',

    /**
     * Provider does not guarantee normalization, but embeddings are usually normalized
     */
    USUALLY = 'usually',

    /**
     * Provider does not guarantee normalization, embeddings must be normalized before use
     */
    NEVER = 'never',

    /**
     * Normalization status is unknown and should be checked at runtime
     */
    UNKNOWN = 'unknown'
}

/**

@@ -51,7 +80,16 @@ export interface EmbeddingConfig {
    model: string;
    dimension: number;
    type: 'float32' | 'float64';
    /**
     * Whether embeddings should be normalized before use
     * If true, normalization will always be applied
     * If false, normalization depends on provider's status
     */
    normalize?: boolean;
    /**
     * The normalization status of this provider
     */
    normalizationStatus?: NormalizationStatus;
    batchSize?: number;
    contextWindowSize?: number;
    apiKey?: string;

@@ -65,6 +103,17 @@ export interface EmbeddingProvider {
    name: string;
    getConfig(): EmbeddingConfig;

    /**
     * Returns information about the normalization status of this provider
     */
    getNormalizationStatus(): NormalizationStatus;

    /**
     * Verify that embeddings are properly normalized
     * @returns true if embeddings are properly normalized
     */
    verifyNormalization?(sample?: Float32Array): Promise<boolean>;

    /**
     * Generate embeddings for a single piece of text
     */
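Illustrative note (not part of this commit): a caller-side sketch of combining the new interface members to decide whether vectors need local normalization, assuming generateEmbeddings(text) resolves to a Float32Array as it does in the providers changed by this commit.

import { NormalizationStatus } from './embeddings_interface.js';
import type { EmbeddingProvider } from './embeddings_interface.js';

// Returns true when vectors from `provider` should be normalized before indexing.
async function needsLocalNormalization(provider: EmbeddingProvider): Promise<boolean> {
    if (provider.getNormalizationStatus() === NormalizationStatus.GUARANTEED) {
        return false; // provider already returns unit vectors
    }

    // Optional runtime probe, when the provider implements it.
    if (provider.verifyNormalization) {
        const sample = await provider.generateEmbeddings("normalization probe");
        return !(await provider.verifyNormalization(sample));
    }

    // USUALLY, NEVER and UNKNOWN: be conservative and normalize.
    return true;
}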
@@ -4,6 +4,7 @@ import sql from "../../sql.js";
import dateUtils from "../../date_utils.js";
import { randomString } from "../../utils.js";
import type { EmbeddingProvider, EmbeddingConfig } from "./embeddings_interface.js";
import { NormalizationStatus } from "./embeddings_interface.js";
import { OpenAIEmbeddingProvider } from "./providers/openai.js";
import { OllamaEmbeddingProvider } from "./providers/ollama.js";
import { VoyageEmbeddingProvider } from "./providers/voyage.js";

@@ -25,6 +26,14 @@ class SimpleLocalEmbeddingProvider implements EmbeddingProvider {
        return this.config;
    }

    /**
     * Returns the normalization status of the local provider
     * Local provider does not guarantee normalization
     */
    getNormalizationStatus(): NormalizationStatus {
        return NormalizationStatus.NEVER; // Simple embedding does not normalize vectors
    }

    async generateEmbeddings(text: string): Promise<Float32Array> {
        // Create deterministic embeddings based on text content
        const result = new Float32Array(this.config.dimension || 384);
@@ -2,6 +2,7 @@ import axios from "axios";
import log from "../../../log.js";
import { BaseEmbeddingProvider } from "../base_embeddings.js";
import type { EmbeddingConfig, EmbeddingModelInfo } from "../embeddings_interface.js";
import { NormalizationStatus } from "../embeddings_interface.js";
import { LLM_CONSTANTS } from "../../../../routes/api/llm.js";

/**

@@ -63,7 +64,8 @@ export class OllamaEmbeddingProvider extends BaseEmbeddingProvider {

                return {
                    dimension: embeddingDimension || 0, // We'll detect this separately if not provided
                    contextWindow: contextWindow
                    contextWindow: contextWindow,
                    guaranteesNormalization: false // Ollama models don't guarantee normalized embeddings
                };
            }
        } catch (error: any) {

@@ -113,7 +115,11 @@ export class OllamaEmbeddingProvider extends BaseEmbeddingProvider {
            const contextWindow = (LLM_CONSTANTS.OLLAMA_MODEL_CONTEXT_WINDOWS as Record<string, number>)[baseModelName] ||
                                (LLM_CONSTANTS.OLLAMA_MODEL_CONTEXT_WINDOWS as Record<string, number>).default;

            const modelInfo: EmbeddingModelInfo = { dimension, contextWindow };
            const modelInfo: EmbeddingModelInfo = {
                dimension,
                contextWindow,
                guaranteesNormalization: false // Ollama models don't guarantee normalized embeddings
            };
            this.modelInfoCache.set(modelName, modelInfo);
            this.config.dimension = dimension;

@@ -131,7 +137,11 @@ export class OllamaEmbeddingProvider extends BaseEmbeddingProvider {

            log.info(`Using default parameters for model ${modelName}: dimension ${dimension}, context ${contextWindow}`);

            const modelInfo: EmbeddingModelInfo = { dimension, contextWindow };
            const modelInfo: EmbeddingModelInfo = {
                dimension,
                contextWindow,
                guaranteesNormalization: false // Ollama models don't guarantee normalized embeddings
            };
            this.modelInfoCache.set(modelName, modelInfo);
            this.config.dimension = dimension;

@@ -302,4 +312,12 @@ export class OllamaEmbeddingProvider extends BaseEmbeddingProvider {
            throw new Error(`Ollama batch embedding error: ${errorMessage}`);
        }
    }

    /**
     * Returns the normalization status for Ollama embeddings
     * Ollama embeddings are not guaranteed to be normalized
     */
    getNormalizationStatus(): NormalizationStatus {
        return NormalizationStatus.NEVER; // Be conservative and always normalize
    }
}
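Illustrative note (not part of this commit): since the Ollama provider now reports NormalizationStatus.NEVER, callers are expected to normalize its vectors themselves; a sketch using the normalizeVector helper from vector_utils.js further down in this commit (import paths are assumptions and depend on where such code lives).

import { NormalizationStatus } from './embeddings_interface.js';
import type { EmbeddingProvider } from './embeddings_interface.js';
import { normalizeVector } from './vector_utils.js';

// Wraps any provider and always hands back unit-length vectors.
async function embedWithUnitLength(provider: EmbeddingProvider, text: string): Promise<Float32Array> {
    const raw = await provider.generateEmbeddings(text);
    return provider.getNormalizationStatus() === NormalizationStatus.GUARANTEED
        ? raw
        : normalizeVector(raw);
}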
@@ -2,6 +2,7 @@ import axios from "axios";
import log from "../../../log.js";
import { BaseEmbeddingProvider } from "../base_embeddings.js";
import type { EmbeddingConfig, EmbeddingModelInfo } from "../embeddings_interface.js";
import { NormalizationStatus } from "../embeddings_interface.js";
import { LLM_CONSTANTS } from "../../../../routes/api/llm.js";

/**

@@ -105,7 +106,8 @@ export class OpenAIEmbeddingProvider extends BaseEmbeddingProvider {

                return {
                    dimension,
                    contextWindow
                    contextWindow,
                    guaranteesNormalization: true // OpenAI embeddings are normalized to unit length
                };
            }
        } catch (error: any) {

@@ -141,7 +143,11 @@ export class OpenAIEmbeddingProvider extends BaseEmbeddingProvider {
            // Use default context window
            let contextWindow = LLM_CONSTANTS.CONTEXT_WINDOW.OPENAI;

            const modelInfo: EmbeddingModelInfo = { dimension, contextWindow };
            const modelInfo: EmbeddingModelInfo = {
                dimension,
                contextWindow,
                guaranteesNormalization: true // OpenAI embeddings are normalized to unit length
            };
            this.modelInfoCache.set(modelName, modelInfo);
            this.config.dimension = dimension;

@@ -154,7 +160,11 @@ export class OpenAIEmbeddingProvider extends BaseEmbeddingProvider {

            log.info(`Using default parameters for OpenAI model ${modelName}: dimension ${dimension}, context ${contextWindow}`);

            const modelInfo: EmbeddingModelInfo = { dimension, contextWindow };
            const modelInfo: EmbeddingModelInfo = {
                dimension,
                contextWindow,
                guaranteesNormalization: true // OpenAI embeddings are normalized to unit length
            };
            this.modelInfoCache.set(modelName, modelInfo);
            this.config.dimension = dimension;

@@ -288,4 +298,12 @@ export class OpenAIEmbeddingProvider extends BaseEmbeddingProvider {
            throw new Error(`OpenAI batch embedding error: ${errorMessage}`);
        }
    }

    /**
     * Returns the normalization status for OpenAI embeddings
     * OpenAI embeddings are guaranteed to be normalized to unit length
     */
    getNormalizationStatus(): NormalizationStatus {
        return NormalizationStatus.GUARANTEED;
    }
}
@@ -2,6 +2,7 @@ import axios from "axios";
import log from "../../../log.js";
import { BaseEmbeddingProvider } from "../base_embeddings.js";
import type { EmbeddingConfig, EmbeddingModelInfo } from "../embeddings_interface.js";
import { NormalizationStatus } from "../embeddings_interface.js";
import { LLM_CONSTANTS } from "../../../../routes/api/llm.js";

// Voyage model context window sizes - as of current API version

@@ -68,7 +69,8 @@ export class VoyageEmbeddingProvider extends BaseEmbeddingProvider {

            return {
                dimension,
                contextWindow
                contextWindow,
                guaranteesNormalization: true // Voyage embeddings are typically normalized
            };
        } catch (error) {
            log.info(`Could not determine capabilities for Voyage AI model ${modelName}: ${error}`);

@@ -96,7 +98,8 @@ export class VoyageEmbeddingProvider extends BaseEmbeddingProvider {
                // Use known dimension
                const modelInfo: EmbeddingModelInfo = {
                    dimension: knownDimension,
                    contextWindow
                    contextWindow,
                    guaranteesNormalization: true // Voyage embeddings are typically normalized
                };

                this.modelInfoCache.set(modelName, modelInfo);

@@ -109,28 +112,41 @@ export class VoyageEmbeddingProvider extends BaseEmbeddingProvider {
                const testEmbedding = await this.generateEmbeddings("Test");
                const dimension = testEmbedding.length;

                const modelInfo: EmbeddingModelInfo = {
                    dimension,
                    contextWindow
                // Set model info based on the model name, detected dimension, and reasonable defaults
                if (modelName.includes('voyage-2')) {
                    return {
                        dimension: dimension || 1024,
                        contextWindow: 4096,
                        guaranteesNormalization: true // Voyage-2 embeddings are normalized
                    };

                this.modelInfoCache.set(modelName, modelInfo);
                this.config.dimension = dimension;

                log.info(`Detected Voyage AI model ${modelName} with dimension ${dimension} (context: ${contextWindow})`);
                return modelInfo;
                } else if (modelName.includes('voyage-lite-02')) {
                    return {
                        dimension: dimension || 768,
                        contextWindow: 4096,
                        guaranteesNormalization: true // Voyage-lite embeddings are normalized
                    };
                } else {
                    // Default for other Voyage models
                    return {
                        dimension: dimension || 1024,
                        contextWindow: 4096,
                        guaranteesNormalization: true // Assuming all Voyage embeddings are normalized
                    };
                }
            }
        } catch (error: any) {
            // If detection fails, use defaults
            const dimension = 1024; // Default for Voyage models
            log.info(`Could not fetch model info from Voyage AI API: ${error.message}. Using defaults.`);

            log.info(`Using default parameters for Voyage AI model ${modelName}: dimension ${dimension}, context ${contextWindow}`);
            // Use default parameters if everything else fails
            const defaultModelInfo: EmbeddingModelInfo = {
                dimension: 1024, // Default for Voyage models
                contextWindow: 8192,
                guaranteesNormalization: true // Voyage embeddings are typically normalized
            };

            const modelInfo: EmbeddingModelInfo = { dimension, contextWindow };
            this.modelInfoCache.set(modelName, modelInfo);
            this.config.dimension = dimension;

            return modelInfo;
            this.modelInfoCache.set(modelName, defaultModelInfo);
            this.config.dimension = defaultModelInfo.dimension;
            return defaultModelInfo;
        }
    }

@@ -251,4 +267,12 @@ export class VoyageEmbeddingProvider extends BaseEmbeddingProvider {
            throw new Error(`Voyage AI batch embedding error: ${errorMessage}`);
        }
    }

    /**
     * Returns the normalization status for Voyage embeddings
     * Voyage embeddings are generally normalized by the API
     */
    getNormalizationStatus(): NormalizationStatus {
        return NormalizationStatus.GUARANTEED;
    }
}
@@ -112,6 +112,26 @@ export async function getEmbeddingForNote(noteId: string, providerId: string, mo
    };
}

// Create an interface that represents the embedding row from the database
interface EmbeddingRow {
    embedId: string;
    noteId: string;
    providerId: string;
    modelId: string;
    dimension: number;
    embedding: Buffer;
    title?: string;
    type?: string;
    mime?: string;
    isDeleted?: number;
}

// Interface for enhanced embedding with query model information
interface EnhancedEmbeddingRow extends EmbeddingRow {
    queryProviderId: string;
    queryModelId: string;
}

/**
 * Finds similar notes based on vector similarity
 */

@@ -122,7 +142,7 @@ export async function findSimilarNotes(
    limit = 10,
    threshold?: number,  // Made optional to use constants
    useFallback = true   // Whether to try other providers if no embeddings found
): Promise<{noteId: string, similarity: number}[]> {
): Promise<{noteId: string, similarity: number, contentType?: string}[]> {
    // Import constants dynamically to avoid circular dependencies
    const llmModule = await import('../../../routes/api/llm.js');
    // Use a default threshold of 0.65 if not provided

@@ -138,11 +158,30 @@ export async function findSimilarNotes(
            FROM note_embeddings ne
            JOIN notes n ON ne.noteId = n.noteId
            WHERE ne.providerId = ? AND ne.modelId = ? AND n.isDeleted = 0
        `, [providerId, modelId]);
        `, [providerId, modelId]) as EmbeddingRow[];

        if (embeddings && embeddings.length > 0) {
            log.info(`Found ${embeddings.length} embeddings for provider ${providerId}, model ${modelId}`);
            return await processEmbeddings(embedding, embeddings, actualThreshold, limit);

            // Add query model information to each embedding for cross-model comparison
            const enhancedEmbeddings: EnhancedEmbeddingRow[] = embeddings.map(e => {
                return {
                    embedId: e.embedId,
                    noteId: e.noteId,
                    providerId: e.providerId,
                    modelId: e.modelId,
                    dimension: e.dimension,
                    embedding: e.embedding,
                    title: e.title,
                    type: e.type,
                    mime: e.mime,
                    isDeleted: e.isDeleted,
                    queryProviderId: providerId,
                    queryModelId: modelId
                };
            });

            return await processEmbeddings(embedding, enhancedEmbeddings, actualThreshold, limit);
        }

        // If no embeddings found and fallback is allowed, try other providers

@@ -195,10 +234,28 @@ export async function findSimilarNotes(
                            FROM note_embeddings ne
                            JOIN notes n ON ne.noteId = n.noteId
                            WHERE ne.providerId = ? AND ne.modelId = ? AND n.isDeleted = 0
                        `, [bestAlternative.providerId, bestAlternative.modelId]);
                        `, [bestAlternative.providerId, bestAlternative.modelId]) as EmbeddingRow[];

                        if (alternativeEmbeddings && alternativeEmbeddings.length > 0) {
                            return await processEmbeddings(embedding, alternativeEmbeddings, actualThreshold, limit);
                            // Add query model information to each embedding for cross-model comparison
                            const enhancedEmbeddings: EnhancedEmbeddingRow[] = alternativeEmbeddings.map(e => {
                                return {
                                    embedId: e.embedId,
                                    noteId: e.noteId,
                                    providerId: e.providerId,
                                    modelId: e.modelId,
                                    dimension: e.dimension,
                                    embedding: e.embedding,
                                    title: e.title,
                                    type: e.type,
                                    mime: e.mime,
                                    isDeleted: e.isDeleted,
                                    queryProviderId: providerId,
                                    queryModelId: modelId
                                };
                            });

                            return await processEmbeddings(embedding, enhancedEmbeddings, actualThreshold, limit);
                        }
                    }
                } else {

@@ -256,17 +313,58 @@ export async function findSimilarNotes(

// Helper function to process embeddings and calculate similarities
async function processEmbeddings(queryEmbedding: Float32Array, embeddings: any[], threshold: number, limit: number) {
    const { enhancedCosineSimilarity, bufferToEmbedding } = await import('./vector_utils.js');
    const {
        enhancedCosineSimilarity,
        bufferToEmbedding,
        ContentType,
        PerformanceProfile,
        detectContentType,
        vectorDebugConfig
    } = await import('./vector_utils.js');

    // Enable debug logging temporarily for testing content-aware adaptation
    const originalDebugEnabled = vectorDebugConfig.enabled;
    const originalLogLevel = vectorDebugConfig.logLevel;
    vectorDebugConfig.enabled = true;
    vectorDebugConfig.logLevel = 'debug';
    vectorDebugConfig.recordStats = true;

    const similarities = [];

    try {
        for (const e of embeddings) {
            const embVector = bufferToEmbedding(e.embedding, e.dimension);
        const similarity = enhancedCosineSimilarity(queryEmbedding, embVector);

            // Detect content type from mime type if available
            let contentType = ContentType.GENERAL_TEXT;
            if (e.mime) {
                contentType = detectContentType(e.mime);
                console.log(`Note ID: ${e.noteId}, Mime: ${e.mime}, Detected content type: ${contentType}`);
            }

            // Select performance profile based on embedding size and use case
            // For most similarity searches, BALANCED is a good default
            const performanceProfile = PerformanceProfile.BALANCED;

            // Determine if this is cross-model comparison
            const isCrossModel = e.providerId !== e.queryProviderId || e.modelId !== e.queryModelId;

            // Calculate similarity with content-aware parameters
            const similarity = enhancedCosineSimilarity(
                queryEmbedding,
                embVector,
                true, // normalize vectors to ensure consistent comparison
                e.queryModelId,  // source model ID
                e.modelId,       // target model ID
                contentType,     // content-specific padding strategy
                performanceProfile
            );

            if (similarity >= threshold) {
                similarities.push({
                    noteId: e.noteId,
                similarity: similarity
                    similarity: similarity,
                    contentType: contentType.toString()
                });
            }
        }

@@ -274,6 +372,12 @@ async function processEmbeddings(queryEmbedding: Float32Array, embeddings: any[]
        return similarities
            .sort((a, b) => b.similarity - a.similarity)
            .slice(0, limit);
    } finally {
        // Restore original debug settings
        vectorDebugConfig.enabled = originalDebugEnabled;
        vectorDebugConfig.logLevel = originalLogLevel;
        vectorDebugConfig.recordStats = false;
    }
}

/**
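Illustrative note (not part of this commit): the per-row work inside processEmbeddings above reduces to decode, classify, score; a condensed sketch of that path for one stored row, with types narrowed for brevity and imports assumed to resolve as in the file above.

import {
    bufferToEmbedding,
    detectContentType,
    enhancedCosineSimilarity,
    ContentType,
    PerformanceProfile
} from './vector_utils.js';

// Shape of one row selected from note_embeddings, trimmed to what scoring needs.
interface StoredRow { embedding: Buffer; dimension: number; mime?: string; modelId: string; }

function scoreRow(query: Float32Array, queryModelId: string, row: StoredRow): number {
    const vector = bufferToEmbedding(row.embedding, row.dimension);
    const contentType = row.mime ? detectContentType(row.mime) : ContentType.GENERAL_TEXT;

    return enhancedCosineSimilarity(
        query,
        vector,
        true,                        // normalize both sides first
        queryModelId,                // source (query) model
        row.modelId,                 // target (stored) model
        contentType,                 // drives the padding strategy choice
        PerformanceProfile.BALANCED
    );
}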
@@ -1,34 +1,144 @@
/**
 * Computes the cosine similarity between two vectors
 * If dimensions don't match, automatically adapts using the enhanced approach
 * @param normalize Optional flag to normalize vectors before comparison (default: false)
 * @param sourceModel Optional identifier for the source model
 * @param targetModel Optional identifier for the target model
 * @param contentType Optional content type for strategy selection
 * @param performanceProfile Optional performance profile
 */
export function cosineSimilarity(a: Float32Array, b: Float32Array): number {
export function cosineSimilarity(
    a: Float32Array,
    b: Float32Array,
    normalize: boolean = false,
    sourceModel?: string,
    targetModel?: string,
    contentType?: ContentType,
    performanceProfile?: PerformanceProfile
): number {
    // Use the enhanced approach that preserves more information
    return enhancedCosineSimilarity(a, b);
    return enhancedCosineSimilarity(a, b, normalize, sourceModel, targetModel, contentType, performanceProfile);
}
/**
 * Enhanced cosine similarity that adaptively handles different dimensions
 * Instead of truncating larger embeddings, it pads smaller ones to preserve information
 * @param normalize Optional flag to normalize vectors before comparison (default: false)
 * @param sourceModel Optional identifier for the source model
 * @param targetModel Optional identifier for the target model
 * @param contentType Optional content type for strategy selection
 * @param performanceProfile Optional performance profile
 */
export function enhancedCosineSimilarity(a: Float32Array, b: Float32Array): number {
export function enhancedCosineSimilarity(
    a: Float32Array,
    b: Float32Array,
    normalize: boolean = false,
    sourceModel?: string,
    targetModel?: string,
    contentType?: ContentType,
    performanceProfile?: PerformanceProfile
): number {
    // If normalization is requested, normalize vectors first
    if (normalize) {
        a = normalizeVector(a);
        b = normalizeVector(b);
    }

    // If dimensions match, use standard calculation
    if (a.length === b.length) {
        return standardCosineSimilarity(a, b);
    }

    // Always adapt smaller embedding to larger one to preserve maximum information
    // Log dimension adaptation
    debugLog(`Dimension mismatch: ${a.length} vs ${b.length}. Adapting dimensions...`, 'info');

    // Determine if models are different
    const isCrossModelComparison = sourceModel !== targetModel &&
                                  sourceModel !== undefined &&
                                  targetModel !== undefined;

    // Context for strategy selection
    const context: StrategySelectionContext = {
        contentType: contentType || ContentType.GENERAL_TEXT,
        performanceProfile: performanceProfile || PerformanceProfile.BALANCED,
        sourceDimension: a.length,
        targetDimension: b.length,
        sourceModel,
        targetModel,
        isCrossModelComparison
    };

    // Select the optimal strategy based on context
    let adaptOptions: AdaptationOptions;

    if (a.length > b.length) {
        // Pad b to match a's dimensions
        const adaptedB = adaptEmbeddingDimensions(b, a.length);
        debugLog(`Adapting embedding B (${b.length}D) to match A (${a.length}D)`, 'debug');

        // Get optimal strategy
        adaptOptions = selectOptimalPaddingStrategy(context);
        const adaptedB = adaptEmbeddingDimensions(b, a.length, adaptOptions);

        // Record stats
        recordAdaptationStats({
            operation: 'dimension_adaptation',
            sourceModel: targetModel,
            targetModel: sourceModel,
            sourceDimension: b.length,
            targetDimension: a.length,
            strategy: adaptOptions.strategy
        });

        return standardCosineSimilarity(a, adaptedB);
    } else {
        // Pad a to match b's dimensions
        const adaptedA = adaptEmbeddingDimensions(a, b.length);
        debugLog(`Adapting embedding A (${a.length}D) to match B (${b.length}D)`, 'debug');

        // Get optimal strategy
        adaptOptions = selectOptimalPaddingStrategy(context);
        const adaptedA = adaptEmbeddingDimensions(a, b.length, adaptOptions);

        // Record stats
        recordAdaptationStats({
            operation: 'dimension_adaptation',
            sourceModel: sourceModel,
            targetModel: targetModel,
            sourceDimension: a.length,
            targetDimension: b.length,
            strategy: adaptOptions.strategy
        });

        return standardCosineSimilarity(adaptedA, b);
    }
}
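Illustrative note (not part of this commit): a sketch of the dimension-mismatch path, comparing vectors of different lengths and tagging the model ids so the call counts as a cross-model comparison; the model names are hypothetical.

import { enhancedCosineSimilarity } from './vector_utils.js';

const small = new Float32Array(384).fill(0.05);   // e.g. a 384-dimension embedding
const large = new Float32Array(768).fill(0.05);   // e.g. a 768-dimension embedding

// The smaller vector is padded (never truncated) before the standard cosine runs.
const score = enhancedCosineSimilarity(
    small,
    large,
    true,             // normalize first
    "local-model",    // hypothetical source model id
    "remote-model"    // hypothetical target model id
);
console.log(score);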
/**
 * Normalizes a vector to unit length
 * @param vector The vector to normalize
 * @returns A new normalized vector
 */
export function normalizeVector(vector: Float32Array): Float32Array {
    let magnitude = 0;
    for (let i = 0; i < vector.length; i++) {
        magnitude += vector[i] * vector[i];
    }

    magnitude = Math.sqrt(magnitude);

    // If vector is already normalized or is a zero vector, return a copy
    if (magnitude === 0 || Math.abs(magnitude - 1.0) < 1e-6) {
        return new Float32Array(vector);
    }

    // Create a new normalized vector
    const normalized = new Float32Array(vector.length);
    for (let i = 0; i < vector.length; i++) {
        normalized[i] = vector[i] / magnitude;
    }

    return normalized;
}

/**
 * Standard cosine similarity for same-dimension vectors
 */

@@ -75,28 +185,111 @@ export function selectOptimalEmbedding(embeddings: Array<{
}
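Illustrative note (not part of this commit): a quick check of the invariant normalizeVector provides, namely that the result has unit Euclidean length.

import { normalizeVector } from './vector_utils.js';

const v = new Float32Array([3, 4]);   // Euclidean length 5
const unit = normalizeVector(v);      // approximately [0.6, 0.8]

let sumOfSquares = 0;
for (let i = 0; i < unit.length; i++) {
    sumOfSquares += unit[i] * unit[i];
}
console.log(Math.abs(Math.sqrt(sumOfSquares) - 1) < 1e-6); // true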
/**
 * Adapts an embedding to match target dimensions
 * Uses a simple truncation (if source is larger) or zero-padding (if source is smaller)
 * Padding strategy options for dimension adaptation
 */
export enum PaddingStrategy {
    ZERO = 'zero',               // Simple zero padding (default)
    MEAN = 'mean',               // Padding with mean value of source embedding
    GAUSSIAN = 'gaussian',       // Padding with Gaussian noise based on source statistics
    MIRROR = 'mirror'            // Mirroring existing values for padding
}

/**
 * Configuration for embedding adaptation
 */
export interface AdaptationOptions {
    strategy: PaddingStrategy;
    seed?: number;               // Seed for random number generation (gaussian)
    variance?: number;           // Variance for gaussian noise (default: 0.01)
    normalize?: boolean;         // Whether to normalize after adaptation
}
/**
 * Adapts an embedding to match target dimensions with configurable strategies
 *
 * @param sourceEmbedding The original embedding
 * @param targetDimension The desired dimension
 * @param options Configuration options for the adaptation
 * @returns A new embedding with the target dimensions
 */
export function adaptEmbeddingDimensions(sourceEmbedding: Float32Array, targetDimension: number): Float32Array {
export function adaptEmbeddingDimensions(
    sourceEmbedding: Float32Array,
    targetDimension: number,
    options: AdaptationOptions = { strategy: PaddingStrategy.ZERO, normalize: true }
): Float32Array {
    const sourceDimension = sourceEmbedding.length;

    // If dimensions already match, return the original
    // If dimensions already match, return a copy of the original
    if (sourceDimension === targetDimension) {
        return sourceEmbedding;
        return new Float32Array(sourceEmbedding);
    }

    // Create a new embedding with target dimensions
    const adaptedEmbedding = new Float32Array(targetDimension);

    if (sourceDimension < targetDimension) {
        // If source is smaller, copy all values and pad with zeros
        // Copy all source values first
        adaptedEmbedding.set(sourceEmbedding);
        // Rest of the array is already initialized to zeros

        // Apply the selected padding strategy
        switch (options.strategy) {
            case PaddingStrategy.ZERO:
                // Zero padding is already done by default
                break;

            case PaddingStrategy.MEAN:
                // Calculate mean of source embedding
                let sum = 0;
                for (let i = 0; i < sourceDimension; i++) {
                    sum += sourceEmbedding[i];
                }
                const mean = sum / sourceDimension;

                // Fill remaining dimensions with mean value
                for (let i = sourceDimension; i < targetDimension; i++) {
                    adaptedEmbedding[i] = mean;
                }
                break;

            case PaddingStrategy.GAUSSIAN:
                // Calculate mean and standard deviation of source embedding
                let meanSum = 0;
                for (let i = 0; i < sourceDimension; i++) {
                    meanSum += sourceEmbedding[i];
                }
                const meanValue = meanSum / sourceDimension;

                let varianceSum = 0;
                for (let i = 0; i < sourceDimension; i++) {
                    varianceSum += Math.pow(sourceEmbedding[i] - meanValue, 2);
                }
                const variance = options.variance ?? Math.min(0.01, varianceSum / sourceDimension);
                const stdDev = Math.sqrt(variance);

                // Fill remaining dimensions with Gaussian noise
                for (let i = sourceDimension; i < targetDimension; i++) {
                    // Box-Muller transform for Gaussian distribution
                    const u1 = Math.random();
                    const u2 = Math.random();
                    const z0 = Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math.PI * u2);

                    adaptedEmbedding[i] = meanValue + stdDev * z0;
                }
                break;

            case PaddingStrategy.MIRROR:
                // Mirror existing values for padding
                for (let i = sourceDimension; i < targetDimension; i++) {
                    // Cycle through source values in reverse order
                    const mirrorIndex = sourceDimension - 1 - ((i - sourceDimension) % sourceDimension);
                    adaptedEmbedding[i] = sourceEmbedding[mirrorIndex];
                }
                break;

            default:
                // Default to zero padding
                break;
        }
    } else {
        // If source is larger, truncate to target dimension
        for (let i = 0; i < targetDimension; i++) {

@@ -104,17 +297,9 @@ export function adaptEmbeddingDimensions(sourceEmbedding: Float32Array, targetDi
        }
    }

    // Normalize the adapted embedding to maintain unit length
    let magnitude = 0;
    for (let i = 0; i < targetDimension; i++) {
        magnitude += adaptedEmbedding[i] * adaptedEmbedding[i];
    }

    magnitude = Math.sqrt(magnitude);
    if (magnitude > 0) {
        for (let i = 0; i < targetDimension; i++) {
            adaptedEmbedding[i] /= magnitude;
        }
    // Normalize the adapted embedding if requested
    if (options.normalize) {
        return normalizeVector(adaptedEmbedding);
    }

    return adaptedEmbedding;
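Illustrative note (not part of this commit): an example of the new options parameter, padding a 4-dimension vector up to 8 with the MEAN strategy and skipping normalization so the padded values stay easy to inspect.

import { adaptEmbeddingDimensions, PaddingStrategy } from './vector_utils.js';

const source = new Float32Array([0.2, 0.4, 0.6, 0.8]);   // mean = 0.5

const padded = adaptEmbeddingDimensions(source, 8, {
    strategy: PaddingStrategy.MEAN,
    normalize: false
});

// First four values are copied, the rest are filled with the source mean:
// approximately [0.2, 0.4, 0.6, 0.8, 0.5, 0.5, 0.5, 0.5] (float32 rounding aside).
console.log(Array.from(padded));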
@@ -133,3 +318,567 @@ export function embeddingToBuffer(embedding: Float32Array): Buffer {
export function bufferToEmbedding(buffer: Buffer, dimension: number): Float32Array {
    return new Float32Array(buffer.buffer, buffer.byteOffset, dimension);
}

/**
 * Similarity metric options
 */
export enum SimilarityMetric {
    COSINE = 'cosine',               // Standard cosine similarity
    DOT_PRODUCT = 'dot_product',     // Simple dot product (assumes normalized vectors)
    HYBRID = 'hybrid',               // Dot product + cosine hybrid
    DIM_AWARE = 'dimension_aware',   // Dimension-aware similarity that factors in dimension differences
    ENSEMBLE = 'ensemble'            // Combined score from multiple metrics
}

/**
 * Configuration for similarity calculation
 */
export interface SimilarityOptions {
    metric: SimilarityMetric;
    normalize?: boolean;
    ensembleWeights?: {[key in SimilarityMetric]?: number};
    dimensionPenalty?: number; // Penalty factor for dimension differences (0 to 1)
    sourceModel?: string;      // Source model identifier
    targetModel?: string;      // Target model identifier
    contentType?: ContentType; // Type of content being compared
    performanceProfile?: PerformanceProfile; // Performance requirements
}
/**
 * Computes similarity between two vectors using the specified metric
 * @param a First vector
 * @param b Second vector
 * @param options Similarity calculation options
 */
export function computeSimilarity(
    a: Float32Array,
    b: Float32Array,
    options: SimilarityOptions = { metric: SimilarityMetric.COSINE }
): number {
    // Apply normalization if requested
    const normalize = options.normalize ?? false;

    switch (options.metric) {
        case SimilarityMetric.COSINE:
            return cosineSimilarity(
                a, b, normalize,
                options.sourceModel, options.targetModel,
                options.contentType, options.performanceProfile
            );

        case SimilarityMetric.DOT_PRODUCT:
            // Dot product assumes normalized vectors for proper similarity measurement
            const aNorm = normalize ? normalizeVector(a) : a;
            const bNorm = normalize ? normalizeVector(b) : b;
            return computeDotProduct(aNorm, bNorm, options);

        case SimilarityMetric.HYBRID:
            // Hybrid approach combines dot product with cosine similarity
            // More robust against small perturbations while maintaining angle sensitivity
            return hybridSimilarity(a, b, normalize, options);

        case SimilarityMetric.DIM_AWARE:
            // Dimension-aware similarity that factors in dimension differences
            return dimensionAwareSimilarity(
                a, b, normalize,
                options.dimensionPenalty ?? 0.1,
                options.contentType,
                options.performanceProfile
            );

        case SimilarityMetric.ENSEMBLE:
            // Ensemble scoring combines multiple metrics with weights
            return ensembleSimilarity(a, b, options);

        default:
            // Default to cosine similarity
            return cosineSimilarity(
                a, b, normalize,
                options.sourceModel, options.targetModel,
                options.contentType, options.performanceProfile
            );
    }
}
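Illustrative note (not part of this commit): picking a metric through computeSimilarity, here an ensemble that reweights the cosine and dimension-aware variants; metrics left out of ensembleWeights simply contribute nothing.

import { computeSimilarity, SimilarityMetric } from './vector_utils.js';

const a = new Float32Array([0.1, 0.3, 0.5, 0.7]);
const b = new Float32Array([0.2, 0.2, 0.6, 0.6]);

const score = computeSimilarity(a, b, {
    metric: SimilarityMetric.ENSEMBLE,
    normalize: true,
    ensembleWeights: {
        [SimilarityMetric.COSINE]: 0.7,
        [SimilarityMetric.DIM_AWARE]: 0.3
    }
});
console.log(score);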
/**
 * Computes dot product between two vectors
 */
export function computeDotProduct(
    a: Float32Array,
    b: Float32Array,
    options?: Pick<SimilarityOptions, 'contentType' | 'performanceProfile' | 'sourceModel' | 'targetModel'>
): number {
    // Adapt dimensions if needed
    if (a.length !== b.length) {
        // Create context for strategy selection if dimensions don't match
        if (options) {
            const context: StrategySelectionContext = {
                contentType: options.contentType || ContentType.GENERAL_TEXT,
                performanceProfile: options.performanceProfile || PerformanceProfile.BALANCED,
                sourceDimension: a.length,
                targetDimension: b.length,
                sourceModel: options.sourceModel,
                targetModel: options.targetModel,
                isCrossModelComparison: options.sourceModel !== options.targetModel &&
                                      options.sourceModel !== undefined &&
                                      options.targetModel !== undefined
            };

            if (a.length > b.length) {
                const adaptOptions = selectOptimalPaddingStrategy(context);
                b = adaptEmbeddingDimensions(b, a.length, adaptOptions);
            } else {
                const adaptOptions = selectOptimalPaddingStrategy(context);
                a = adaptEmbeddingDimensions(a, b.length, adaptOptions);
            }
        } else {
            // Default behavior without options
            if (a.length > b.length) {
                b = adaptEmbeddingDimensions(b, a.length);
            } else {
                a = adaptEmbeddingDimensions(a, b.length);
            }
        }
    }

    let dotProduct = 0;
    for (let i = 0; i < a.length; i++) {
        dotProduct += a[i] * b[i];
    }

    return dotProduct;
}
/**
 * Hybrid similarity combines dot product and cosine similarity
 * Provides robustness against small perturbations while maintaining angle sensitivity
 */
export function hybridSimilarity(
    a: Float32Array,
    b: Float32Array,
    normalize: boolean = false,
    options?: Pick<SimilarityOptions, 'contentType' | 'performanceProfile' | 'sourceModel' | 'targetModel'>
): number {
    // Get cosine similarity with full options
    const cosine = cosineSimilarity(
        a, b, normalize,
        options?.sourceModel, options?.targetModel,
        options?.contentType, options?.performanceProfile
    );

    // For dot product, we should always normalize
    const aNorm = normalize ? a : normalizeVector(a);
    const bNorm = normalize ? b : normalizeVector(b);

    // If dimensions don't match, adapt with optimal strategy
    let adaptedA = aNorm;
    let adaptedB = bNorm;

    if (aNorm.length !== bNorm.length) {
        // Use optimal padding strategy
        if (options) {
            const context: StrategySelectionContext = {
                contentType: options.contentType || ContentType.GENERAL_TEXT,
                performanceProfile: options.performanceProfile || PerformanceProfile.BALANCED,
                sourceDimension: aNorm.length,
                targetDimension: bNorm.length,
                sourceModel: options.sourceModel,
                targetModel: options.targetModel,
                isCrossModelComparison: options.sourceModel !== options.targetModel &&
                                      options.sourceModel !== undefined &&
                                      options.targetModel !== undefined
            };

            if (aNorm.length < bNorm.length) {
                const adaptOptions = selectOptimalPaddingStrategy(context);
                adaptedA = adaptEmbeddingDimensions(aNorm, bNorm.length, adaptOptions);
            } else {
                const adaptOptions = selectOptimalPaddingStrategy(context);
                adaptedB = adaptEmbeddingDimensions(bNorm, aNorm.length, adaptOptions);
            }
        } else {
            // Default behavior
            adaptedA = aNorm.length < bNorm.length ? adaptEmbeddingDimensions(aNorm, bNorm.length) : aNorm;
            adaptedB = bNorm.length < aNorm.length ? adaptEmbeddingDimensions(bNorm, aNorm.length) : bNorm;
        }
    }

    // Compute dot product (should be similar to cosine for normalized vectors)
    const dot = computeDotProduct(adaptedA, adaptedB, options);

    // Return weighted average - giving more weight to cosine
    return 0.7 * cosine + 0.3 * dot;
}
/**
 * Dimension-aware similarity that factors in dimension differences
 * @param dimensionPenalty Penalty factor for dimension differences (0 to 1)
 */
export function dimensionAwareSimilarity(
    a: Float32Array,
    b: Float32Array,
    normalize: boolean = false,
    dimensionPenalty: number = 0.1,
    contentType?: ContentType,
    performanceProfile?: PerformanceProfile
): number {
    // Basic cosine similarity with content type information
    const cosine = cosineSimilarity(a, b, normalize, undefined, undefined, contentType, performanceProfile);

    // If dimensions match, return standard cosine
    if (a.length === b.length) {
        return cosine;
    }

    // Calculate dimension penalty
    // This penalizes vectors with very different dimensions
    const dimRatio = Math.min(a.length, b.length) / Math.max(a.length, b.length);
    const penalty = 1 - dimensionPenalty * (1 - dimRatio);

    // Apply penalty to similarity score
    return cosine * penalty;
}
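To make the penalty concrete: with the default dimensionPenalty of 0.1, comparing a 768-dimension vector against a 1536-dimension one gives dimRatio = 0.5, so the adapted cosine score is scaled by 1 - 0.1 * (1 - 0.5) = 0.95. A hedged sketch (not part of this commit):

import { dimensionAwareSimilarity } from './vector_utils.js';

const a = new Float32Array(768).fill(0.03);
const b = new Float32Array(1536).fill(0.03);

// The cosine part depends on how the smaller vector is padded; the final score
// is that cosine multiplied by the 0.95 penalty factor described above.
const score = dimensionAwareSimilarity(a, b, true);
console.log(score);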
/**
 | 
			
		||||
 * Ensemble similarity combines multiple metrics with weights
 | 
			
		||||
 */
 | 
			
		||||
export function ensembleSimilarity(
 | 
			
		||||
    a: Float32Array,
 | 
			
		||||
    b: Float32Array,
 | 
			
		||||
    options: SimilarityOptions
 | 
			
		||||
): number {
 | 
			
		||||
    // Default weights if not provided
 | 
			
		||||
    const weights = options.ensembleWeights ?? {
 | 
			
		||||
        [SimilarityMetric.COSINE]: 0.6,
 | 
			
		||||
        [SimilarityMetric.HYBRID]: 0.3,
 | 
			
		||||
        [SimilarityMetric.DIM_AWARE]: 0.1
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    let totalWeight = 0;
 | 
			
		||||
    let weightedSum = 0;
 | 
			
		||||
 | 
			
		||||
    // Compute each metric and apply weight
 | 
			
		||||
    for (const [metricStr, weight] of Object.entries(weights)) {
 | 
			
		||||
        const metric = metricStr as SimilarityMetric;
 | 
			
		||||
        if (weight && weight > 0) {
 | 
			
		||||
            // Skip the ensemble itself to avoid recursion
 | 
			
		||||
            if (metric !== SimilarityMetric.ENSEMBLE) {
 | 
			
		||||
                const similarity = computeSimilarity(a, b, {
 | 
			
		||||
                    metric,
 | 
			
		||||
                    normalize: options.normalize
 | 
			
		||||
                });
 | 
			
		||||
 | 
			
		||||
                weightedSum += similarity * weight;
 | 
			
		||||
                totalWeight += weight;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Normalize by total weight
 | 
			
		||||
    return totalWeight > 0 ? weightedSum / totalWeight : cosineSimilarity(a, b, options.normalize);
 | 
			
		||||
}
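
// Usage sketch (illustrative; assumes SimilarityOptions accepts the fields shown):
// const score = ensembleSimilarity(a, b, {
//     normalize: true,
//     ensembleWeights: {
//         [SimilarityMetric.COSINE]: 0.5,
//         [SimilarityMetric.HYBRID]: 0.3,
//         [SimilarityMetric.DIM_AWARE]: 0.2
//     }
// });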

/**
 * Debug configuration for vector operations
 */
export interface DebugConfig {
    enabled: boolean;
    logLevel: 'info' | 'debug' | 'warning' | 'error';
    recordStats: boolean;
}

/**
 * Global debug configuration, can be modified at runtime
 */
export const vectorDebugConfig: DebugConfig = {
    enabled: false,
    logLevel: 'info',
    recordStats: false
};
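
// Runtime toggle sketch: enable verbose logging and stats collection while debugging.
// vectorDebugConfig.enabled = true;
// vectorDebugConfig.logLevel = 'debug';
// vectorDebugConfig.recordStats = true;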

/**
 * Statistics collected during vector operations
 */
export interface AdaptationStats {
    timestamp: number;
    operation: string;
    sourceModel?: string;
    targetModel?: string;
    sourceDimension: number;
    targetDimension: number;
    strategy: string;
    similarity?: number;
}

// Collection of adaptation statistics for quality auditing
export const adaptationStats: AdaptationStats[] = [];

/**
 * Log a message if debugging is enabled
 */
function debugLog(
    message: string,
    level: 'info' | 'debug' | 'warning' | 'error' = 'info'
): void {
    if (vectorDebugConfig.enabled) {
        const levelOrder = { 'debug': 0, 'info': 1, 'warning': 2, 'error': 3 };

        if (levelOrder[level] >= levelOrder[vectorDebugConfig.logLevel]) {
            const prefix = `[VectorUtils:${level.toUpperCase()}]`;

            switch (level) {
                case 'error':
                    console.error(prefix, message);
                    break;
                case 'warning':
                    console.warn(prefix, message);
                    break;
                case 'debug':
                    console.debug(prefix, message);
                    break;
                default:
                    console.log(prefix, message);
            }
        }
    }
}

/**
 * Record adaptation statistics if enabled
 */
function recordAdaptationStats(stats: Omit<AdaptationStats, 'timestamp'>): void {
    if (vectorDebugConfig.enabled && vectorDebugConfig.recordStats) {
        adaptationStats.push({
            ...stats,
            timestamp: Date.now()
        });

        // Keep only the last 1000 stats to prevent memory issues
        if (adaptationStats.length > 1000) {
            adaptationStats.shift();
        }
    }
}

/**
 * Content types for embedding adaptation strategy selection
 */
export enum ContentType {
    GENERAL_TEXT = 'general_text',
    CODE = 'code',
    STRUCTURED_DATA = 'structured_data',
    MATHEMATICAL = 'mathematical',
    MIXED = 'mixed'
}

/**
 * Performance profile for selecting adaptation strategy
 */
export enum PerformanceProfile {
    MAXIMUM_QUALITY = 'maximum_quality',   // Prioritize similarity quality over speed
    BALANCED = 'balanced',                 // Balance quality and performance
    MAXIMUM_SPEED = 'maximum_speed'        // Prioritize speed over quality
}

/**
 * Context for selecting the optimal padding strategy
 */
export interface StrategySelectionContext {
    contentType?: ContentType;                 // Type of content being compared
    performanceProfile?: PerformanceProfile;   // Performance requirements
    sourceDimension: number;                   // Source embedding dimension
    targetDimension: number;                   // Target embedding dimension
    sourceModel?: string;                      // Source model identifier
    targetModel?: string;                      // Target model identifier
    isHighPrecisionRequired?: boolean;         // Whether high precision is needed
    isCrossModelComparison?: boolean;          // Whether comparing across different models
    dimensionRatio?: number;                   // Custom dimension ratio threshold
}

/**
 * Selects the optimal padding strategy based on content type and performance considerations
 * @param context Selection context parameters
 * @returns The most appropriate padding strategy and options
 */
export function selectOptimalPaddingStrategy(
    context: StrategySelectionContext
): AdaptationOptions {
    const {
        contentType = ContentType.GENERAL_TEXT,
        performanceProfile = PerformanceProfile.BALANCED,
        sourceDimension,
        targetDimension,
        isHighPrecisionRequired = false,
        isCrossModelComparison = false
    } = context;

    // Calculate dimension ratio
    const dimRatio = Math.min(sourceDimension, targetDimension) /
                     Math.max(sourceDimension, targetDimension);

    // Default options
    const options: AdaptationOptions = {
        strategy: PaddingStrategy.ZERO,
        normalize: true
    };

    // Significant dimension difference detection
    const hasSignificantDimDifference = dimRatio < (context.dimensionRatio || 0.5);

    // Select strategy based on content type
    switch (contentType) {
        case ContentType.CODE:
            // Code benefits from structural patterns
            options.strategy = PaddingStrategy.MIRROR;
            break;

        case ContentType.STRUCTURED_DATA:
            // Structured data works well with mean-value padding
            options.strategy = PaddingStrategy.MEAN;
            break;

        case ContentType.MATHEMATICAL:
            // Mathematical content benefits from gaussian noise to maintain statistical properties
            options.strategy = PaddingStrategy.GAUSSIAN;
            options.variance = 0.005; // Lower variance for mathematical precision
            break;

        case ContentType.MIXED:
            // For mixed content, choose based on performance profile
            if (performanceProfile === PerformanceProfile.MAXIMUM_QUALITY) {
                options.strategy = PaddingStrategy.GAUSSIAN;
            } else if (performanceProfile === PerformanceProfile.MAXIMUM_SPEED) {
                options.strategy = PaddingStrategy.ZERO;
            } else {
                options.strategy = PaddingStrategy.MEAN;
            }
            break;

        case ContentType.GENERAL_TEXT:
        default:
            // For general text, base decision on other factors
            if (isHighPrecisionRequired) {
                options.strategy = PaddingStrategy.GAUSSIAN;
            } else if (isCrossModelComparison) {
                options.strategy = PaddingStrategy.MEAN;
            } else {
                options.strategy = PaddingStrategy.ZERO;
            }
            break;
    }

    // Override based on performance profile if we have significant dimension differences
    if (hasSignificantDimDifference) {
        // For extreme dimension differences, specialized handling
        if (performanceProfile === PerformanceProfile.MAXIMUM_QUALITY) {
            // For quality, use gaussian noise for better statistical matching
            options.strategy = PaddingStrategy.GAUSSIAN;
            // Adjust variance based on dimension ratio
            options.variance = Math.min(0.01, 0.02 * dimRatio);

            // Log the significant dimension adaptation
            debugLog(`Significant dimension difference detected: ${sourceDimension} vs ${targetDimension}. ` +
                     `Ratio: ${dimRatio.toFixed(2)}. Using Gaussian strategy.`, 'warning');
        } else if (performanceProfile === PerformanceProfile.MAXIMUM_SPEED) {
            // For speed, stick with zero padding
            options.strategy = PaddingStrategy.ZERO;
        }
    }

    // Always use zero padding for trivial dimension differences
    // (e.g. 1536 vs 1537) for performance reasons
    if (Math.abs(sourceDimension - targetDimension) <= 5) {
        options.strategy = PaddingStrategy.ZERO;
    }

    // Log the selected strategy
    debugLog(`Selected padding strategy: ${options.strategy} for ` +
             `content type: ${contentType}, performance profile: ${performanceProfile}`, 'debug');

    return options;
}
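
// Usage sketch (illustrative values): picking a strategy for a cross-model comparison.
// const adaptOptions = selectOptimalPaddingStrategy({
//     contentType: ContentType.CODE,
//     performanceProfile: PerformanceProfile.BALANCED,
//     sourceDimension: 768,
//     targetDimension: 1536,
//     isCrossModelComparison: true
// });
// // => { strategy: PaddingStrategy.MIRROR, normalize: true }
// //    (dimRatio of exactly 0.5 does not trigger the Gaussian override above)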

/**
 * Helper function to determine content type from note context
 * @param mime The note's mime type
 * @param content Optional note content for deeper pattern-based analysis
 * @returns The detected content type
 */
export function detectContentType(mime: string, content?: string): ContentType {
    // Detect based on mime type
    if (mime.includes('code') ||
        mime.includes('javascript') ||
        mime.includes('typescript') ||
        mime.includes('python') ||
        mime.includes('java') ||
        mime.includes('c++') ||
        mime.includes('json')) {
        return ContentType.CODE;
    }

    if (mime.includes('xml') ||
        mime.includes('csv') ||
        mime.includes('sql') ||
        mime.endsWith('+json')) {
        return ContentType.STRUCTURED_DATA;
    }

    if (mime.includes('latex') ||
        mime.includes('mathml') ||
        mime.includes('tex')) {
        return ContentType.MATHEMATICAL;
    }

    // If we have content, we can do deeper analysis
    if (content) {
        // Detect code by looking for common patterns
        const codePatterns = [
            /function\s+\w+\s*\(.*\)\s*{/,  // JavaScript/TypeScript function
            /def\s+\w+\s*\(.*\):/,          // Python function
            /class\s+\w+(\s+extends\s+\w+)?(\s+implements\s+\w+)?\s*{/, // Java/TypeScript class
            /import\s+.*\s+from\s+['"]/,    // JS/TS import
            /^\s*```\w+/m                    // Markdown code block
        ];

        if (codePatterns.some(pattern => pattern.test(content))) {
            return ContentType.CODE;
        }

        // Detect structured data
        const structuredPatterns = [
            /^\s*[{\[]/,                     // JSON-like start
            /^\s*<\?xml/,                    // XML declaration
            /^\s*<[a-z]+>/i,                 // HTML/XML tag
            /^\s*(\w+,)+\w+$/m,              // CSV-like
            /CREATE\s+TABLE|SELECT\s+.*\s+FROM/i  // SQL
        ];

        if (structuredPatterns.some(pattern => pattern.test(content))) {
            return ContentType.STRUCTURED_DATA;
        }

        // Detect mathematical content
        const mathPatterns = [
            /\$\$.*\$\$/s,                   // LaTeX block
            /\\begin{equation}/,             // LaTeX equation environment
            /\\sum|\\int|\\frac|\\sqrt/,     // Common LaTeX math commands
        ];

        if (mathPatterns.some(pattern => pattern.test(content))) {
            return ContentType.MATHEMATICAL;
        }

        // Check for mixed content
        const hasMixedContent =
            (codePatterns.some(pattern => pattern.test(content)) &&
             content.split(/\s+/).length > 100) || // Code and substantial text
            (content.includes('```') &&
             content.replace(/```.*?```/gs, '').length > 200); // Markdown with code blocks and text

        if (hasMixedContent) {
            return ContentType.MIXED;
        }
    }

    // Default to general text
    return ContentType.GENERAL_TEXT;
}
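
// Usage sketch (illustrative mime types and snippets):
// detectContentType('application/json');               // => ContentType.CODE ('json' matches before the structured-data check)
// detectContentType('text/csv');                       // => ContentType.STRUCTURED_DATA
// detectContentType('text/plain', 'def parse(row):');  // => ContentType.CODE (Python function pattern)
// detectContentType('text/plain', 'Just prose.');      // => ContentType.GENERAL_TEXT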

@ -565,7 +565,7 @@ class IndexService {
            }

            // Get Note IDs to search, optionally filtered by branch
            let similarNotes = [];
            let similarNotes: { noteId: string; title: string; similarity: number; contentType?: string }[] = [];

            // Check if we need to restrict search to a specific branch
            if (contextNoteId) {
@ -593,6 +593,9 @@ class IndexService {
                // Get embeddings for all notes in the branch
                const config = provider.getConfig();

                // Import the ContentType detection from vector utils
                const { ContentType, detectContentType, cosineSimilarity } = await import('./embeddings/vector_utils.js');

                for (const noteId of branchNoteIds) {
                    const noteEmbedding = await vectorStore.getEmbeddingForNote(
                        noteId,
@ -601,14 +604,29 @@ class IndexService {
                    );

                    if (noteEmbedding) {
                        const similarity = vectorStore.cosineSimilarity(embedding, noteEmbedding.embedding);
                        if (similarity >= this.defaultSimilarityThreshold) {
                        // Get the note to determine its content type
                        const note = becca.getNote(noteId);
                        if (note) {
                            // Detect content type from mime type
                            const contentType = detectContentType(note.mime, '');

                            // Use content-aware similarity calculation
                            const similarity = cosineSimilarity(
                                embedding,
                                noteEmbedding.embedding,
                                true, // normalize
                                config.model, // source model
                                noteEmbedding.providerId, // target model (use providerId)
                                contentType, // content type for padding strategy
                                undefined // use default BALANCED performance profile
                            );

                            if (similarity >= this.defaultSimilarityThreshold) {
                                similarNotes.push({
                                    noteId,
                                    title: note.title,
                                    similarity
                                    similarity,
                                    contentType: contentType.toString()
                                });
                            }
                        }
@ -622,7 +640,7 @@ class IndexService {
            } else {
                // Search across all notes
                const config = provider.getConfig();
                similarNotes = await vectorStore.findSimilarNotes(
                const results = await vectorStore.findSimilarNotes(
                    embedding,
                    provider.name,
                    config.model,
@ -631,14 +649,17 @@ class IndexService {
                );

                // Enhance results with note titles
                return similarNotes.map(result => {
                similarNotes = results.map(result => {
                    const note = becca.getNote(result.noteId);
                    return {
                        noteId: result.noteId,
                        title: note ? note.title : 'Unknown Note',
                        similarity: result.similarity
                        similarity: result.similarity,
                        contentType: result.contentType
                    };
                });

                return similarNotes;
            }
        } catch (error: any) {
            log.error(`Error finding similar notes: ${error.message || "Unknown error"}`);