mirror of
https://github.com/TriliumNext/Notes.git
synced 2025-11-09 02:41:39 +08:00
set up embedding normalization
This commit is contained in:
parent
08f7f1962b
commit
f05fe3f72b
@ -1,4 +1,5 @@
|
|||||||
import type { EmbeddingProvider, EmbeddingConfig, NoteEmbeddingContext } from './embeddings_interface.js';
|
import type { EmbeddingProvider, EmbeddingConfig, NoteEmbeddingContext } from './embeddings_interface.js';
|
||||||
|
import { NormalizationStatus } from './embeddings_interface.js';
|
||||||
import log from "../../log.js";
|
import log from "../../log.js";
|
||||||
import { LLM_CONSTANTS } from "../../../routes/api/llm.js";
|
import { LLM_CONSTANTS } from "../../../routes/api/llm.js";
|
||||||
import options from "../../options.js";
|
import options from "../../options.js";
|
||||||
@ -23,6 +24,15 @@ export abstract class BaseEmbeddingProvider implements EmbeddingProvider {
|
|||||||
return { ...this.config };
|
return { ...this.config };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reports how reliably this provider's embeddings are unit-length.
 *
 * The base implementation reads `normalizationStatus` from the provider
 * config and falls back to `NormalizationStatus.UNKNOWN` when the config
 * does not carry one.
 *
 * @returns the configured status, or UNKNOWN when none is configured
 */
getNormalizationStatus(): NormalizationStatus {
    const { normalizationStatus } = this.config;
    return normalizationStatus || NormalizationStatus.UNKNOWN;
}
|
||||||
|
|
||||||
getDimension(): number {
|
getDimension(): number {
|
||||||
return this.config.dimension;
|
return this.config.dimension;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -42,6 +42,35 @@ export interface NoteEmbeddingContext {
|
|||||||
export interface EmbeddingModelInfo {
    /** Length of the embedding vectors produced by the model */
    dimension: number;

    /** Context window size of the model (presumably in tokens; confirm against the provider constants) */
    contextWindow: number;

    /** True when the model guarantees its vectors are normalized (unit length) */
    guaranteesNormalization: boolean;
}
|
||||||
|
|
||||||
|
/**
 * Describes whether the embeddings returned by a provider can be assumed
 * to be unit-length vectors.
 */
export enum NormalizationStatus {
    /** The provider guarantees every embedding is normalized to a unit vector */
    GUARANTEED = 'guaranteed',

    /** No guarantee is given, but embeddings are usually normalized */
    USUALLY = 'usually',

    /** No guarantee is given; embeddings must be normalized before use */
    NEVER = 'never',

    /** The status is not known and should be checked at runtime */
    UNKNOWN = 'unknown'
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -51,7 +80,16 @@ export interface EmbeddingConfig {
|
|||||||
model: string;
|
model: string;
|
||||||
dimension: number;
|
dimension: number;
|
||||||
type: 'float32' | 'float64';
|
type: 'float32' | 'float64';
|
||||||
|
/**
|
||||||
|
* Whether embeddings should be normalized before use
|
||||||
|
* If true, normalization will always be applied
|
||||||
|
* If false, normalization depends on provider's status
|
||||||
|
*/
|
||||||
normalize?: boolean;
|
normalize?: boolean;
|
||||||
|
/**
|
||||||
|
* The normalization status of this provider
|
||||||
|
*/
|
||||||
|
normalizationStatus?: NormalizationStatus;
|
||||||
batchSize?: number;
|
batchSize?: number;
|
||||||
contextWindowSize?: number;
|
contextWindowSize?: number;
|
||||||
apiKey?: string;
|
apiKey?: string;
|
||||||
@ -65,6 +103,17 @@ export interface EmbeddingProvider {
|
|||||||
name: string;
|
name: string;
|
||||||
getConfig(): EmbeddingConfig;
|
getConfig(): EmbeddingConfig;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns information about the normalization status of this provider
|
||||||
|
*/
|
||||||
|
getNormalizationStatus(): NormalizationStatus;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verify that embeddings are properly normalized
|
||||||
|
* @returns true if embeddings are properly normalized
|
||||||
|
*/
|
||||||
|
verifyNormalization?(sample?: Float32Array): Promise<boolean>;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate embeddings for a single piece of text
|
* Generate embeddings for a single piece of text
|
||||||
*/
|
*/
|
||||||
|
|||||||
@ -4,6 +4,7 @@ import sql from "../../sql.js";
|
|||||||
import dateUtils from "../../date_utils.js";
|
import dateUtils from "../../date_utils.js";
|
||||||
import { randomString } from "../../utils.js";
|
import { randomString } from "../../utils.js";
|
||||||
import type { EmbeddingProvider, EmbeddingConfig } from "./embeddings_interface.js";
|
import type { EmbeddingProvider, EmbeddingConfig } from "./embeddings_interface.js";
|
||||||
|
import { NormalizationStatus } from "./embeddings_interface.js";
|
||||||
import { OpenAIEmbeddingProvider } from "./providers/openai.js";
|
import { OpenAIEmbeddingProvider } from "./providers/openai.js";
|
||||||
import { OllamaEmbeddingProvider } from "./providers/ollama.js";
|
import { OllamaEmbeddingProvider } from "./providers/ollama.js";
|
||||||
import { VoyageEmbeddingProvider } from "./providers/voyage.js";
|
import { VoyageEmbeddingProvider } from "./providers/voyage.js";
|
||||||
@ -25,6 +26,14 @@ class SimpleLocalEmbeddingProvider implements EmbeddingProvider {
|
|||||||
return this.config;
|
return this.config;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Normalization status of the simple local provider.
 *
 * Always reports NEVER: the simple local embedding does not normalize its
 * vectors, so consumers have to normalize them before use.
 */
getNormalizationStatus(): NormalizationStatus {
    return NormalizationStatus.NEVER;
}
|
||||||
|
|
||||||
async generateEmbeddings(text: string): Promise<Float32Array> {
|
async generateEmbeddings(text: string): Promise<Float32Array> {
|
||||||
// Create deterministic embeddings based on text content
|
// Create deterministic embeddings based on text content
|
||||||
const result = new Float32Array(this.config.dimension || 384);
|
const result = new Float32Array(this.config.dimension || 384);
|
||||||
|
|||||||
@ -2,6 +2,7 @@ import axios from "axios";
|
|||||||
import log from "../../../log.js";
|
import log from "../../../log.js";
|
||||||
import { BaseEmbeddingProvider } from "../base_embeddings.js";
|
import { BaseEmbeddingProvider } from "../base_embeddings.js";
|
||||||
import type { EmbeddingConfig, EmbeddingModelInfo } from "../embeddings_interface.js";
|
import type { EmbeddingConfig, EmbeddingModelInfo } from "../embeddings_interface.js";
|
||||||
|
import { NormalizationStatus } from "../embeddings_interface.js";
|
||||||
import { LLM_CONSTANTS } from "../../../../routes/api/llm.js";
|
import { LLM_CONSTANTS } from "../../../../routes/api/llm.js";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -63,7 +64,8 @@ export class OllamaEmbeddingProvider extends BaseEmbeddingProvider {
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
dimension: embeddingDimension || 0, // We'll detect this separately if not provided
|
dimension: embeddingDimension || 0, // We'll detect this separately if not provided
|
||||||
contextWindow: contextWindow
|
contextWindow: contextWindow,
|
||||||
|
guaranteesNormalization: false // Ollama models don't guarantee normalized embeddings
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
@ -113,7 +115,11 @@ export class OllamaEmbeddingProvider extends BaseEmbeddingProvider {
|
|||||||
const contextWindow = (LLM_CONSTANTS.OLLAMA_MODEL_CONTEXT_WINDOWS as Record<string, number>)[baseModelName] ||
|
const contextWindow = (LLM_CONSTANTS.OLLAMA_MODEL_CONTEXT_WINDOWS as Record<string, number>)[baseModelName] ||
|
||||||
(LLM_CONSTANTS.OLLAMA_MODEL_CONTEXT_WINDOWS as Record<string, number>).default;
|
(LLM_CONSTANTS.OLLAMA_MODEL_CONTEXT_WINDOWS as Record<string, number>).default;
|
||||||
|
|
||||||
const modelInfo: EmbeddingModelInfo = { dimension, contextWindow };
|
const modelInfo: EmbeddingModelInfo = {
|
||||||
|
dimension,
|
||||||
|
contextWindow,
|
||||||
|
guaranteesNormalization: false // Ollama models don't guarantee normalized embeddings
|
||||||
|
};
|
||||||
this.modelInfoCache.set(modelName, modelInfo);
|
this.modelInfoCache.set(modelName, modelInfo);
|
||||||
this.config.dimension = dimension;
|
this.config.dimension = dimension;
|
||||||
|
|
||||||
@ -131,7 +137,11 @@ export class OllamaEmbeddingProvider extends BaseEmbeddingProvider {
|
|||||||
|
|
||||||
log.info(`Using default parameters for model ${modelName}: dimension ${dimension}, context ${contextWindow}`);
|
log.info(`Using default parameters for model ${modelName}: dimension ${dimension}, context ${contextWindow}`);
|
||||||
|
|
||||||
const modelInfo: EmbeddingModelInfo = { dimension, contextWindow };
|
const modelInfo: EmbeddingModelInfo = {
|
||||||
|
dimension,
|
||||||
|
contextWindow,
|
||||||
|
guaranteesNormalization: false // Ollama models don't guarantee normalized embeddings
|
||||||
|
};
|
||||||
this.modelInfoCache.set(modelName, modelInfo);
|
this.modelInfoCache.set(modelName, modelInfo);
|
||||||
this.config.dimension = dimension;
|
this.config.dimension = dimension;
|
||||||
|
|
||||||
@ -302,4 +312,12 @@ export class OllamaEmbeddingProvider extends BaseEmbeddingProvider {
|
|||||||
throw new Error(`Ollama batch embedding error: ${errorMessage}`);
|
throw new Error(`Ollama batch embedding error: ${errorMessage}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Normalization status for Ollama embeddings.
 *
 * Ollama embeddings are not guaranteed to be normalized, so this
 * conservatively reports NEVER, the status under which embeddings are
 * always normalized before use.
 */
getNormalizationStatus(): NormalizationStatus {
    return NormalizationStatus.NEVER;
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -2,6 +2,7 @@ import axios from "axios";
|
|||||||
import log from "../../../log.js";
|
import log from "../../../log.js";
|
||||||
import { BaseEmbeddingProvider } from "../base_embeddings.js";
|
import { BaseEmbeddingProvider } from "../base_embeddings.js";
|
||||||
import type { EmbeddingConfig, EmbeddingModelInfo } from "../embeddings_interface.js";
|
import type { EmbeddingConfig, EmbeddingModelInfo } from "../embeddings_interface.js";
|
||||||
|
import { NormalizationStatus } from "../embeddings_interface.js";
|
||||||
import { LLM_CONSTANTS } from "../../../../routes/api/llm.js";
|
import { LLM_CONSTANTS } from "../../../../routes/api/llm.js";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -105,7 +106,8 @@ export class OpenAIEmbeddingProvider extends BaseEmbeddingProvider {
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
dimension,
|
dimension,
|
||||||
contextWindow
|
contextWindow,
|
||||||
|
guaranteesNormalization: true // OpenAI embeddings are normalized to unit length
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
@ -141,7 +143,11 @@ export class OpenAIEmbeddingProvider extends BaseEmbeddingProvider {
|
|||||||
// Use default context window
|
// Use default context window
|
||||||
let contextWindow = LLM_CONSTANTS.CONTEXT_WINDOW.OPENAI;
|
let contextWindow = LLM_CONSTANTS.CONTEXT_WINDOW.OPENAI;
|
||||||
|
|
||||||
const modelInfo: EmbeddingModelInfo = { dimension, contextWindow };
|
const modelInfo: EmbeddingModelInfo = {
|
||||||
|
dimension,
|
||||||
|
contextWindow,
|
||||||
|
guaranteesNormalization: true // OpenAI embeddings are normalized to unit length
|
||||||
|
};
|
||||||
this.modelInfoCache.set(modelName, modelInfo);
|
this.modelInfoCache.set(modelName, modelInfo);
|
||||||
this.config.dimension = dimension;
|
this.config.dimension = dimension;
|
||||||
|
|
||||||
@ -154,7 +160,11 @@ export class OpenAIEmbeddingProvider extends BaseEmbeddingProvider {
|
|||||||
|
|
||||||
log.info(`Using default parameters for OpenAI model ${modelName}: dimension ${dimension}, context ${contextWindow}`);
|
log.info(`Using default parameters for OpenAI model ${modelName}: dimension ${dimension}, context ${contextWindow}`);
|
||||||
|
|
||||||
const modelInfo: EmbeddingModelInfo = { dimension, contextWindow };
|
const modelInfo: EmbeddingModelInfo = {
|
||||||
|
dimension,
|
||||||
|
contextWindow,
|
||||||
|
guaranteesNormalization: true // OpenAI embeddings are normalized to unit length
|
||||||
|
};
|
||||||
this.modelInfoCache.set(modelName, modelInfo);
|
this.modelInfoCache.set(modelName, modelInfo);
|
||||||
this.config.dimension = dimension;
|
this.config.dimension = dimension;
|
||||||
|
|
||||||
@ -288,4 +298,12 @@ export class OpenAIEmbeddingProvider extends BaseEmbeddingProvider {
|
|||||||
throw new Error(`OpenAI batch embedding error: ${errorMessage}`);
|
throw new Error(`OpenAI batch embedding error: ${errorMessage}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Normalization status for OpenAI embeddings.
 *
 * Reports GUARANTEED: OpenAI embeddings are treated as always normalized
 * to unit length.
 */
getNormalizationStatus(): NormalizationStatus {
    return NormalizationStatus.GUARANTEED;
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -2,6 +2,7 @@ import axios from "axios";
|
|||||||
import log from "../../../log.js";
|
import log from "../../../log.js";
|
||||||
import { BaseEmbeddingProvider } from "../base_embeddings.js";
|
import { BaseEmbeddingProvider } from "../base_embeddings.js";
|
||||||
import type { EmbeddingConfig, EmbeddingModelInfo } from "../embeddings_interface.js";
|
import type { EmbeddingConfig, EmbeddingModelInfo } from "../embeddings_interface.js";
|
||||||
|
import { NormalizationStatus } from "../embeddings_interface.js";
|
||||||
import { LLM_CONSTANTS } from "../../../../routes/api/llm.js";
|
import { LLM_CONSTANTS } from "../../../../routes/api/llm.js";
|
||||||
|
|
||||||
// Voyage model context window sizes - as of current API version
|
// Voyage model context window sizes - as of current API version
|
||||||
@ -68,7 +69,8 @@ export class VoyageEmbeddingProvider extends BaseEmbeddingProvider {
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
dimension,
|
dimension,
|
||||||
contextWindow
|
contextWindow,
|
||||||
|
guaranteesNormalization: true // Voyage embeddings are typically normalized
|
||||||
};
|
};
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
log.info(`Could not determine capabilities for Voyage AI model ${modelName}: ${error}`);
|
log.info(`Could not determine capabilities for Voyage AI model ${modelName}: ${error}`);
|
||||||
@ -96,7 +98,8 @@ export class VoyageEmbeddingProvider extends BaseEmbeddingProvider {
|
|||||||
// Use known dimension
|
// Use known dimension
|
||||||
const modelInfo: EmbeddingModelInfo = {
|
const modelInfo: EmbeddingModelInfo = {
|
||||||
dimension: knownDimension,
|
dimension: knownDimension,
|
||||||
contextWindow
|
contextWindow,
|
||||||
|
guaranteesNormalization: true // Voyage embeddings are typically normalized
|
||||||
};
|
};
|
||||||
|
|
||||||
this.modelInfoCache.set(modelName, modelInfo);
|
this.modelInfoCache.set(modelName, modelInfo);
|
||||||
@ -109,28 +112,41 @@ export class VoyageEmbeddingProvider extends BaseEmbeddingProvider {
|
|||||||
const testEmbedding = await this.generateEmbeddings("Test");
|
const testEmbedding = await this.generateEmbeddings("Test");
|
||||||
const dimension = testEmbedding.length;
|
const dimension = testEmbedding.length;
|
||||||
|
|
||||||
const modelInfo: EmbeddingModelInfo = {
|
// Set model info based on the model name, detected dimension, and reasonable defaults
|
||||||
dimension,
|
if (modelName.includes('voyage-2')) {
|
||||||
contextWindow
|
return {
|
||||||
|
dimension: dimension || 1024,
|
||||||
|
contextWindow: 4096,
|
||||||
|
guaranteesNormalization: true // Voyage-2 embeddings are normalized
|
||||||
};
|
};
|
||||||
|
} else if (modelName.includes('voyage-lite-02')) {
|
||||||
this.modelInfoCache.set(modelName, modelInfo);
|
return {
|
||||||
this.config.dimension = dimension;
|
dimension: dimension || 768,
|
||||||
|
contextWindow: 4096,
|
||||||
log.info(`Detected Voyage AI model ${modelName} with dimension ${dimension} (context: ${contextWindow})`);
|
guaranteesNormalization: true // Voyage-lite embeddings are normalized
|
||||||
return modelInfo;
|
};
|
||||||
|
} else {
|
||||||
|
// Default for other Voyage models
|
||||||
|
return {
|
||||||
|
dimension: dimension || 1024,
|
||||||
|
contextWindow: 4096,
|
||||||
|
guaranteesNormalization: true // Assuming all Voyage embeddings are normalized
|
||||||
|
};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
// If detection fails, use defaults
|
log.info(`Could not fetch model info from Voyage AI API: ${error.message}. Using defaults.`);
|
||||||
const dimension = 1024; // Default for Voyage models
|
|
||||||
|
|
||||||
log.info(`Using default parameters for Voyage AI model ${modelName}: dimension ${dimension}, context ${contextWindow}`);
|
// Use default parameters if everything else fails
|
||||||
|
const defaultModelInfo: EmbeddingModelInfo = {
|
||||||
|
dimension: 1024, // Default for Voyage models
|
||||||
|
contextWindow: 8192,
|
||||||
|
guaranteesNormalization: true // Voyage embeddings are typically normalized
|
||||||
|
};
|
||||||
|
|
||||||
const modelInfo: EmbeddingModelInfo = { dimension, contextWindow };
|
this.modelInfoCache.set(modelName, defaultModelInfo);
|
||||||
this.modelInfoCache.set(modelName, modelInfo);
|
this.config.dimension = defaultModelInfo.dimension;
|
||||||
this.config.dimension = dimension;
|
return defaultModelInfo;
|
||||||
|
|
||||||
return modelInfo;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -251,4 +267,12 @@ export class VoyageEmbeddingProvider extends BaseEmbeddingProvider {
|
|||||||
throw new Error(`Voyage AI batch embedding error: ${errorMessage}`);
|
throw new Error(`Voyage AI batch embedding error: ${errorMessage}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Normalization status for Voyage embeddings.
 *
 * Reports GUARANTEED: embeddings returned by the Voyage API are treated as
 * already normalized.
 */
getNormalizationStatus(): NormalizationStatus {
    return NormalizationStatus.GUARANTEED;
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -112,6 +112,26 @@ export async function getEmbeddingForNote(noteId: string, providerId: string, mo
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Shape of one embedding row as returned by the note_embeddings queries.
 */
interface EmbeddingRow {
    embedId: string;
    noteId: string;
    providerId: string;
    modelId: string;
    /** Vector length; passed to bufferToEmbedding together with `embedding` */
    dimension: number;
    /** Serialized vector, decoded with bufferToEmbedding */
    embedding: Buffer;
    // The remaining columns are optional; presumably they come from the joined
    // notes table (the SELECT list is not visible here - confirm).
    title?: string;
    type?: string;
    mime?: string;
    isDeleted?: number;
}
|
||||||
|
|
||||||
|
/**
 * An embedding row annotated with the provider and model the query
 * embedding was produced with, so the stored model can be compared against
 * the query model.
 */
interface EnhancedEmbeddingRow extends EmbeddingRow {
    /** Provider that produced the query embedding */
    queryProviderId: string;
    /** Model that produced the query embedding */
    queryModelId: string;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Finds similar notes based on vector similarity
|
* Finds similar notes based on vector similarity
|
||||||
*/
|
*/
|
||||||
@ -122,7 +142,7 @@ export async function findSimilarNotes(
|
|||||||
limit = 10,
|
limit = 10,
|
||||||
threshold?: number, // Made optional to use constants
|
threshold?: number, // Made optional to use constants
|
||||||
useFallback = true // Whether to try other providers if no embeddings found
|
useFallback = true // Whether to try other providers if no embeddings found
|
||||||
): Promise<{noteId: string, similarity: number}[]> {
|
): Promise<{noteId: string, similarity: number, contentType?: string}[]> {
|
||||||
// Import constants dynamically to avoid circular dependencies
|
// Import constants dynamically to avoid circular dependencies
|
||||||
const llmModule = await import('../../../routes/api/llm.js');
|
const llmModule = await import('../../../routes/api/llm.js');
|
||||||
// Use a default threshold of 0.65 if not provided
|
// Use a default threshold of 0.65 if not provided
|
||||||
@ -138,11 +158,30 @@ export async function findSimilarNotes(
|
|||||||
FROM note_embeddings ne
|
FROM note_embeddings ne
|
||||||
JOIN notes n ON ne.noteId = n.noteId
|
JOIN notes n ON ne.noteId = n.noteId
|
||||||
WHERE ne.providerId = ? AND ne.modelId = ? AND n.isDeleted = 0
|
WHERE ne.providerId = ? AND ne.modelId = ? AND n.isDeleted = 0
|
||||||
`, [providerId, modelId]);
|
`, [providerId, modelId]) as EmbeddingRow[];
|
||||||
|
|
||||||
if (embeddings && embeddings.length > 0) {
|
if (embeddings && embeddings.length > 0) {
|
||||||
log.info(`Found ${embeddings.length} embeddings for provider ${providerId}, model ${modelId}`);
|
log.info(`Found ${embeddings.length} embeddings for provider ${providerId}, model ${modelId}`);
|
||||||
return await processEmbeddings(embedding, embeddings, actualThreshold, limit);
|
|
||||||
|
// Add query model information to each embedding for cross-model comparison
|
||||||
|
const enhancedEmbeddings: EnhancedEmbeddingRow[] = embeddings.map(e => {
|
||||||
|
return {
|
||||||
|
embedId: e.embedId,
|
||||||
|
noteId: e.noteId,
|
||||||
|
providerId: e.providerId,
|
||||||
|
modelId: e.modelId,
|
||||||
|
dimension: e.dimension,
|
||||||
|
embedding: e.embedding,
|
||||||
|
title: e.title,
|
||||||
|
type: e.type,
|
||||||
|
mime: e.mime,
|
||||||
|
isDeleted: e.isDeleted,
|
||||||
|
queryProviderId: providerId,
|
||||||
|
queryModelId: modelId
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
return await processEmbeddings(embedding, enhancedEmbeddings, actualThreshold, limit);
|
||||||
}
|
}
|
||||||
|
|
||||||
// If no embeddings found and fallback is allowed, try other providers
|
// If no embeddings found and fallback is allowed, try other providers
|
||||||
@ -195,10 +234,28 @@ export async function findSimilarNotes(
|
|||||||
FROM note_embeddings ne
|
FROM note_embeddings ne
|
||||||
JOIN notes n ON ne.noteId = n.noteId
|
JOIN notes n ON ne.noteId = n.noteId
|
||||||
WHERE ne.providerId = ? AND ne.modelId = ? AND n.isDeleted = 0
|
WHERE ne.providerId = ? AND ne.modelId = ? AND n.isDeleted = 0
|
||||||
`, [bestAlternative.providerId, bestAlternative.modelId]);
|
`, [bestAlternative.providerId, bestAlternative.modelId]) as EmbeddingRow[];
|
||||||
|
|
||||||
if (alternativeEmbeddings && alternativeEmbeddings.length > 0) {
|
if (alternativeEmbeddings && alternativeEmbeddings.length > 0) {
|
||||||
return await processEmbeddings(embedding, alternativeEmbeddings, actualThreshold, limit);
|
// Add query model information to each embedding for cross-model comparison
|
||||||
|
const enhancedEmbeddings: EnhancedEmbeddingRow[] = alternativeEmbeddings.map(e => {
|
||||||
|
return {
|
||||||
|
embedId: e.embedId,
|
||||||
|
noteId: e.noteId,
|
||||||
|
providerId: e.providerId,
|
||||||
|
modelId: e.modelId,
|
||||||
|
dimension: e.dimension,
|
||||||
|
embedding: e.embedding,
|
||||||
|
title: e.title,
|
||||||
|
type: e.type,
|
||||||
|
mime: e.mime,
|
||||||
|
isDeleted: e.isDeleted,
|
||||||
|
queryProviderId: providerId,
|
||||||
|
queryModelId: modelId
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
return await processEmbeddings(embedding, enhancedEmbeddings, actualThreshold, limit);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -256,17 +313,58 @@ export async function findSimilarNotes(
|
|||||||
|
|
||||||
/**
 * Scores stored embeddings against a query embedding and returns the best matches.
 *
 * Each row's buffer is decoded with bufferToEmbedding and compared to the query
 * with enhancedCosineSimilarity (normalization enabled). Rows scoring below
 * `threshold` are dropped; the rest are sorted by descending similarity and
 * truncated to `limit`.
 *
 * Side effect: vector debug logging and stats recording are forced on while the
 * comparison runs, and the previous settings are restored afterwards.
 *
 * @param queryEmbedding the embedding to compare against
 * @param embeddings rows carrying at least noteId, embedding, dimension and modelId;
 *                   mime and queryModelId are used when present
 * @param threshold minimum similarity (inclusive) for a row to be returned
 * @param limit maximum number of results
 * @returns matches with their similarity and detected content type, best first
 */
async function processEmbeddings(queryEmbedding: Float32Array, embeddings: any[], threshold: number, limit: number) {
    const {
        enhancedCosineSimilarity,
        bufferToEmbedding,
        ContentType,
        PerformanceProfile,
        detectContentType,
        vectorDebugConfig
    } = await import('./vector_utils.js');

    // Enable debug logging temporarily for testing content-aware adaptation.
    // All three settings are captured so that each one can be put back exactly
    // as it was (recordStats used to be reset to false unconditionally).
    const originalDebugEnabled = vectorDebugConfig.enabled;
    const originalLogLevel = vectorDebugConfig.logLevel;
    const originalRecordStats = vectorDebugConfig.recordStats;
    vectorDebugConfig.enabled = true;
    vectorDebugConfig.logLevel = 'debug';
    vectorDebugConfig.recordStats = true;

    const similarities: { noteId: string; similarity: number; contentType: string }[] = [];

    // For most similarity searches, BALANCED is a good default
    const performanceProfile = PerformanceProfile.BALANCED;

    try {
        for (const e of embeddings) {
            const embVector = bufferToEmbedding(e.embedding, e.dimension);

            // Detect content type from mime type if available
            let contentType = ContentType.GENERAL_TEXT;
            if (e.mime) {
                contentType = detectContentType(e.mime);
                // NOTE(review): per-note console output; consider routing through the project logger
                console.log(`Note ID: ${e.noteId}, Mime: ${e.mime}, Detected content type: ${contentType}`);
            }

            // Calculate similarity with content-aware parameters
            const similarity = enhancedCosineSimilarity(
                queryEmbedding,
                embVector,
                true, // normalize vectors to ensure consistent comparison
                e.queryModelId, // source model ID
                e.modelId, // target model ID
                contentType, // content-specific padding strategy
                performanceProfile
            );

            if (similarity >= threshold) {
                similarities.push({
                    noteId: e.noteId,
                    similarity: similarity,
                    contentType: contentType.toString()
                });
            }
        }

        return similarities
            .sort((a, b) => b.similarity - a.similarity)
            .slice(0, limit);
    } finally {
        // Restore original debug settings
        vectorDebugConfig.enabled = originalDebugEnabled;
        vectorDebugConfig.logLevel = originalLogLevel;
        vectorDebugConfig.recordStats = originalRecordStats;
    }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@ -1,34 +1,144 @@
|
|||||||
/**
 * Cosine similarity between two embedding vectors.
 *
 * Thin wrapper that forwards every argument unchanged to
 * enhancedCosineSimilarity, which also handles vectors of different
 * dimensions.
 *
 * @param a First vector
 * @param b Second vector
 * @param normalize Optional flag to normalize vectors before comparison (default: false)
 * @param sourceModel Optional identifier for the source model
 * @param targetModel Optional identifier for the target model
 * @param contentType Optional content type for strategy selection
 * @param performanceProfile Optional performance profile
 * @returns the similarity computed by enhancedCosineSimilarity
 */
export function cosineSimilarity(
    a: Float32Array,
    b: Float32Array,
    normalize: boolean = false,
    sourceModel?: string,
    targetModel?: string,
    contentType?: ContentType,
    performanceProfile?: PerformanceProfile
): number {
    const similarity = enhancedCosineSimilarity(
        a,
        b,
        normalize,
        sourceModel,
        targetModel,
        contentType,
        performanceProfile
    );
    return similarity;
}
|
||||||
|
|
||||||
/**
 * Cosine similarity that also accepts vectors of different dimensions.
 *
 * Equal-length vectors go straight to standardCosineSimilarity. Otherwise the
 * shorter vector is adapted to the longer one's length (never the other way
 * round) with a strategy chosen by selectOptimalPaddingStrategy, the
 * adaptation is recorded via recordAdaptationStats, and the adapted pair is
 * compared.
 *
 * NOTE(review): the strategy context always carries a's length as
 * sourceDimension and b's length as targetDimension, even when b is the
 * vector being adapted - confirm that is what the strategy selection expects.
 *
 * @param a First vector
 * @param b Second vector
 * @param normalize Optional flag to normalize vectors before comparison (default: false)
 * @param sourceModel Optional identifier for the source model
 * @param targetModel Optional identifier for the target model
 * @param contentType Optional content type for strategy selection (defaults to GENERAL_TEXT)
 * @param performanceProfile Optional performance profile (defaults to BALANCED)
 * @returns the cosine similarity of the (possibly adapted) vectors
 */
export function enhancedCosineSimilarity(
    a: Float32Array,
    b: Float32Array,
    normalize: boolean = false,
    sourceModel?: string,
    targetModel?: string,
    contentType?: ContentType,
    performanceProfile?: PerformanceProfile
): number {
    // Optional up-front normalization of both inputs
    if (normalize) {
        a = normalizeVector(a);
        b = normalizeVector(b);
    }

    // Same dimensions: nothing to adapt
    if (a.length === b.length) {
        return standardCosineSimilarity(a, b);
    }

    debugLog(`Dimension mismatch: ${a.length} vs ${b.length}. Adapting dimensions...`, 'info');

    // Only a cross-model comparison when both models are known and differ
    const isCrossModelComparison = sourceModel !== undefined &&
        targetModel !== undefined &&
        sourceModel !== targetModel;

    const context: StrategySelectionContext = {
        contentType: contentType || ContentType.GENERAL_TEXT,
        performanceProfile: performanceProfile || PerformanceProfile.BALANCED,
        sourceDimension: a.length,
        targetDimension: b.length,
        sourceModel,
        targetModel,
        isCrossModelComparison
    };

    // The shorter vector is always the one that gets adapted
    const adaptingB = a.length > b.length;
    const shorter = adaptingB ? b : a;
    const longer = adaptingB ? a : b;

    debugLog(
        adaptingB
            ? `Adapting embedding B (${b.length}D) to match A (${a.length}D)`
            : `Adapting embedding A (${a.length}D) to match B (${b.length}D)`,
        'debug'
    );

    const adaptOptions: AdaptationOptions = selectOptimalPaddingStrategy(context);
    const adapted = adaptEmbeddingDimensions(shorter, longer.length, adaptOptions);

    // Stats describe the adaptation itself, so the models swap roles when b is adapted
    recordAdaptationStats({
        operation: 'dimension_adaptation',
        sourceModel: adaptingB ? targetModel : sourceModel,
        targetModel: adaptingB ? sourceModel : targetModel,
        sourceDimension: shorter.length,
        targetDimension: longer.length,
        strategy: adaptOptions.strategy
    });

    return adaptingB
        ? standardCosineSimilarity(a, adapted)
        : standardCosineSimilarity(adapted, b);
}
|
||||||
|
|
||||||
|
/**
 * Scales a vector to unit (L2) length.
 *
 * A zero vector, or a vector whose length is already within 1e-6 of 1, is
 * returned as an unmodified copy. The input is never mutated.
 *
 * @param vector The vector to normalize
 * @returns A newly allocated vector
 */
export function normalizeVector(vector: Float32Array): Float32Array {
    // Accumulate in a regular (double precision) number, not in float32.
    const sumOfSquares = vector.reduce((acc, component) => acc + component * component, 0);
    const length = Math.sqrt(sumOfSquares);

    const isZeroVector = length === 0;
    const isAlreadyUnit = Math.abs(length - 1.0) < 1e-6;
    if (isZeroVector || isAlreadyUnit) {
        return new Float32Array(vector);
    }

    return vector.map((component) => component / length);
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Standard cosine similarity for same-dimension vectors
|
* Standard cosine similarity for same-dimension vectors
|
||||||
*/
|
*/
|
||||||
@ -75,28 +185,111 @@ export function selectOptimalEmbedding(embeddings: Array<{
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Adapts an embedding to match target dimensions
|
* Padding strategy options for dimension adaptation
|
||||||
* Uses a simple truncation (if source is larger) or zero-padding (if source is smaller)
|
*/
|
||||||
|
export enum PaddingStrategy {
    /** Simple zero padding (default). */
    ZERO = 'zero',
    /** Padding with the mean value of the source embedding. */
    MEAN = 'mean',
    /** Padding with Gaussian noise based on the source statistics. */
    GAUSSIAN = 'gaussian',
    /** Padding by mirroring existing source values. */
    MIRROR = 'mirror'
}
|
||||||
|
|
||||||
|
/**
 * Options controlling how an embedding is adapted to a different dimension.
 */
export interface AdaptationOptions {
    /** Strategy used to fill the dimensions added when padding. */
    strategy: PaddingStrategy;

    /**
     * Seed for random number generation (gaussian).
     *
     * NOTE(review): the Gaussian padding code visible in this file draws from
     * Math.random() and does not appear to read this field - confirm whether
     * seeding is actually implemented.
     */
    seed?: number;

    /**
     * Variance for gaussian noise. When omitted, the Gaussian padding branch
     * uses the variance of the source embedding, capped at 0.01.
     */
    variance?: number;

    /** Whether to normalize the result to unit length after adaptation. */
    normalize?: boolean;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adapts an embedding to match target dimensions with configurable strategies
|
||||||
*
|
*
|
||||||
* @param sourceEmbedding The original embedding
|
* @param sourceEmbedding The original embedding
|
||||||
* @param targetDimension The desired dimension
|
* @param targetDimension The desired dimension
|
||||||
|
* @param options Configuration options for the adaptation
|
||||||
* @returns A new embedding with the target dimensions
|
* @returns A new embedding with the target dimensions
|
||||||
*/
|
*/
|
||||||
export function adaptEmbeddingDimensions(sourceEmbedding: Float32Array, targetDimension: number): Float32Array {
|
export function adaptEmbeddingDimensions(
|
||||||
|
sourceEmbedding: Float32Array,
|
||||||
|
targetDimension: number,
|
||||||
|
options: AdaptationOptions = { strategy: PaddingStrategy.ZERO, normalize: true }
|
||||||
|
): Float32Array {
|
||||||
const sourceDimension = sourceEmbedding.length;
|
const sourceDimension = sourceEmbedding.length;
|
||||||
|
|
||||||
// If dimensions already match, return the original
|
// If dimensions already match, return a copy of the original
|
||||||
if (sourceDimension === targetDimension) {
|
if (sourceDimension === targetDimension) {
|
||||||
return sourceEmbedding;
|
return new Float32Array(sourceEmbedding);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create a new embedding with target dimensions
|
// Create a new embedding with target dimensions
|
||||||
const adaptedEmbedding = new Float32Array(targetDimension);
|
const adaptedEmbedding = new Float32Array(targetDimension);
|
||||||
|
|
||||||
if (sourceDimension < targetDimension) {
|
if (sourceDimension < targetDimension) {
|
||||||
// If source is smaller, copy all values and pad with zeros
|
// Copy all source values first
|
||||||
adaptedEmbedding.set(sourceEmbedding);
|
adaptedEmbedding.set(sourceEmbedding);
|
||||||
// Rest of the array is already initialized to zeros
|
|
||||||
|
// Apply the selected padding strategy
|
||||||
|
switch (options.strategy) {
|
||||||
|
case PaddingStrategy.ZERO:
|
||||||
|
// Zero padding is already done by default
|
||||||
|
break;
|
||||||
|
|
||||||
|
case PaddingStrategy.MEAN:
|
||||||
|
// Calculate mean of source embedding
|
||||||
|
let sum = 0;
|
||||||
|
for (let i = 0; i < sourceDimension; i++) {
|
||||||
|
sum += sourceEmbedding[i];
|
||||||
|
}
|
||||||
|
const mean = sum / sourceDimension;
|
||||||
|
|
||||||
|
// Fill remaining dimensions with mean value
|
||||||
|
for (let i = sourceDimension; i < targetDimension; i++) {
|
||||||
|
adaptedEmbedding[i] = mean;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case PaddingStrategy.GAUSSIAN:
|
||||||
|
// Calculate mean and standard deviation of source embedding
|
||||||
|
let meanSum = 0;
|
||||||
|
for (let i = 0; i < sourceDimension; i++) {
|
||||||
|
meanSum += sourceEmbedding[i];
|
||||||
|
}
|
||||||
|
const meanValue = meanSum / sourceDimension;
|
||||||
|
|
||||||
|
let varianceSum = 0;
|
||||||
|
for (let i = 0; i < sourceDimension; i++) {
|
||||||
|
varianceSum += Math.pow(sourceEmbedding[i] - meanValue, 2);
|
||||||
|
}
|
||||||
|
const variance = options.variance ?? Math.min(0.01, varianceSum / sourceDimension);
|
||||||
|
const stdDev = Math.sqrt(variance);
|
||||||
|
|
||||||
|
// Fill remaining dimensions with Gaussian noise
|
||||||
|
for (let i = sourceDimension; i < targetDimension; i++) {
|
||||||
|
// Box-Muller transform for Gaussian distribution
|
||||||
|
const u1 = Math.random();
|
||||||
|
const u2 = Math.random();
|
||||||
|
const z0 = Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math.PI * u2);
|
||||||
|
|
||||||
|
adaptedEmbedding[i] = meanValue + stdDev * z0;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case PaddingStrategy.MIRROR:
|
||||||
|
// Mirror existing values for padding
|
||||||
|
for (let i = sourceDimension; i < targetDimension; i++) {
|
||||||
|
// Cycle through source values in reverse order
|
||||||
|
const mirrorIndex = sourceDimension - 1 - ((i - sourceDimension) % sourceDimension);
|
||||||
|
adaptedEmbedding[i] = sourceEmbedding[mirrorIndex];
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
// Default to zero padding
|
||||||
|
break;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// If source is larger, truncate to target dimension
|
// If source is larger, truncate to target dimension
|
||||||
for (let i = 0; i < targetDimension; i++) {
|
for (let i = 0; i < targetDimension; i++) {
|
||||||
@ -104,17 +297,9 @@ export function adaptEmbeddingDimensions(sourceEmbedding: Float32Array, targetDi
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Normalize the adapted embedding to maintain unit length
|
// Normalize the adapted embedding if requested
|
||||||
let magnitude = 0;
|
if (options.normalize) {
|
||||||
for (let i = 0; i < targetDimension; i++) {
|
return normalizeVector(adaptedEmbedding);
|
||||||
magnitude += adaptedEmbedding[i] * adaptedEmbedding[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
magnitude = Math.sqrt(magnitude);
|
|
||||||
if (magnitude > 0) {
|
|
||||||
for (let i = 0; i < targetDimension; i++) {
|
|
||||||
adaptedEmbedding[i] /= magnitude;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return adaptedEmbedding;
|
return adaptedEmbedding;
|
||||||
@ -133,3 +318,567 @@ export function embeddingToBuffer(embedding: Float32Array): Buffer {
|
|||||||
/**
 * Reinterprets the bytes of a Buffer as a Float32Array of `dimension` elements.
 *
 * A Float32Array view can only start at a byte offset that is a multiple of 4.
 * Node.js Buffers (pooled allocations, slices of larger buffers) give no such
 * guarantee, and constructing the view over an unaligned offset throws a
 * RangeError. When the offset is aligned the result is a zero-copy view that
 * shares memory with `buffer`; otherwise the required bytes are first copied
 * into a fresh, aligned allocation.
 *
 * @param buffer Raw bytes holding 32-bit floats in platform byte order
 * @param dimension Number of floats to read
 * @returns The embedding values
 * @throws RangeError if an unaligned `buffer` holds fewer than `dimension * 4` bytes
 */
export function bufferToEmbedding(buffer: Buffer, dimension: number): Float32Array {
    const bytesPerFloat = Float32Array.BYTES_PER_ELEMENT;

    // Fast path: aligned data can be viewed in place without copying.
    if (buffer.byteOffset % bytesPerFloat === 0) {
        return new Float32Array(buffer.buffer, buffer.byteOffset, dimension);
    }

    const requiredBytes = dimension * bytesPerFloat;
    if (buffer.byteLength < requiredBytes) {
        throw new RangeError(
            `Buffer of ${buffer.byteLength} bytes is too small for ${dimension} float32 values`
        );
    }

    // Unaligned: copy into a new ArrayBuffer, which always starts at offset 0.
    const aligned = new Uint8Array(requiredBytes);
    aligned.set(buffer.subarray(0, requiredBytes));
    return new Float32Array(aligned.buffer);
}
|
||||||
|
|
||||||
|
/**
 * Metrics that can be selected when computing vector similarity.
 */
export enum SimilarityMetric {
    /** Standard cosine similarity. */
    COSINE = 'cosine',
    /** Simple dot product (assumes normalized vectors). */
    DOT_PRODUCT = 'dot_product',
    /** Dot product + cosine hybrid. */
    HYBRID = 'hybrid',
    /** Dimension-aware similarity that factors in dimension differences. */
    DIM_AWARE = 'dimension_aware',
    /** Combined score from multiple metrics. */
    ENSEMBLE = 'ensemble'
}
|
||||||
|
|
||||||
|
/**
 * Options accepted by the similarity functions in this module.
 */
export interface SimilarityOptions {
    /** Which similarity metric to compute. */
    metric: SimilarityMetric;

    /** Normalization flag handed to the metric implementations. */
    normalize?: boolean;

    /** Per-metric weights used by the ensemble metric. */
    ensembleWeights?: Partial<Record<SimilarityMetric, number>>;

    /** Penalty factor for dimension differences (0 to 1). */
    dimensionPenalty?: number;

    /** Source model identifier. */
    sourceModel?: string;

    /** Target model identifier. */
    targetModel?: string;

    /** Type of content being compared. */
    contentType?: ContentType;

    /** Performance requirements. */
    performanceProfile?: PerformanceProfile;
}
|
||||||
|
|
||||||
|
/**
 * Dispatches to the similarity implementation selected by `options.metric`.
 *
 * Cosine similarity is used both for SimilarityMetric.COSINE and for any
 * unrecognized metric value.
 *
 * @param a First vector
 * @param b Second vector
 * @param options Similarity calculation options (defaults to plain cosine)
 * @returns The similarity score produced by the selected metric
 */
export function computeSimilarity(
    a: Float32Array,
    b: Float32Array,
    options: SimilarityOptions = { metric: SimilarityMetric.COSINE }
): number {
    // Normalization is opt-in.
    const shouldNormalize = options.normalize ?? false;

    if (options.metric === SimilarityMetric.DOT_PRODUCT) {
        // Dot product assumes normalized vectors for proper similarity measurement
        const left = shouldNormalize ? normalizeVector(a) : a;
        const right = shouldNormalize ? normalizeVector(b) : b;
        return computeDotProduct(left, right, options);
    }

    if (options.metric === SimilarityMetric.HYBRID) {
        // Weighted blend of cosine similarity and dot product
        return hybridSimilarity(a, b, shouldNormalize, options);
    }

    if (options.metric === SimilarityMetric.DIM_AWARE) {
        // Cosine similarity scaled by a penalty for differing dimensions
        return dimensionAwareSimilarity(
            a, b, shouldNormalize,
            options.dimensionPenalty ?? 0.1,
            options.contentType,
            options.performanceProfile
        );
    }

    if (options.metric === SimilarityMetric.ENSEMBLE) {
        // Weighted combination of several metrics
        return ensembleSimilarity(a, b, options);
    }

    // SimilarityMetric.COSINE and anything unrecognized
    return cosineSimilarity(
        a, b, shouldNormalize,
        options.sourceModel, options.targetModel,
        options.contentType, options.performanceProfile
    );
}
|
||||||
|
|
||||||
|
/**
 * Computes the dot product of two vectors.
 *
 * When the vectors differ in length, the shorter one is first adapted to the
 * length of the longer one. If `options` is given, the padding strategy is
 * chosen by selectOptimalPaddingStrategy; otherwise the defaults of
 * adaptEmbeddingDimensions apply.
 *
 * @param a First vector
 * @param b Second vector
 * @param options Optional context used to pick the padding strategy
 * @returns The sum of element-wise products
 */
export function computeDotProduct(
    a: Float32Array,
    b: Float32Array,
    options?: Pick<SimilarityOptions, 'contentType' | 'performanceProfile' | 'sourceModel' | 'targetModel'>
): number {
    let left = a;
    let right = b;

    if (left.length !== right.length) {
        // Leaving this undefined makes adaptEmbeddingDimensions fall back to its own default options.
        let adaptOptions: AdaptationOptions | undefined;

        if (options) {
            const { sourceModel, targetModel } = options;
            const context: StrategySelectionContext = {
                contentType: options.contentType || ContentType.GENERAL_TEXT,
                performanceProfile: options.performanceProfile || PerformanceProfile.BALANCED,
                sourceDimension: left.length,
                targetDimension: right.length,
                sourceModel,
                targetModel,
                isCrossModelComparison: sourceModel !== targetModel &&
                    sourceModel !== undefined &&
                    targetModel !== undefined
            };
            adaptOptions = selectOptimalPaddingStrategy(context);
        }

        // Always grow the shorter vector to the size of the longer one.
        if (left.length > right.length) {
            right = adaptEmbeddingDimensions(right, left.length, adaptOptions);
        } else {
            left = adaptEmbeddingDimensions(left, right.length, adaptOptions);
        }
    }

    return left.reduce((sum, value, index) => sum + value * right[index], 0);
}
|
||||||
|
|
||||||
|
/**
 * Hybrid similarity: a weighted blend of cosine similarity (70%) and the dot
 * product of the unit-length vectors (30%).
 *
 * The dot-product term is only comparable to a cosine score when both inputs
 * have unit length, so they are always normalized for that term regardless of
 * the `normalize` flag. Previously the condition was inverted and the raw,
 * un-normalized vectors were used exactly when `normalize` was true.
 *
 * @param a First vector
 * @param b Second vector
 * @param normalize Normalization flag forwarded to cosineSimilarity
 * @param options Optional context (models, content type, performance profile)
 *                used when vectors of different dimensions must be adapted
 * @returns 0.7 * cosine + 0.3 * dot
 */
export function hybridSimilarity(
    a: Float32Array,
    b: Float32Array,
    normalize: boolean = false,
    options?: Pick<SimilarityOptions, 'contentType' | 'performanceProfile' | 'sourceModel' | 'targetModel'>
): number {
    // Cosine term, computed with the full context
    const cosine = cosineSimilarity(
        a, b, normalize,
        options?.sourceModel, options?.targetModel,
        options?.contentType, options?.performanceProfile
    );

    // normalizeVector returns copies, so the caller's vectors are left untouched.
    const unitA = normalizeVector(a);
    const unitB = normalizeVector(b);

    // computeDotProduct itself adapts mismatched dimensions (using the same
    // strategy selection when options are present), so no separate adaptation
    // step is needed here.
    const dot = computeDotProduct(unitA, unitB, options);

    // Weighted average - giving more weight to cosine
    return 0.7 * cosine + 0.3 * dot;
}
|
||||||
|
|
||||||
|
/**
 * Cosine similarity that is scaled down when the two vectors have different
 * dimensions.
 *
 * For equal dimensions the plain cosine similarity is returned. Otherwise the
 * score is multiplied by `1 - dimensionPenalty * (1 - min/max)`, where min and
 * max are the smaller and larger of the two dimensions.
 *
 * @param a First vector
 * @param b Second vector
 * @param normalize Normalization flag forwarded to cosineSimilarity
 * @param dimensionPenalty Penalty factor for dimension differences (0 to 1)
 * @param contentType Content type forwarded to cosineSimilarity
 * @param performanceProfile Performance profile forwarded to cosineSimilarity
 * @returns The (possibly penalized) cosine similarity
 */
export function dimensionAwareSimilarity(
    a: Float32Array,
    b: Float32Array,
    normalize: boolean = false,
    dimensionPenalty: number = 0.1,
    contentType?: ContentType,
    performanceProfile?: PerformanceProfile
): number {
    // No model identifiers are available here, hence the two undefined arguments.
    const baseSimilarity = cosineSimilarity(
        a, b, normalize, undefined, undefined, contentType, performanceProfile
    );

    const smallerDim = Math.min(a.length, b.length);
    const largerDim = Math.max(a.length, b.length);

    // Matching dimensions: nothing to penalize
    if (smallerDim === largerDim) {
        return baseSimilarity;
    }

    // The further apart the dimensions, the smaller the ratio and the larger the penalty
    const dimRatio = smallerDim / largerDim;
    const scale = 1 - dimensionPenalty * (1 - dimRatio);

    return baseSimilarity * scale;
}
|
||||||
|
|
||||||
|
/**
 * Ensemble similarity: the weighted average of several similarity metrics.
 *
 * Weights come from `options.ensembleWeights`; when absent, cosine (0.6),
 * hybrid (0.3) and dimension-aware (0.1) are combined. Entries with a missing
 * or non-positive weight are ignored, as is an entry for the ensemble metric
 * itself (to avoid recursion).
 *
 * Every sub-metric receives the caller's full options (models, content type,
 * performance profile, dimension penalty, normalize). Previously only `metric`
 * and `normalize` were forwarded, so that context was silently dropped.
 *
 * @param a First vector
 * @param b Second vector
 * @param options Similarity options, including the optional ensemble weights
 * @returns The weighted average, or plain cosine similarity if no weight is usable
 */
export function ensembleSimilarity(
    a: Float32Array,
    b: Float32Array,
    options: SimilarityOptions
): number {
    // Default weights if not provided
    const weights = options.ensembleWeights ?? {
        [SimilarityMetric.COSINE]: 0.6,
        [SimilarityMetric.HYBRID]: 0.3,
        [SimilarityMetric.DIM_AWARE]: 0.1
    };

    let totalWeight = 0;
    let weightedSum = 0;

    for (const [metricKey, weight] of Object.entries(weights)) {
        const metric = metricKey as SimilarityMetric;

        // Skip unusable weights, and the ensemble itself to avoid recursion
        if (!weight || weight <= 0 || metric === SimilarityMetric.ENSEMBLE) {
            continue;
        }

        // Same options as the caller supplied, only the metric is swapped
        const similarity = computeSimilarity(a, b, { ...options, metric });

        weightedSum += similarity * weight;
        totalWeight += weight;
    }

    if (totalWeight > 0) {
        // Normalize by total weight
        return weightedSum / totalWeight;
    }

    // No usable weights: fall back to cosine similarity with the caller's context
    return cosineSimilarity(
        a, b, options.normalize,
        options.sourceModel, options.targetModel,
        options.contentType, options.performanceProfile
    );
}
|
||||||
|
|
||||||
|
/**
 * Debug settings for the vector utilities.
 */
export interface DebugConfig {
    /** Master switch: nothing is logged or recorded while this is false. */
    enabled: boolean;

    /**
     * Minimum level a message needs in order to be logged
     * (ordered debug < info < warning < error).
     */
    logLevel: 'info' | 'debug' | 'warning' | 'error';

    /** Whether adaptation statistics are collected (requires `enabled`). */
    recordStats: boolean;
}

/**
 * Global debug settings; the object is exported so its fields can be changed
 * at runtime. Everything is switched off by default.
 */
export const vectorDebugConfig: DebugConfig = {
    enabled: false,
    logLevel: 'info',
    recordStats: false
};
|
||||||
|
|
||||||
|
/**
 * One recorded vector operation, kept for quality auditing.
 */
export interface AdaptationStats {
    /** Time the entry was recorded, as returned by Date.now(). */
    timestamp: number;
    /** Name of the operation that was performed. */
    operation: string;
    /** Identifier of the model the adapted embedding came from, if known. */
    sourceModel?: string;
    /** Identifier of the model it was adapted towards, if known. */
    targetModel?: string;
    /** Dimension before adaptation. */
    sourceDimension: number;
    /** Dimension after adaptation. */
    targetDimension: number;
    /** Padding strategy that was applied. */
    strategy: string;
    /** Resulting similarity score, when available. */
    similarity?: number;
}

/** Collected adaptation statistics; initially empty. */
export const adaptationStats: Array<AdaptationStats> = [];
|
||||||
|
|
||||||
|
/**
 * Writes a message to the console when debugging is enabled and the message
 * level is at least the configured minimum level.
 *
 * @param message Text to log
 * @param level Severity of the message (defaults to 'info')
 */
function debugLog(
    message: string,
    level: 'info' | 'debug' | 'warning' | 'error' = 'info'
): void {
    if (!vectorDebugConfig.enabled) {
        return;
    }

    // Numeric rank of each level, lowest severity first
    const severity = { 'debug': 0, 'info': 1, 'warning': 2, 'error': 3 };

    const meetsThreshold = severity[level] >= severity[vectorDebugConfig.logLevel];
    if (!meetsThreshold) {
        return;
    }

    const tag = `[VectorUtils:${level.toUpperCase()}]`;

    // Route each level to the matching console method
    if (level === 'error') {
        console.error(tag, message);
    } else if (level === 'warning') {
        console.warn(tag, message);
    } else if (level === 'debug') {
        console.debug(tag, message);
    } else {
        console.log(tag, message);
    }
}
|
||||||
|
|
||||||
|
/**
 * Appends a timestamped entry to the adaptation statistics, provided both
 * debugging and statistics recording are switched on.
 *
 * @param stats The entry to record; the timestamp is filled in here
 */
function recordAdaptationStats(stats: Omit<AdaptationStats, 'timestamp'>): void {
    if (!vectorDebugConfig.enabled || !vectorDebugConfig.recordStats) {
        return;
    }

    const entry: AdaptationStats = { ...stats, timestamp: Date.now() };
    adaptationStats.push(entry);

    // Cap the history at 1000 entries by dropping the oldest one
    if (adaptationStats.length > 1000) {
        adaptationStats.shift();
    }
}
|
||||||
|
|
||||||
|
/**
 * Kinds of content distinguished when choosing an embedding adaptation strategy.
 */
export enum ContentType {
    /** Ordinary text; also used as the default. */
    GENERAL_TEXT = 'general_text',
    /** Source code. */
    CODE = 'code',
    /** Structured data. */
    STRUCTURED_DATA = 'structured_data',
    /** Mathematical content. */
    MATHEMATICAL = 'mathematical',
    /** A mix of several kinds of content. */
    MIXED = 'mixed'
}
|
||||||
|
|
||||||
|
/**
 * Trade-off between quality and speed used when selecting an adaptation strategy.
 */
export enum PerformanceProfile {
    /** Prioritize similarity quality over speed. */
    MAXIMUM_QUALITY = 'maximum_quality',
    /** Balance quality and performance. */
    BALANCED = 'balanced',
    /** Prioritize speed over quality. */
    MAXIMUM_SPEED = 'maximum_speed'
}
|
||||||
|
|
||||||
|
/**
 * Inputs considered when selecting a padding strategy.
 */
export interface StrategySelectionContext {
    /** Type of content being compared. */
    contentType?: ContentType;

    /** Performance requirements. */
    performanceProfile?: PerformanceProfile;

    /** Source embedding dimension. */
    sourceDimension: number;

    /** Target embedding dimension. */
    targetDimension: number;

    /** Source model identifier. */
    sourceModel?: string;

    /** Target model identifier. */
    targetModel?: string;

    /** Whether high precision is needed. */
    isHighPrecisionRequired?: boolean;

    /** Whether the comparison spans different models. */
    isCrossModelComparison?: boolean;

    /**
     * Custom dimension ratio threshold: a smaller/larger dimension ratio below
     * this value is treated as a significant dimension difference.
     */
    dimensionRatio?: number;
}
|
||||||
|
|
||||||
|
/**
 * Selects the padding strategy (and related options) for adapting one
 * embedding to the dimension of another.
 *
 * The choice is made in three steps, each able to override the previous one:
 *  1. a base strategy derived from the content type,
 *  2. an override driven by the performance profile when the dimensions differ
 *     significantly (smaller/larger ratio below `context.dimensionRatio`,
 *     default 0.5),
 *  3. zero padding whenever the dimensions differ by 5 or less.
 *
 * The returned options always request normalization.
 *
 * @param context Selection context parameters
 * @returns The most appropriate padding strategy and options
 */
export function selectOptimalPaddingStrategy(
    context: StrategySelectionContext
): AdaptationOptions {
    const {
        contentType = ContentType.GENERAL_TEXT,
        performanceProfile = PerformanceProfile.BALANCED,
        sourceDimension,
        targetDimension,
        isHighPrecisionRequired = false,
        isCrossModelComparison = false
    } = context;

    // Ratio of the smaller to the larger dimension, in (0, 1]
    const dimRatio = Math.min(sourceDimension, targetDimension) /
        Math.max(sourceDimension, targetDimension);

    // Default options
    const options: AdaptationOptions = {
        strategy: PaddingStrategy.ZERO,
        normalize: true
    };

    // `??` rather than `||`: an explicit threshold of 0 must be honoured
    // (it means no ratio is ever "significant") instead of being replaced by 0.5.
    const hasSignificantDimDifference = dimRatio < (context.dimensionRatio ?? 0.5);

    // Step 1: base strategy from the content type
    switch (contentType) {
        case ContentType.CODE:
            // Code benefits from structural patterns
            options.strategy = PaddingStrategy.MIRROR;
            break;

        case ContentType.STRUCTURED_DATA:
            // Structured data works well with mean-value padding
            options.strategy = PaddingStrategy.MEAN;
            break;

        case ContentType.MATHEMATICAL:
            // Mathematical content benefits from gaussian noise to maintain statistical properties
            options.strategy = PaddingStrategy.GAUSSIAN;
            options.variance = 0.005; // Lower variance for mathematical precision
            break;

        case ContentType.MIXED:
            // For mixed content, choose based on performance profile
            if (performanceProfile === PerformanceProfile.MAXIMUM_QUALITY) {
                options.strategy = PaddingStrategy.GAUSSIAN;
            } else if (performanceProfile === PerformanceProfile.MAXIMUM_SPEED) {
                options.strategy = PaddingStrategy.ZERO;
            } else {
                options.strategy = PaddingStrategy.MEAN;
            }
            break;

        case ContentType.GENERAL_TEXT:
        default:
            // For general text, base decision on other factors
            if (isHighPrecisionRequired) {
                options.strategy = PaddingStrategy.GAUSSIAN;
            } else if (isCrossModelComparison) {
                options.strategy = PaddingStrategy.MEAN;
            } else {
                options.strategy = PaddingStrategy.ZERO;
            }
            break;
    }

    // Step 2: performance-profile override for significant dimension differences
    // (the BALANCED profile keeps the content-type choice)
    if (hasSignificantDimDifference) {
        if (performanceProfile === PerformanceProfile.MAXIMUM_QUALITY) {
            // For quality, use gaussian noise for better statistical matching
            options.strategy = PaddingStrategy.GAUSSIAN;
            // Adjust variance based on dimension ratio
            options.variance = Math.min(0.01, 0.02 * dimRatio);

            debugLog(`Significant dimension difference detected: ${sourceDimension} vs ${targetDimension}. ` +
                `Ratio: ${dimRatio.toFixed(2)}. Using Gaussian strategy.`, 'warning');
        } else if (performanceProfile === PerformanceProfile.MAXIMUM_SPEED) {
            // For speed, stick with zero padding
            options.strategy = PaddingStrategy.ZERO;
        }
    }

    // Step 3: always use zero padding for trivial dimension differences
    // (e.g. 1536 vs 1537) for performance reasons
    if (Math.abs(sourceDimension - targetDimension) <= 5) {
        options.strategy = PaddingStrategy.ZERO;
    }

    debugLog(`Selected padding strategy: ${options.strategy} for ` +
        `content type: ${contentType}, performance profile: ${performanceProfile}`, 'debug');

    return options;
}
|
||||||
|
|
||||||
|
/**
 * Classifies a note's content type from its MIME type and, optionally, from
 * the content itself.
 *
 * The MIME type is examined first (case-insensitively); content is only
 * inspected when the MIME type gives no answer and `content` is non-empty.
 *
 * Fixes relative to the earlier implementation:
 *  - TeX detection no longer uses a bare substring test for "tex", which also
 *    matched every `text/*` MIME type and classified ordinary text and HTML
 *    notes as mathematical.
 *  - MIME types ending in `+json` are now reported as structured data; the
 *    check used to sit behind the broader "json" code check and could never run.
 *
 * @param mime MIME type of the note
 * @param content Optional note content used for pattern-based detection
 * @returns The detected content type (general text if nothing matches)
 */
export function detectContentType(mime: string, content?: string): ContentType {
    const normalizedMime = mime.toLowerCase();
    const mimeHasAny = (tokens: string[]): boolean =>
        tokens.some((token) => normalizedMime.includes(token));

    // Structured-syntax suffix; must be tested before the generic "json" token below
    if (normalizedMime.endsWith('+json')) {
        return ContentType.STRUCTURED_DATA;
    }

    if (mimeHasAny(['code', 'javascript', 'typescript', 'python', 'java', 'c++', 'json'])) {
        return ContentType.CODE;
    }

    if (mimeHasAny(['xml', 'csv', 'sql'])) {
        return ContentType.STRUCTURED_DATA;
    }

    // "tex", "latex" or "stex" as a whole token (e.g. application/x-tex,
    // text/x-latex, text/x-stex) - but not the "tex" inside "text/..."
    if (/\b(?:la|s)?tex\b/.test(normalizedMime) || normalizedMime.includes('mathml')) {
        return ContentType.MATHEMATICAL;
    }

    // If we have content, we can do deeper analysis
    if (content) {
        // Detect code by looking for common patterns
        const codePatterns = [
            /function\s+\w+\s*\(.*\)\s*{/, // JavaScript/TypeScript function
            /def\s+\w+\s*\(.*\):/, // Python function
            /class\s+\w+(\s+extends\s+\w+)?(\s+implements\s+\w+)?\s*{/, // Java/TypeScript class
            /import\s+.*\s+from\s+['"]/, // JS/TS import
            /^\s*```\w+/m // Markdown code block
        ];

        if (codePatterns.some((pattern) => pattern.test(content))) {
            return ContentType.CODE;
        }

        // Detect structured data
        const structuredPatterns = [
            /^\s*[{\[]/, // JSON-like start
            /^\s*<\?xml/, // XML declaration
            /^\s*<[a-z]+>/i, // HTML/XML tag
            /^\s*(\w+,)+\w+$/m, // CSV-like
            /CREATE\s+TABLE|SELECT\s+.*\s+FROM/i // SQL
        ];

        if (structuredPatterns.some((pattern) => pattern.test(content))) {
            return ContentType.STRUCTURED_DATA;
        }

        // Detect mathematical content
        const mathPatterns = [
            /\$\$.*\$\$/s, // LaTeX block
            /\\begin{equation}/, // LaTeX equation environment
            /\\sum|\\int|\\frac|\\sqrt/ // Common LaTeX math commands
        ];

        if (mathPatterns.some((pattern) => pattern.test(content))) {
            return ContentType.MATHEMATICAL;
        }

        // Mixed content: code fences plus substantial text outside of them.
        // NOTE(review): content matching any code pattern has already been
        // returned as CODE above, so this is only reached for fences without a
        // language tag; the former "code pattern and more than 100 words"
        // clause could never be true here and has been dropped. Confirm
        // whether mixed content should take precedence over CODE.
        const textOutsideFences = content.replace(/```.*?```/gs, '');
        if (content.includes('```') && textOutsideFences.length > 200) {
            return ContentType.MIXED;
        }
    }

    // Default to general text
    return ContentType.GENERAL_TEXT;
}
|
||||||
|
|||||||
@ -565,7 +565,7 @@ class IndexService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Get Note IDs to search, optionally filtered by branch
|
// Get Note IDs to search, optionally filtered by branch
|
||||||
let similarNotes = [];
|
let similarNotes: { noteId: string; title: string; similarity: number; contentType?: string }[] = [];
|
||||||
|
|
||||||
// Check if we need to restrict search to a specific branch
|
// Check if we need to restrict search to a specific branch
|
||||||
if (contextNoteId) {
|
if (contextNoteId) {
|
||||||
@ -593,6 +593,9 @@ class IndexService {
|
|||||||
// Get embeddings for all notes in the branch
|
// Get embeddings for all notes in the branch
|
||||||
const config = provider.getConfig();
|
const config = provider.getConfig();
|
||||||
|
|
||||||
|
// Import the ContentType detection from vector utils
|
||||||
|
const { ContentType, detectContentType, cosineSimilarity } = await import('./embeddings/vector_utils.js');
|
||||||
|
|
||||||
for (const noteId of branchNoteIds) {
|
for (const noteId of branchNoteIds) {
|
||||||
const noteEmbedding = await vectorStore.getEmbeddingForNote(
|
const noteEmbedding = await vectorStore.getEmbeddingForNote(
|
||||||
noteId,
|
noteId,
|
||||||
@ -601,14 +604,29 @@ class IndexService {
|
|||||||
);
|
);
|
||||||
|
|
||||||
if (noteEmbedding) {
|
if (noteEmbedding) {
|
||||||
const similarity = vectorStore.cosineSimilarity(embedding, noteEmbedding.embedding);
|
// Get the note to determine its content type
|
||||||
if (similarity >= this.defaultSimilarityThreshold) {
|
|
||||||
const note = becca.getNote(noteId);
|
const note = becca.getNote(noteId);
|
||||||
if (note) {
|
if (note) {
|
||||||
|
// Detect content type from mime type
|
||||||
|
const contentType = detectContentType(note.mime, '');
|
||||||
|
|
||||||
|
// Use content-aware similarity calculation
|
||||||
|
const similarity = cosineSimilarity(
|
||||||
|
embedding,
|
||||||
|
noteEmbedding.embedding,
|
||||||
|
true, // normalize
|
||||||
|
config.model, // source model
|
||||||
|
noteEmbedding.providerId, // target model (use providerId)
|
||||||
|
contentType, // content type for padding strategy
|
||||||
|
undefined // use default BALANCED performance profile
|
||||||
|
);
|
||||||
|
|
||||||
|
if (similarity >= this.defaultSimilarityThreshold) {
|
||||||
similarNotes.push({
|
similarNotes.push({
|
||||||
noteId,
|
noteId,
|
||||||
title: note.title,
|
title: note.title,
|
||||||
similarity
|
similarity,
|
||||||
|
contentType: contentType.toString()
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -622,7 +640,7 @@ class IndexService {
|
|||||||
} else {
|
} else {
|
||||||
// Search across all notes
|
// Search across all notes
|
||||||
const config = provider.getConfig();
|
const config = provider.getConfig();
|
||||||
similarNotes = await vectorStore.findSimilarNotes(
|
const results = await vectorStore.findSimilarNotes(
|
||||||
embedding,
|
embedding,
|
||||||
provider.name,
|
provider.name,
|
||||||
config.model,
|
config.model,
|
||||||
@ -631,14 +649,17 @@ class IndexService {
|
|||||||
);
|
);
|
||||||
|
|
||||||
// Enhance results with note titles
|
// Enhance results with note titles
|
||||||
return similarNotes.map(result => {
|
similarNotes = results.map(result => {
|
||||||
const note = becca.getNote(result.noteId);
|
const note = becca.getNote(result.noteId);
|
||||||
return {
|
return {
|
||||||
noteId: result.noteId,
|
noteId: result.noteId,
|
||||||
title: note ? note.title : 'Unknown Note',
|
title: note ? note.title : 'Unknown Note',
|
||||||
similarity: result.similarity
|
similarity: result.similarity,
|
||||||
|
contentType: result.contentType
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
|
||||||
|
return similarNotes;
|
||||||
}
|
}
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
log.error(`Error finding similar notes: ${error.message || "Unknown error"}`);
|
log.error(`Error finding similar notes: ${error.message || "Unknown error"}`);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user