Notes/apps/server/src/services/llm/embeddings/index_operations.ts

import sql from "../../../services/sql.js";
import log from "../../../services/log.js";
import dateUtils from "../../../services/date_utils.js";
import { bufferToEmbedding } from "./vector_utils.js";
import indexService from "../index_service.js";

/**
 * Rebuilds the search index structure without regenerating embeddings.
 * This optimizes the existing embeddings for faster searches.
 *
 * @returns The number of embeddings processed
 */
export async function rebuildSearchIndex(): Promise<number> {
    log.info("Starting search index rebuild");
    const startTime = Date.now();

    try {
        // 1. Get count of all existing embeddings to track progress
        const totalEmbeddings = await sql.getValue(
            "SELECT COUNT(*) FROM note_embeddings"
        ) as number;

        if (totalEmbeddings === 0) {
            log.info("No embeddings found to rebuild index for");
            return 0;
        }

        log.info(`Found ${totalEmbeddings} embeddings to process`);

        // 2. Process embeddings in batches to avoid memory issues
        const batchSize = 100;
        let processed = 0;

        // Get unique provider/model combinations
        const providerModels = await sql.getRows(
            "SELECT DISTINCT providerId, modelId FROM note_embeddings"
        ) as {providerId: string, modelId: string}[];

        // Process each provider/model combination
        for (const {providerId, modelId} of providerModels) {
            log.info(`Processing embeddings for provider: ${providerId}, model: ${modelId}`);

            // Get embeddings for this provider/model in batches
            let offset = 0;
            while (true) {
                const embeddings = await sql.getRows(`
                    SELECT embedId, noteId, dimension, embedding, dateModified
                    FROM note_embeddings
                    WHERE providerId = ? AND modelId = ?
                    ORDER BY noteId
                    LIMIT ? OFFSET ?`,
                    [providerId, modelId, batchSize, offset]
                ) as any[];

                if (embeddings.length === 0) {
                    break;
                }

                // Process this batch of embeddings
                for (const embedding of embeddings) {
                    try {
                        // Convert buffer to embedding for processing
                        const vector = bufferToEmbedding(embedding.embedding, embedding.dimension);

                        // Optimize this embedding (in a real system, this might involve:
                        // - Adding to an optimized index structure
                        // - Normalizing vectors
                        // - Updating index metadata
                        // For this implementation, we'll just "touch" the record to simulate optimization)
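                        //
                        // As an illustration only (an assumption, not what this code does),
                        // the "normalize vectors" step above could be sketched roughly as:
                        //   const norm = Math.sqrt(vector.reduce((sum, v) => sum + v * v, 0));
                        //   const normalized = norm > 0 ? vector.map(v => v / norm) : vector;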
                        await sql.execute(`
                            UPDATE note_embeddings
                            SET dateModified = ?, utcDateModified = ?
                            WHERE embedId = ?`,
                            [dateUtils.localNowDateTime(), dateUtils.utcNowDateTime(), embedding.embedId]
                        );

                        processed++;

                        // Update progress every 10 embeddings
                        if (processed % 10 === 0) {
                            indexService.updateIndexRebuildProgress(10);

                            // Log progress every 100 embeddings
                            if (processed % 100 === 0) {
                                const percent = Math.round((processed / totalEmbeddings) * 100);
                                log.info(`Index rebuild progress: ${percent}% (${processed}/${totalEmbeddings})`);
                            }
                        }
                    } catch (error: any) {
                        log.error(`Error processing embedding ${embedding.embedId}: ${error.message || "Unknown error"}`);
                    }
                }

                offset += embeddings.length;
            }
        }

        // 3. Finalize - could involve additional optimization steps
        const duration = Math.round((Date.now() - startTime) / 1000);
        log.info(`Index rebuild completed: processed ${processed} embeddings in ${duration} seconds`);

        return processed;
    } catch (error: any) {
        log.error(`Error during index rebuild: ${error.message || "Unknown error"}`);
        throw error;
    }
}
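
// Usage sketch (a hypothetical caller shown for illustration; the real call
// site is not part of this file, and the import path is an assumption):
//
//   import { rebuildSearchIndex } from "./index_operations.js";
//
//   const processed = await rebuildSearchIndex();
//   log.info(`Search index rebuilt for ${processed} embeddings`);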