import sql from "../../../services/sql.js"; import log from "../../../services/log.js"; import dateUtils from "../../../services/date_utils.js"; import { bufferToEmbedding } from "./vector_utils.js"; import indexService from "../index_service.js"; /** * Rebuilds the search index structure without regenerating embeddings. * This optimizes the existing embeddings for faster searches. * * @returns The number of embeddings processed */ export async function rebuildSearchIndex(): Promise { log.info("Starting search index rebuild"); const startTime = Date.now(); try { // 1. Get count of all existing embeddings to track progress const totalEmbeddings = await sql.getValue( "SELECT COUNT(*) FROM note_embeddings" ) as number; if (totalEmbeddings === 0) { log.info("No embeddings found to rebuild index for"); return 0; } log.info(`Found ${totalEmbeddings} embeddings to process`); // 2. Process embeddings in batches to avoid memory issues const batchSize = 100; let processed = 0; // Get unique provider/model combinations const providerModels = await sql.getRows( "SELECT DISTINCT providerId, modelId FROM note_embeddings" ) as {providerId: string, modelId: string}[]; // Process each provider/model combination for (const {providerId, modelId} of providerModels) { log.info(`Processing embeddings for provider: ${providerId}, model: ${modelId}`); // Get embeddings for this provider/model in batches let offset = 0; while (true) { const embeddings = await sql.getRows(` SELECT embedId, noteId, dimension, embedding, dateModified FROM note_embeddings WHERE providerId = ? AND modelId = ? ORDER BY noteId LIMIT ? OFFSET ?`, [providerId, modelId, batchSize, offset] ) as any[]; if (embeddings.length === 0) { break; } // Process this batch of embeddings for (const embedding of embeddings) { try { // Convert buffer to embedding for processing const vector = bufferToEmbedding(embedding.embedding, embedding.dimension); // Optimize this embedding (in a real system, this might involve: // - Adding to an optimized index structure // - Normalizing vectors // - Updating index metadata // For this implementation, we'll just "touch" the record to simulate optimization) await sql.execute(` UPDATE note_embeddings SET dateModified = ?, utcDateModified = ? WHERE embedId = ?`, [dateUtils.localNowDateTime(), dateUtils.utcNowDateTime(), embedding.embedId] ); processed++; // Update progress every 10 embeddings if (processed % 10 === 0) { indexService.updateIndexRebuildProgress(10); // Log progress every 100 embeddings if (processed % 100 === 0) { const percent = Math.round((processed / totalEmbeddings) * 100); log.info(`Index rebuild progress: ${percent}% (${processed}/${totalEmbeddings})`); } } } catch (error: any) { log.error(`Error processing embedding ${embedding.embedId}: ${error.message || "Unknown error"}`); } } offset += embeddings.length; } } // 3. Finalize - could involve additional optimization steps const duration = Math.round((Date.now() - startTime) / 1000); log.info(`Index rebuild completed: processed ${processed} embeddings in ${duration} seconds`); return processed; } catch (error: any) { log.error(`Error during index rebuild: ${error.message || "Unknown error"}`); throw error; } }