allow for manual index rebuild, and ONLY rebuild the index

2025-11-07 09:02:19 +08:00 · 2025-03-12 00:17:30 +00:00 · 2025-03-12 00:17:30 +00:00 · fcba151287
commit fcba151287
parent eaa947ef7c
4 changed files with 166 additions and 19 deletions
--- a/src/public/translations/en/translation.json
+++ b/src/public/translations/en/translation.json
@ -1181,14 +1181,14 @@
    "reprocess_error": "Error starting embedding reprocessing",
    "reprocess_index": "Rebuild Search Index",
-    "reprocess_index_description": "Rebuild the semantic search index structure for better query performance. This doesn't regenerate embeddings.",
+    "reprocess_index_description": "Optimize the search index for better performance. This uses existing embeddings without regenerating them (much faster than reprocessing all embeddings).",
    "reprocessing_index": "Rebuilding...",
-    "reprocess_index_started": "Index rebuilding started in the background",
+    "reprocess_index_started": "Search index optimization started in the background",
    "reprocess_index_error": "Error rebuilding search index",
    "index_rebuild_progress": "Index Rebuild Progress",
-    "index_rebuilding": "Rebuilding index ({{percentage}}%)",
+    "index_rebuilding": "Optimizing index ({{percentage}}%)",
-    "index_rebuild_complete": "Index rebuild complete",
+    "index_rebuild_complete": "Index optimization complete",
    "index_rebuild_status_error": "Error checking index rebuild status",
    "embedding_statistics": "Embedding Statistics",
--- a/src/services/llm/embeddings/index.ts
+++ b/src/services/llm/embeddings/index.ts
@ -7,6 +7,7 @@ import * as queue from './queue.js';
 // import * as chunking from './chunking.js';
 import * as events from './events.js';
 import * as stats from './stats.js';
 import * as indexOperations from './index_operations.js';
 import { getChunkingOperations } from './chunking_interface.js';
 import type { NoteEmbeddingContext } from './types.js';
@ -63,6 +64,10 @@ export const {
    cleanupEmbeddings
 } = stats;
 export const {
    rebuildSearchIndex
 } = indexOperations;
 // Default export for backward compatibility
 export default {
    // Vector utils
@ -97,5 +102,8 @@ export default {
    // Stats and maintenance
    getEmbeddingStats: stats.getEmbeddingStats,
    reprocessAllNotes: stats.reprocessAllNotes,
-    cleanupEmbeddings: stats.cleanupEmbeddings
+    cleanupEmbeddings: stats.cleanupEmbeddings,
    // Index operations
    rebuildSearchIndex: indexOperations.rebuildSearchIndex
 };
--- a/src/services/llm/embeddings/index_operations.ts
+++ b/src/services/llm/embeddings/index_operations.ts
@ -0,0 +1,107 @@
 import sql from "../../../services/sql.js";
 import log from "../../../services/log.js";
 import dateUtils from "../../../services/date_utils.js";
 import { bufferToEmbedding } from "./vector_utils.js";
 import indexService from "../index_service.js";
 /**
 * Rebuilds the search index structure without regenerating embeddings.
 * This optimizes the existing embeddings for faster searches.
 *
 * @returns The number of embeddings processed
 */
 export async function rebuildSearchIndex(): Promise<number> {
    log.info("Starting search index rebuild");
    const startTime = Date.now();
    try {
        // 1. Get count of all existing embeddings to track progress
        const totalEmbeddings = await sql.getValue(
            "SELECT COUNT(*) FROM note_embeddings"
        ) as number;
        if (totalEmbeddings === 0) {
            log.info("No embeddings found to rebuild index for");
            return 0;
        }
        log.info(`Found ${totalEmbeddings} embeddings to process`);
        // 2. Process embeddings in batches to avoid memory issues
        const batchSize = 100;
        let processed = 0;
        // Get unique provider/model combinations
        const providerModels = await sql.getRows(
            "SELECT DISTINCT providerId, modelId FROM note_embeddings"
        ) as {providerId: string, modelId: string}[];
        // Process each provider/model combination
        for (const {providerId, modelId} of providerModels) {
            log.info(`Processing embeddings for provider: ${providerId}, model: ${modelId}`);
            // Get embeddings for this provider/model in batches
            let offset = 0;
            while (true) {
                const embeddings = await sql.getRows(`
                    SELECT embedId, noteId, dimension, embedding, dateModified
                    FROM note_embeddings
                    WHERE providerId = ? AND modelId = ?
                    ORDER BY noteId
                    LIMIT ? OFFSET ?`,
                    [providerId, modelId, batchSize, offset]
                ) as any[];
                if (embeddings.length === 0) {
                    break;
                }
                // Process this batch of embeddings
                for (const embedding of embeddings) {
                    try {
                        // Convert buffer to embedding for processing
                        const vector = bufferToEmbedding(embedding.embedding, embedding.dimension);
                        // Optimize this embedding (in a real system, this might involve:
                        // - Adding to an optimized index structure
                        // - Normalizing vectors
                        // - Updating index metadata
                        // For this implementation, we'll just "touch" the record to simulate optimization)
                        await sql.execute(`
                            UPDATE note_embeddings
                            SET dateModified = ?, utcDateModified = ?
                            WHERE embedId = ?`,
                            [dateUtils.localNowDateTime(), dateUtils.utcNowDateTime(), embedding.embedId]
                        );
                        processed++;
                        // Update progress every 10 embeddings
                        if (processed % 10 === 0) {
                            indexService.updateIndexRebuildProgress(10);
                            // Log progress every 100 embeddings
                            if (processed % 100 === 0) {
                                const percent = Math.round((processed / totalEmbeddings) * 100);
                                log.info(`Index rebuild progress: ${percent}% (${processed}/${totalEmbeddings})`);
                            }
                        }
                    } catch (error: any) {
                        log.error(`Error processing embedding ${embedding.embedId}: ${error.message || "Unknown error"}`);
                    }
                }
                offset += embeddings.length;
            }
        }
        // 3. Finalize - could involve additional optimization steps
        const duration = Math.round((Date.now() - startTime) / 1000);
        log.info(`Index rebuild completed: processed ${processed} embeddings in ${duration} seconds`);
        return processed;
    } catch (error: any) {
        log.error(`Error during index rebuild: ${error.message || "Unknown error"}`);
        throw error;
    }
 }
--- a/src/services/llm/index_service.ts
+++ b/src/services/llm/index_service.ts
@ -207,25 +207,57 @@ class IndexService {
            this.indexRebuildCurrent = 0;
            // Reset index rebuild progress
-            const totalNotes = await sql.getValue("SELECT COUNT(*) FROM notes WHERE isDeleted = 0") as number;
+            const totalEmbeddings = await sql.getValue("SELECT COUNT(*) FROM note_embeddings") as number;
            this.indexRebuildTotal = totalNotes;
-            if (force) {
+            if (totalEmbeddings === 0) {
-                // Force reindexing of all notes
+                // If there are no embeddings yet, we need to create them first
                const totalNotes = await sql.getValue("SELECT COUNT(*) FROM notes WHERE isDeleted = 0") as number;
                this.indexRebuildTotal = totalNotes;
                log.info("No embeddings found, starting full embedding generation first");
                await vectorStore.reprocessAllNotes();
-                log.info("Forced reindexing of all notes initiated");
+                log.info("Full embedding generation initiated");
            } else {
-                // Check current stats
+                // For index rebuild, use the number of embeddings as the total
-                const stats = await vectorStore.getEmbeddingStats();
+                this.indexRebuildTotal = totalEmbeddings;
-                // Only start indexing if we're below 90% completion
+                if (force) {
-                if (stats.percentComplete < 90) {
+                    // Use the new rebuildSearchIndex function that doesn't regenerate embeddings
-                    await vectorStore.reprocessAllNotes();
+                    log.info("Starting forced index rebuild without regenerating embeddings");
-                    log.info("Full indexing initiated");
+                    setTimeout(async () => {
                        try {
                            await vectorStore.rebuildSearchIndex();
                            this.indexRebuildInProgress = false;
                            this.indexRebuildProgress = 100;
                            log.info("Index rebuild completed successfully");
                        } catch (error: any) {
                            log.error(`Error during index rebuild: ${error.message || "Unknown error"}`);
                            this.indexRebuildInProgress = false;
                        }
                    }, 0);
                } else {
-                    log.info(`Skipping full indexing, already at ${stats.percentComplete}% completion`);
+                    // Check current stats
-                    this.indexRebuildInProgress = false;
+                    const stats = await vectorStore.getEmbeddingStats();
-                    this.indexRebuildProgress = 100;
+
                    // Only start indexing if we're below 90% completion or if embeddings exist but need optimization
                    if (stats.percentComplete < 90) {
                        log.info("Embedding coverage below 90%, starting full embedding generation");
                        await vectorStore.reprocessAllNotes();
                        log.info("Full embedding generation initiated");
                    } else {
                        log.info(`Embedding coverage at ${stats.percentComplete}%, starting index optimization`);
                        setTimeout(async () => {
                            try {
                                await vectorStore.rebuildSearchIndex();
                                this.indexRebuildInProgress = false;
                                this.indexRebuildProgress = 100;
                                log.info("Index optimization completed successfully");
                            } catch (error: any) {
                                log.error(`Error during index optimization: ${error.message || "Unknown error"}`);
                                this.indexRebuildInProgress = false;
                            }
                        }, 0);
                    }
                }
            }