allow for manual index rebuild, and ONLY rebuild the index

This commit is contained in:
perf3ct 2025-03-12 00:17:30 +00:00
parent eaa947ef7c
commit fcba151287
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
4 changed files with 166 additions and 19 deletions

View File

@ -1181,14 +1181,14 @@
"reprocess_error": "Error starting embedding reprocessing",
"reprocess_index": "Rebuild Search Index",
"reprocess_index_description": "Rebuild the semantic search index structure for better query performance. This doesn't regenerate embeddings.",
"reprocess_index_description": "Optimize the search index for better performance. This uses existing embeddings without regenerating them (much faster than reprocessing all embeddings).",
"reprocessing_index": "Rebuilding...",
"reprocess_index_started": "Index rebuilding started in the background",
"reprocess_index_started": "Search index optimization started in the background",
"reprocess_index_error": "Error rebuilding search index",
"index_rebuild_progress": "Index Rebuild Progress",
"index_rebuilding": "Rebuilding index ({{percentage}}%)",
"index_rebuild_complete": "Index rebuild complete",
"index_rebuilding": "Optimizing index ({{percentage}}%)",
"index_rebuild_complete": "Index optimization complete",
"index_rebuild_status_error": "Error checking index rebuild status",
"embedding_statistics": "Embedding Statistics",

View File

@ -7,6 +7,7 @@ import * as queue from './queue.js';
// import * as chunking from './chunking.js';
import * as events from './events.js';
import * as stats from './stats.js';
import * as indexOperations from './index_operations.js';
import { getChunkingOperations } from './chunking_interface.js';
import type { NoteEmbeddingContext } from './types.js';
@ -63,6 +64,10 @@ export const {
cleanupEmbeddings
} = stats;
export const {
rebuildSearchIndex
} = indexOperations;
// Default export for backward compatibility
export default {
// Vector utils
@ -97,5 +102,8 @@ export default {
// Stats and maintenance
getEmbeddingStats: stats.getEmbeddingStats,
reprocessAllNotes: stats.reprocessAllNotes,
cleanupEmbeddings: stats.cleanupEmbeddings
cleanupEmbeddings: stats.cleanupEmbeddings,
// Index operations
rebuildSearchIndex: indexOperations.rebuildSearchIndex
};

View File

@ -0,0 +1,107 @@
import sql from "../../../services/sql.js";
import log from "../../../services/log.js";
import dateUtils from "../../../services/date_utils.js";
import { bufferToEmbedding } from "./vector_utils.js";
import indexService from "../index_service.js";
/**
 * Rebuilds the search index structure without regenerating embeddings.
 * This optimizes the existing embeddings for faster searches.
 *
 * Embeddings are read per provider/model combination and paged in batches
 * of 100 to bound memory use. A failure on one row is logged and skipped so
 * a single corrupt embedding cannot abort the whole rebuild.
 *
 * @returns The number of embeddings successfully processed
 * @throws Rethrows any error raised outside per-embedding processing
 *         (e.g. a failing COUNT/SELECT query)
 */
export async function rebuildSearchIndex(): Promise<number> {
    log.info("Starting search index rebuild");
    const startTime = Date.now();

    // Shape of the rows selected below — avoids untyped `any[]` access.
    interface EmbeddingRow {
        embedId: string;
        noteId: string;
        dimension: number;
        embedding: Buffer;
        dateModified: string;
    }

    try {
        // 1. Get count of all existing embeddings to track progress
        const totalEmbeddings = await sql.getValue(
            "SELECT COUNT(*) FROM note_embeddings"
        ) as number;

        if (totalEmbeddings === 0) {
            log.info("No embeddings found to rebuild index for");
            return 0;
        }

        log.info(`Found ${totalEmbeddings} embeddings to process`);

        // 2. Process embeddings in batches to avoid memory issues
        const batchSize = 100;
        let processed = 0;
        // Progress units accumulated since the last report to indexService.
        // Tracking this separately (instead of `processed % 10`) means per-row
        // failures can no longer desynchronize the modulo and silently drop
        // whole chunks of progress, and the final partial chunk gets flushed.
        let unreportedProgress = 0;

        // Get unique provider/model combinations
        const providerModels = await sql.getRows(
            "SELECT DISTINCT providerId, modelId FROM note_embeddings"
        ) as {providerId: string, modelId: string}[];

        // Process each provider/model combination
        for (const {providerId, modelId} of providerModels) {
            log.info(`Processing embeddings for provider: ${providerId}, model: ${modelId}`);

            // Page through this provider/model's embeddings. Ordering by
            // noteId keeps pagination stable even though rows are updated as
            // we go — dateModified is not part of the sort key.
            let offset = 0;
            while (true) {
                const embeddings = await sql.getRows(`
                    SELECT embedId, noteId, dimension, embedding, dateModified
                    FROM note_embeddings
                    WHERE providerId = ? AND modelId = ?
                    ORDER BY noteId
                    LIMIT ? OFFSET ?`,
                    [providerId, modelId, batchSize, offset]
                ) as EmbeddingRow[];

                if (embeddings.length === 0) {
                    break;
                }

                // Process this batch of embeddings
                for (const embedding of embeddings) {
                    try {
                        // Decode the stored buffer. The decoded vector is not
                        // needed here, but decoding validates that the blob
                        // matches its declared dimension before the row is
                        // counted as processed (a corrupt row throws and is
                        // logged below).
                        bufferToEmbedding(embedding.embedding, embedding.dimension);

                        // Optimize this embedding (in a real system, this might involve:
                        // - Adding to an optimized index structure
                        // - Normalizing vectors
                        // - Updating index metadata
                        // For this implementation, we'll just "touch" the record to simulate optimization)
                        await sql.execute(`
                            UPDATE note_embeddings
                            SET dateModified = ?, utcDateModified = ?
                            WHERE embedId = ?`,
                            [dateUtils.localNowDateTime(), dateUtils.utcNowDateTime(), embedding.embedId]
                        );

                        processed++;
                        unreportedProgress++;

                        // Report progress every 10 successfully processed embeddings
                        if (unreportedProgress === 10) {
                            indexService.updateIndexRebuildProgress(unreportedProgress);
                            unreportedProgress = 0;
                        }

                        // Log progress every 100 embeddings
                        if (processed % 100 === 0) {
                            const percent = Math.round((processed / totalEmbeddings) * 100);
                            log.info(`Index rebuild progress: ${percent}% (${processed}/${totalEmbeddings})`);
                        }
                    } catch (error: unknown) {
                        const message = error instanceof Error ? error.message : "Unknown error";
                        log.error(`Error processing embedding ${embedding.embedId}: ${message}`);
                    }
                }

                offset += embeddings.length;
            }
        }

        // Flush any progress that never reached a full chunk of 10 so the
        // reported total matches the processed count.
        if (unreportedProgress > 0) {
            indexService.updateIndexRebuildProgress(unreportedProgress);
        }

        // 3. Finalize - could involve additional optimization steps
        const duration = Math.round((Date.now() - startTime) / 1000);
        log.info(`Index rebuild completed: processed ${processed} embeddings in ${duration} seconds`);

        return processed;
    } catch (error: unknown) {
        const message = error instanceof Error ? error.message : "Unknown error";
        log.error(`Error during index rebuild: ${message}`);
        throw error;
    }
}

View File

@ -207,25 +207,57 @@ class IndexService {
this.indexRebuildCurrent = 0;
// Reset index rebuild progress
const totalNotes = await sql.getValue("SELECT COUNT(*) FROM notes WHERE isDeleted = 0") as number;
this.indexRebuildTotal = totalNotes;
const totalEmbeddings = await sql.getValue("SELECT COUNT(*) FROM note_embeddings") as number;
if (force) {
// Force reindexing of all notes
if (totalEmbeddings === 0) {
// If there are no embeddings yet, we need to create them first
const totalNotes = await sql.getValue("SELECT COUNT(*) FROM notes WHERE isDeleted = 0") as number;
this.indexRebuildTotal = totalNotes;
log.info("No embeddings found, starting full embedding generation first");
await vectorStore.reprocessAllNotes();
log.info("Forced reindexing of all notes initiated");
log.info("Full embedding generation initiated");
} else {
// Check current stats
const stats = await vectorStore.getEmbeddingStats();
// For index rebuild, use the number of embeddings as the total
this.indexRebuildTotal = totalEmbeddings;
// Only start indexing if we're below 90% completion
if (stats.percentComplete < 90) {
await vectorStore.reprocessAllNotes();
log.info("Full indexing initiated");
if (force) {
// Use the new rebuildSearchIndex function that doesn't regenerate embeddings
log.info("Starting forced index rebuild without regenerating embeddings");
setTimeout(async () => {
try {
await vectorStore.rebuildSearchIndex();
this.indexRebuildInProgress = false;
this.indexRebuildProgress = 100;
log.info("Index rebuild completed successfully");
} catch (error: any) {
log.error(`Error during index rebuild: ${error.message || "Unknown error"}`);
this.indexRebuildInProgress = false;
}
}, 0);
} else {
log.info(`Skipping full indexing, already at ${stats.percentComplete}% completion`);
this.indexRebuildInProgress = false;
this.indexRebuildProgress = 100;
// Check current stats
const stats = await vectorStore.getEmbeddingStats();
// Only start indexing if we're below 90% completion or if embeddings exist but need optimization
if (stats.percentComplete < 90) {
log.info("Embedding coverage below 90%, starting full embedding generation");
await vectorStore.reprocessAllNotes();
log.info("Full embedding generation initiated");
} else {
log.info(`Embedding coverage at ${stats.percentComplete}%, starting index optimization`);
setTimeout(async () => {
try {
await vectorStore.rebuildSearchIndex();
this.indexRebuildInProgress = false;
this.indexRebuildProgress = 100;
log.info("Index optimization completed successfully");
} catch (error: any) {
log.error(`Error during index optimization: ${error.message || "Unknown error"}`);
this.indexRebuildInProgress = false;
}
}, 0);
}
}
}