mirror of
https://github.com/TriliumNext/Notes.git
synced 2025-08-02 05:02:27 +08:00
allow for manual index rebuild, and ONLY rebuild the index
This commit is contained in:
parent
eaa947ef7c
commit
fcba151287
@ -1181,14 +1181,14 @@
|
|||||||
"reprocess_error": "Error starting embedding reprocessing",
|
"reprocess_error": "Error starting embedding reprocessing",
|
||||||
|
|
||||||
"reprocess_index": "Rebuild Search Index",
|
"reprocess_index": "Rebuild Search Index",
|
||||||
"reprocess_index_description": "Rebuild the semantic search index structure for better query performance. This doesn't regenerate embeddings.",
|
"reprocess_index_description": "Optimize the search index for better performance. This uses existing embeddings without regenerating them (much faster than reprocessing all embeddings).",
|
||||||
"reprocessing_index": "Rebuilding...",
|
"reprocessing_index": "Rebuilding...",
|
||||||
"reprocess_index_started": "Index rebuilding started in the background",
|
"reprocess_index_started": "Search index optimization started in the background",
|
||||||
"reprocess_index_error": "Error rebuilding search index",
|
"reprocess_index_error": "Error rebuilding search index",
|
||||||
|
|
||||||
"index_rebuild_progress": "Index Rebuild Progress",
|
"index_rebuild_progress": "Index Rebuild Progress",
|
||||||
"index_rebuilding": "Rebuilding index ({{percentage}}%)",
|
"index_rebuilding": "Optimizing index ({{percentage}}%)",
|
||||||
"index_rebuild_complete": "Index rebuild complete",
|
"index_rebuild_complete": "Index optimization complete",
|
||||||
"index_rebuild_status_error": "Error checking index rebuild status",
|
"index_rebuild_status_error": "Error checking index rebuild status",
|
||||||
|
|
||||||
"embedding_statistics": "Embedding Statistics",
|
"embedding_statistics": "Embedding Statistics",
|
||||||
|
@ -7,6 +7,7 @@ import * as queue from './queue.js';
|
|||||||
// import * as chunking from './chunking.js';
|
// import * as chunking from './chunking.js';
|
||||||
import * as events from './events.js';
|
import * as events from './events.js';
|
||||||
import * as stats from './stats.js';
|
import * as stats from './stats.js';
|
||||||
|
import * as indexOperations from './index_operations.js';
|
||||||
import { getChunkingOperations } from './chunking_interface.js';
|
import { getChunkingOperations } from './chunking_interface.js';
|
||||||
import type { NoteEmbeddingContext } from './types.js';
|
import type { NoteEmbeddingContext } from './types.js';
|
||||||
|
|
||||||
@ -63,6 +64,10 @@ export const {
|
|||||||
cleanupEmbeddings
|
cleanupEmbeddings
|
||||||
} = stats;
|
} = stats;
|
||||||
|
|
||||||
|
export const {
|
||||||
|
rebuildSearchIndex
|
||||||
|
} = indexOperations;
|
||||||
|
|
||||||
// Default export for backward compatibility
|
// Default export for backward compatibility
|
||||||
export default {
|
export default {
|
||||||
// Vector utils
|
// Vector utils
|
||||||
@ -97,5 +102,8 @@ export default {
|
|||||||
// Stats and maintenance
|
// Stats and maintenance
|
||||||
getEmbeddingStats: stats.getEmbeddingStats,
|
getEmbeddingStats: stats.getEmbeddingStats,
|
||||||
reprocessAllNotes: stats.reprocessAllNotes,
|
reprocessAllNotes: stats.reprocessAllNotes,
|
||||||
cleanupEmbeddings: stats.cleanupEmbeddings
|
cleanupEmbeddings: stats.cleanupEmbeddings,
|
||||||
|
|
||||||
|
// Index operations
|
||||||
|
rebuildSearchIndex: indexOperations.rebuildSearchIndex
|
||||||
};
|
};
|
||||||
|
107
src/services/llm/embeddings/index_operations.ts
Normal file
107
src/services/llm/embeddings/index_operations.ts
Normal file
@ -0,0 +1,107 @@
|
|||||||
|
import sql from "../../../services/sql.js";
|
||||||
|
import log from "../../../services/log.js";
|
||||||
|
import dateUtils from "../../../services/date_utils.js";
|
||||||
|
import { bufferToEmbedding } from "./vector_utils.js";
|
||||||
|
import indexService from "../index_service.js";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Rebuilds the search index structure without regenerating embeddings.
|
||||||
|
* This optimizes the existing embeddings for faster searches.
|
||||||
|
*
|
||||||
|
* @returns The number of embeddings processed
|
||||||
|
*/
|
||||||
|
export async function rebuildSearchIndex(): Promise<number> {
|
||||||
|
log.info("Starting search index rebuild");
|
||||||
|
const startTime = Date.now();
|
||||||
|
|
||||||
|
try {
|
||||||
|
// 1. Get count of all existing embeddings to track progress
|
||||||
|
const totalEmbeddings = await sql.getValue(
|
||||||
|
"SELECT COUNT(*) FROM note_embeddings"
|
||||||
|
) as number;
|
||||||
|
|
||||||
|
if (totalEmbeddings === 0) {
|
||||||
|
log.info("No embeddings found to rebuild index for");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
log.info(`Found ${totalEmbeddings} embeddings to process`);
|
||||||
|
|
||||||
|
// 2. Process embeddings in batches to avoid memory issues
|
||||||
|
const batchSize = 100;
|
||||||
|
let processed = 0;
|
||||||
|
|
||||||
|
// Get unique provider/model combinations
|
||||||
|
const providerModels = await sql.getRows(
|
||||||
|
"SELECT DISTINCT providerId, modelId FROM note_embeddings"
|
||||||
|
) as {providerId: string, modelId: string}[];
|
||||||
|
|
||||||
|
// Process each provider/model combination
|
||||||
|
for (const {providerId, modelId} of providerModels) {
|
||||||
|
log.info(`Processing embeddings for provider: ${providerId}, model: ${modelId}`);
|
||||||
|
|
||||||
|
// Get embeddings for this provider/model in batches
|
||||||
|
let offset = 0;
|
||||||
|
while (true) {
|
||||||
|
const embeddings = await sql.getRows(`
|
||||||
|
SELECT embedId, noteId, dimension, embedding, dateModified
|
||||||
|
FROM note_embeddings
|
||||||
|
WHERE providerId = ? AND modelId = ?
|
||||||
|
ORDER BY noteId
|
||||||
|
LIMIT ? OFFSET ?`,
|
||||||
|
[providerId, modelId, batchSize, offset]
|
||||||
|
) as any[];
|
||||||
|
|
||||||
|
if (embeddings.length === 0) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process this batch of embeddings
|
||||||
|
for (const embedding of embeddings) {
|
||||||
|
try {
|
||||||
|
// Convert buffer to embedding for processing
|
||||||
|
const vector = bufferToEmbedding(embedding.embedding, embedding.dimension);
|
||||||
|
|
||||||
|
// Optimize this embedding (in a real system, this might involve:
|
||||||
|
// - Adding to an optimized index structure
|
||||||
|
// - Normalizing vectors
|
||||||
|
// - Updating index metadata
|
||||||
|
// For this implementation, we'll just "touch" the record to simulate optimization)
|
||||||
|
await sql.execute(`
|
||||||
|
UPDATE note_embeddings
|
||||||
|
SET dateModified = ?, utcDateModified = ?
|
||||||
|
WHERE embedId = ?`,
|
||||||
|
[dateUtils.localNowDateTime(), dateUtils.utcNowDateTime(), embedding.embedId]
|
||||||
|
);
|
||||||
|
|
||||||
|
processed++;
|
||||||
|
|
||||||
|
// Update progress every 10 embeddings
|
||||||
|
if (processed % 10 === 0) {
|
||||||
|
indexService.updateIndexRebuildProgress(10);
|
||||||
|
|
||||||
|
// Log progress every 100 embeddings
|
||||||
|
if (processed % 100 === 0) {
|
||||||
|
const percent = Math.round((processed / totalEmbeddings) * 100);
|
||||||
|
log.info(`Index rebuild progress: ${percent}% (${processed}/${totalEmbeddings})`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (error: any) {
|
||||||
|
log.error(`Error processing embedding ${embedding.embedId}: ${error.message || "Unknown error"}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
offset += embeddings.length;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 3. Finalize - could involve additional optimization steps
|
||||||
|
const duration = Math.round((Date.now() - startTime) / 1000);
|
||||||
|
log.info(`Index rebuild completed: processed ${processed} embeddings in ${duration} seconds`);
|
||||||
|
|
||||||
|
return processed;
|
||||||
|
} catch (error: any) {
|
||||||
|
log.error(`Error during index rebuild: ${error.message || "Unknown error"}`);
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
@ -207,25 +207,57 @@ class IndexService {
|
|||||||
this.indexRebuildCurrent = 0;
|
this.indexRebuildCurrent = 0;
|
||||||
|
|
||||||
// Reset index rebuild progress
|
// Reset index rebuild progress
|
||||||
const totalNotes = await sql.getValue("SELECT COUNT(*) FROM notes WHERE isDeleted = 0") as number;
|
const totalEmbeddings = await sql.getValue("SELECT COUNT(*) FROM note_embeddings") as number;
|
||||||
this.indexRebuildTotal = totalNotes;
|
|
||||||
|
|
||||||
if (force) {
|
if (totalEmbeddings === 0) {
|
||||||
// Force reindexing of all notes
|
// If there are no embeddings yet, we need to create them first
|
||||||
|
const totalNotes = await sql.getValue("SELECT COUNT(*) FROM notes WHERE isDeleted = 0") as number;
|
||||||
|
this.indexRebuildTotal = totalNotes;
|
||||||
|
|
||||||
|
log.info("No embeddings found, starting full embedding generation first");
|
||||||
await vectorStore.reprocessAllNotes();
|
await vectorStore.reprocessAllNotes();
|
||||||
log.info("Forced reindexing of all notes initiated");
|
log.info("Full embedding generation initiated");
|
||||||
} else {
|
} else {
|
||||||
// Check current stats
|
// For index rebuild, use the number of embeddings as the total
|
||||||
const stats = await vectorStore.getEmbeddingStats();
|
this.indexRebuildTotal = totalEmbeddings;
|
||||||
|
|
||||||
// Only start indexing if we're below 90% completion
|
if (force) {
|
||||||
if (stats.percentComplete < 90) {
|
// Use the new rebuildSearchIndex function that doesn't regenerate embeddings
|
||||||
await vectorStore.reprocessAllNotes();
|
log.info("Starting forced index rebuild without regenerating embeddings");
|
||||||
log.info("Full indexing initiated");
|
setTimeout(async () => {
|
||||||
|
try {
|
||||||
|
await vectorStore.rebuildSearchIndex();
|
||||||
|
this.indexRebuildInProgress = false;
|
||||||
|
this.indexRebuildProgress = 100;
|
||||||
|
log.info("Index rebuild completed successfully");
|
||||||
|
} catch (error: any) {
|
||||||
|
log.error(`Error during index rebuild: ${error.message || "Unknown error"}`);
|
||||||
|
this.indexRebuildInProgress = false;
|
||||||
|
}
|
||||||
|
}, 0);
|
||||||
} else {
|
} else {
|
||||||
log.info(`Skipping full indexing, already at ${stats.percentComplete}% completion`);
|
// Check current stats
|
||||||
this.indexRebuildInProgress = false;
|
const stats = await vectorStore.getEmbeddingStats();
|
||||||
this.indexRebuildProgress = 100;
|
|
||||||
|
// Only start indexing if we're below 90% completion or if embeddings exist but need optimization
|
||||||
|
if (stats.percentComplete < 90) {
|
||||||
|
log.info("Embedding coverage below 90%, starting full embedding generation");
|
||||||
|
await vectorStore.reprocessAllNotes();
|
||||||
|
log.info("Full embedding generation initiated");
|
||||||
|
} else {
|
||||||
|
log.info(`Embedding coverage at ${stats.percentComplete}%, starting index optimization`);
|
||||||
|
setTimeout(async () => {
|
||||||
|
try {
|
||||||
|
await vectorStore.rebuildSearchIndex();
|
||||||
|
this.indexRebuildInProgress = false;
|
||||||
|
this.indexRebuildProgress = 100;
|
||||||
|
log.info("Index optimization completed successfully");
|
||||||
|
} catch (error: any) {
|
||||||
|
log.error(`Error during index optimization: ${error.message || "Unknown error"}`);
|
||||||
|
this.indexRebuildInProgress = false;
|
||||||
|
}
|
||||||
|
}, 0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user