From f47b070f0f3ab782b4a60b6dce88af2fa29f0bc4 Mon Sep 17 00:00:00 2001 From: perf3ct Date: Tue, 11 Mar 2025 20:22:01 +0000 Subject: [PATCH] I think this works to handle failed embeddings --- .../type_widgets/options/ai_settings.ts | 247 +++++++++-- src/public/translations/en/translation.json | 4 + src/routes/api/embeddings.ts | 59 ++- src/routes/routes.ts | 3 + src/services/llm/embeddings/vector_store.ts | 402 ++++++++++++++++-- 5 files changed, 642 insertions(+), 73 deletions(-) diff --git a/src/public/app/widgets/type_widgets/options/ai_settings.ts b/src/public/app/widgets/type_widgets/options/ai_settings.ts index 7a7da2daf..6b9519eb4 100644 --- a/src/public/app/widgets/type_widgets/options/ai_settings.ts +++ b/src/public/app/widgets/type_widgets/options/ai_settings.ts @@ -30,6 +30,21 @@ interface EmbeddingStats { } } +// Interface for failed embedding notes +interface FailedEmbeddingNotes { + success: boolean; + failedNotes: Array<{ + noteId: string; + title?: string; + operation: string; + attempts: number; + lastAttempt: string; + error: string; + failureType: string; + chunks: number; + }>; +} + export default class AiSettingsWidget extends OptionsWidget { private statsRefreshInterval: NodeJS.Timeout | null = null; private readonly STATS_REFRESH_INTERVAL = 5000; // 5 seconds @@ -227,6 +242,16 @@ export default class AiSettingsWidget extends OptionsWidget { + + +
+ +
+
+
${t("ai_llm.no_failed_embeddings")}
+
+
+
`); @@ -416,6 +441,54 @@ export default class AiSettingsWidget extends OptionsWidget { return this.$widget; } + optionsLoaded(options: OptionMap) { + if (!this.$widget) return; + + this.setCheckboxState(this.$widget.find('.ai-enabled'), options.aiEnabled || 'false'); + this.setCheckboxState(this.$widget.find('.ollama-enabled'), options.ollamaEnabled || 'false'); + + this.$widget.find('.ai-provider-precedence').val(options.aiProviderPrecedence || 'openai,anthropic,ollama'); + this.$widget.find('.ai-temperature').val(options.aiTemperature || '0.7'); + this.$widget.find('.ai-system-prompt').val(options.aiSystemPrompt || ''); + + this.$widget.find('.openai-api-key').val(options.openaiApiKey || ''); + this.$widget.find('.openai-default-model').val(options.openaiDefaultModel || 'gpt-4o'); + this.$widget.find('.openai-base-url').val(options.openaiBaseUrl || 'https://api.openai.com/v1'); + + this.$widget.find('.anthropic-api-key').val(options.anthropicApiKey || ''); + this.$widget.find('.anthropic-default-model').val(options.anthropicDefaultModel || 'claude-3-opus-20240229'); + this.$widget.find('.anthropic-base-url').val(options.anthropicBaseUrl || 'https://api.anthropic.com/v1'); + + this.$widget.find('.ollama-base-url').val(options.ollamaBaseUrl || 'http://localhost:11434'); + this.$widget.find('.ollama-default-model').val(options.ollamaDefaultModel || 'llama3'); + this.$widget.find('.ollama-embedding-model').val(options.ollamaEmbeddingModel || 'nomic-embed-text'); + + // Load embedding options + this.$widget.find('.embedding-default-provider').val(options.embeddingsDefaultProvider || 'openai'); + this.setCheckboxState(this.$widget.find('.embedding-auto-update-enabled'), options.embeddingAutoUpdateEnabled || 'true'); + this.$widget.find('.embedding-batch-size').val(options.embeddingBatchSize || '10'); + this.$widget.find('.embedding-update-interval').val(options.embeddingUpdateInterval || '5000'); + this.$widget.find('.embedding-default-dimension').val(options.embeddingDefaultDimension || '1536'); + + this.updateAiSectionVisibility(); + } + + updateAiSectionVisibility() { + if (!this.$widget) return; + + const aiEnabled = this.$widget.find('.ai-enabled').prop('checked'); + this.$widget.find('.ai-providers-section').toggle(aiEnabled); + this.$widget.find('.ai-provider').toggle(aiEnabled); + this.$widget.find('.embedding-section').toggle(aiEnabled); + + // Start or stop polling based on visibility + if (aiEnabled && this.$widget.find('.embedding-section').is(':visible')) { + this.startStatsPolling(); + } else { + this.stopStatsPolling(); + } + } + /** * Start automatic polling for embedding statistics */ @@ -429,6 +502,9 @@ export default class AiSettingsWidget extends OptionsWidget { if (this.$widget && this.$widget.is(':visible') && this.$widget.find('.embedding-section').is(':visible')) { await this.refreshEmbeddingStats(true); + + // Also update failed embeddings list periodically + await this.updateFailedEmbeddingsList(); } }, this.STATS_REFRESH_INTERVAL); } @@ -443,6 +519,62 @@ export default class AiSettingsWidget extends OptionsWidget { } } + // Clean up when the widget is removed + cleanup() { + this.stopStatsPolling(); + super.cleanup(); + } + + /** + * Get embedding stats from the server + */ + async getEmbeddingStats(): Promise { + try { + return await server.get('embeddings/stats') as EmbeddingStats; + } catch (error) { + console.error('Error fetching embedding stats:', error); + return null; + } + } + + /** + * Get failed embedding notes from the server + */ + async getFailedEmbeddingNotes(): Promise { + try { + return await server.get('embeddings/failed') as FailedEmbeddingNotes; + } catch (error) { + console.error('Error fetching failed embedding notes:', error); + return null; + } + } + + /** + * Retry a specific failed embedding + */ + async retryFailedEmbedding(noteId: string): Promise { + try { + const result = await server.post(`embeddings/retry/${noteId}`) as {success: boolean}; + return result.success; + } catch (error) { + console.error('Error retrying failed embedding:', error); + return false; + } + } + + /** + * Retry all failed embeddings + */ + async retryAllFailedEmbeddings(): Promise { + try { + const result = await server.post('embeddings/retry-all-failed') as {success: boolean}; + return result.success; + } catch (error) { + console.error('Error retrying all failed embeddings:', error); + return false; + } + } + async refreshEmbeddingStats(silent = false) { if (!this.$widget) return; @@ -455,7 +587,7 @@ export default class AiSettingsWidget extends OptionsWidget { $refreshButton.text(t("ai_llm.refreshing")); } - const response = await server.get('embeddings/stats'); + const response = await this.getEmbeddingStats(); if (response && response.success) { const stats = response.stats; @@ -498,6 +630,11 @@ export default class AiSettingsWidget extends OptionsWidget { $progressBar.removeClass('progress-bar-striped progress-bar-animated bg-info'); $progressBar.addClass('bg-success'); } + + // Update failed embeddings list if there are failures + if (stats.failedNotesCount > 0 && !silent) { + await this.updateFailedEmbeddingsList(); + } } } catch (error) { console.error("Error fetching embedding stats:", error); @@ -514,57 +651,85 @@ export default class AiSettingsWidget extends OptionsWidget { } } - updateAiSectionVisibility() { + async updateFailedEmbeddingsList() { if (!this.$widget) return; - const aiEnabled = this.$widget.find('.ai-enabled').prop('checked'); - this.$widget.find('.ai-providers-section').toggle(aiEnabled); - this.$widget.find('.ai-provider').toggle(aiEnabled); - this.$widget.find('.embedding-section').toggle(aiEnabled); - - // Start or stop polling based on visibility - if (aiEnabled && this.$widget.find('.embedding-section').is(':visible')) { - this.startStatsPolling(); - } else { - this.stopStatsPolling(); + const failedResult = await this.getFailedEmbeddingNotes(); + if (!failedResult || !failedResult.failedNotes.length) { + // Use consistent styling with the rest of the application + this.$widget.find('.embedding-failed-notes-list').html( + `
No failed embeddings
` + ); + return; } - } - // Clean up when the widget is removed - cleanup() { - this.stopStatsPolling(); - super.cleanup(); - } + const $failedHeader = $(` +
+
Failed Embeddings (${failedResult.failedNotes.length})
+ +
+ `); - optionsLoaded(options: OptionMap) { - if (!this.$widget) return; + const $failedList = $('
'); - this.setCheckboxState(this.$widget.find('.ai-enabled'), options.aiEnabled || 'false'); - this.setCheckboxState(this.$widget.find('.ollama-enabled'), options.ollamaEnabled || 'false'); + for (const note of failedResult.failedNotes) { + // Determine if this is a full note failure or just failed chunks + const isFullFailure = note.failureType === 'full'; + const badgeClass = isFullFailure ? 'badge-danger' : 'badge-warning'; + const badgeText = isFullFailure ? 'Full Note' : `${note.chunks} Chunks`; - this.$widget.find('.ai-provider-precedence').val(options.aiProviderPrecedence || 'openai,anthropic,ollama'); - this.$widget.find('.ai-temperature').val(options.aiTemperature || '0.7'); - this.$widget.find('.ai-system-prompt').val(options.aiSystemPrompt || ''); + const $item = $(` +
+
+
+
${note.title || note.noteId}
+ ${badgeText} +
+ +
+
+
Attempts: ${note.attempts}
+
Last attempt: ${note.lastAttempt}
+
Error: ${note.error}
+
+
+ `); - this.$widget.find('.openai-api-key').val(options.openaiApiKey || ''); - this.$widget.find('.openai-default-model').val(options.openaiDefaultModel || 'gpt-4o'); - this.$widget.find('.openai-base-url').val(options.openaiBaseUrl || 'https://api.openai.com/v1'); + $failedList.append($item); + } - this.$widget.find('.anthropic-api-key').val(options.anthropicApiKey || ''); - this.$widget.find('.anthropic-default-model').val(options.anthropicDefaultModel || 'claude-3-opus-20240229'); - this.$widget.find('.anthropic-base-url').val(options.anthropicBaseUrl || 'https://api.anthropic.com/v1'); + this.$widget.find('.embedding-failed-notes-list').empty().append($failedHeader, $failedList); - this.$widget.find('.ollama-base-url').val(options.ollamaBaseUrl || 'http://localhost:11434'); - this.$widget.find('.ollama-default-model').val(options.ollamaDefaultModel || 'llama3'); - this.$widget.find('.ollama-embedding-model').val(options.ollamaEmbeddingModel || 'nomic-embed-text'); + // Add event handlers using local variables to avoid 'this' issues + const self = this; - // Load embedding options - this.$widget.find('.embedding-default-provider').val(options.embeddingsDefaultProvider || 'openai'); - this.setCheckboxState(this.$widget.find('.embedding-auto-update-enabled'), options.embeddingAutoUpdateEnabled || 'true'); - this.$widget.find('.embedding-batch-size').val(options.embeddingBatchSize || '10'); - this.$widget.find('.embedding-update-interval').val(options.embeddingUpdateInterval || '5000'); - this.$widget.find('.embedding-default-dimension').val(options.embeddingDefaultDimension || '1536'); + this.$widget.find('.retry-btn').on('click', async function() { + const noteId = $(this).data('note-id'); + $(this).prop('disabled', true).text('Retrying...'); - this.updateAiSectionVisibility(); + const success = await self.retryFailedEmbedding(noteId); + + if (success) { + toastService.showMessage("Note queued for retry"); + await self.refreshEmbeddingStats(); + } else { + toastService.showError("Failed to retry note"); + $(this).prop('disabled', false).text('Retry'); + } + }); + + this.$widget.find('.retry-all-btn').on('click', async function() { + $(this).prop('disabled', true).text('Retrying All...'); + + const success = await self.retryAllFailedEmbeddings(); + + if (success) { + toastService.showMessage("All failed notes queued for retry"); + await self.refreshEmbeddingStats(); + } else { + toastService.showError("Failed to retry notes"); + $(this).prop('disabled', false).text('Retry All Failed'); + } + }); } } diff --git a/src/public/translations/en/translation.json b/src/public/translations/en/translation.json index 2ea6f51e8..b3f84ac09 100644 --- a/src/public/translations/en/translation.json +++ b/src/public/translations/en/translation.json @@ -1123,6 +1123,8 @@ }, "ai_llm": { "title": "AI/LLM Integration", + "enable_ai": "Enable AI/LLM features", + "enable_ai_desc": "Enable AI features like note summarization, content generation, and other LLM capabilities", "enable_ai_features": "Enable AI/LLM features", "enable_ai_description": "Enable AI features like note summarization, content generation, and other LLM capabilities", "provider_configuration": "AI Provider Configuration", @@ -1157,6 +1159,8 @@ "embedding_default_provider_description": "Select the default provider used for generating note embeddings", "enable_auto_update_embeddings": "Auto-update Embeddings", "enable_auto_update_embeddings_description": "Automatically update embeddings when notes are modified", + "auto_update_embeddings": "Auto-update Embeddings", + "auto_update_embeddings_desc": "Automatically update embeddings when notes are modified", "embedding_batch_size": "Batch Size", "embedding_batch_size_description": "Number of notes to process in a single batch (1-50)", "embedding_update_interval": "Update Interval (ms)", diff --git a/src/routes/api/embeddings.ts b/src/routes/api/embeddings.ts index ca1758df0..061bfecd6 100644 --- a/src/routes/api/embeddings.ts +++ b/src/routes/api/embeddings.ts @@ -203,6 +203,60 @@ async function getEmbeddingStats(req: Request, res: Response) { }; } +/** + * Get list of failed embedding notes + */ +async function getFailedNotes(req: Request, res: Response) { + const limit = parseInt(req.query.limit as string || '100', 10); + const failedNotes = await vectorStore.getFailedEmbeddingNotes(limit); + + // No need to fetch note titles here anymore as they're already included in the response + return { + success: true, + failedNotes: failedNotes + }; +} + +/** + * Retry a specific failed note embedding + */ +async function retryFailedNote(req: Request, res: Response) { + const { noteId } = req.params; + + if (!noteId) { + return [400, { + success: false, + message: "Note ID is required" + }]; + } + + const success = await vectorStore.retryFailedEmbedding(noteId); + + if (!success) { + return [404, { + success: false, + message: "Failed note not found or note is not marked as failed" + }]; + } + + return { + success: true, + message: "Note queued for retry" + }; +} + +/** + * Retry all failed note embeddings + */ +async function retryAllFailedNotes(req: Request, res: Response) { + const count = await vectorStore.retryAllFailedEmbeddings(); + + return { + success: true, + message: `${count} failed notes queued for retry` + }; +} + export default { findSimilarNotes, searchByText, @@ -210,5 +264,8 @@ export default { updateProvider, reprocessAllNotes, getQueueStatus, - getEmbeddingStats + getEmbeddingStats, + getFailedNotes, + retryFailedNote, + retryAllFailedNotes }; diff --git a/src/routes/routes.ts b/src/routes/routes.ts index ba1f8b7d1..31d78c9c4 100644 --- a/src/routes/routes.ts +++ b/src/routes/routes.ts @@ -380,6 +380,9 @@ function register(app: express.Application) { apiRoute(PST, "/api/embeddings/reprocess", embeddingsRoute.reprocessAllNotes); apiRoute(GET, "/api/embeddings/queue-status", embeddingsRoute.getQueueStatus); apiRoute(GET, "/api/embeddings/stats", embeddingsRoute.getEmbeddingStats); + apiRoute(GET, "/api/embeddings/failed", embeddingsRoute.getFailedNotes); + apiRoute(PST, "/api/embeddings/retry/:noteId", embeddingsRoute.retryFailedNote); + apiRoute(PST, "/api/embeddings/retry-all-failed", embeddingsRoute.retryAllFailedNotes); apiRoute(PST, "/api/llm/sessions", llmRoute.createSession); apiRoute(GET, "/api/llm/sessions", llmRoute.listSessions); diff --git a/src/services/llm/embeddings/vector_store.ts b/src/services/llm/embeddings/vector_store.ts index 099dc3fb9..168066ca9 100644 --- a/src/services/llm/embeddings/vector_store.ts +++ b/src/services/llm/embeddings/vector_store.ts @@ -566,6 +566,213 @@ export async function deleteNoteEmbeddings(noteId: string, providerId?: string, } } +/** + * Get notes that have failed embedding generation + * + * @param limit - Maximum number of failed notes to return + * @returns List of failed notes with their error information + */ +export async function getFailedEmbeddingNotes(limit: number = 100): Promise { + // Get notes with failed embedding attempts + const failedQueueItems = await sql.getRows(` + SELECT noteId, operation, attempts, lastAttempt, error + FROM embedding_queue + WHERE attempts > 0 + ORDER BY attempts DESC, lastAttempt DESC + LIMIT ?`, + [limit] + ) as {noteId: string, operation: string, attempts: number, lastAttempt: string, error: string}[]; + + // Add titles to the failed notes + const failedNotesWithTitles = []; + for (const item of failedQueueItems) { + const note = becca.getNote(item.noteId); + if (note) { + failedNotesWithTitles.push({ + ...item, + title: note.title, + failureType: 'full' // This indicates a complete embedding failure + }); + } else { + failedNotesWithTitles.push({ + ...item, + failureType: 'full' + }); + } + } + + // Now get notes with failed chunks + // We need to search for labels that contain failed chunks data + const notes = await sql.getRows(` + SELECT noteId, name, value + FROM attributes + WHERE type = 'label' AND name LIKE '%FailedChunks' + `) as {noteId: string, name: string, value: string}[]; + + // Process notes with failed chunks + for (const item of notes) { + try { + const noteId = item.noteId; + const note = becca.getNote(noteId); + if (!note) continue; + + // Parse the failed chunks data + const failedChunks = JSON.parse(item.value) as Record; + const chunkCount = Object.keys(failedChunks).length; + if (chunkCount === 0) continue; + + // Get the most recent failed chunk + let latestAttempt = ''; + let totalAttempts = 0; + let errorExample = ''; + + for (const chunkId in failedChunks) { + const chunk = failedChunks[chunkId]; + totalAttempts += chunk.attempts; + + if (!latestAttempt || chunk.lastAttempt > latestAttempt) { + latestAttempt = chunk.lastAttempt; + errorExample = chunk.error; + } + } + + // Add this to our list of failed notes + failedNotesWithTitles.push({ + noteId, + title: note.title, + failureType: 'chunks', + chunks: chunkCount, + attempts: totalAttempts, + lastAttempt: latestAttempt, + error: `${chunkCount} chunks failed: ${errorExample}` + }); + } catch (error) { + console.error("Error processing note with failed chunks:", error); + } + } + + // Sort by latest attempt + failedNotesWithTitles.sort((a, b) => { + if (a.lastAttempt && b.lastAttempt) { + return b.lastAttempt.localeCompare(a.lastAttempt); + } + return 0; + }); + + // Limit to the specified number + return failedNotesWithTitles.slice(0, limit); +} + +/** + * Retry embedding generation for a specific failed note + * + * @param noteId - ID of the note to retry + * @returns Success flag + */ +export async function retryFailedEmbedding(noteId: string): Promise { + let success = false; + + // First, check if the note is in the embedding queue with failed attempts + const exists = await sql.getValue( + "SELECT 1 FROM embedding_queue WHERE noteId = ? AND attempts > 0", + [noteId] + ); + + if (exists) { + // Reset the note in the queue + const now = dateUtils.localNowDateTime(); + const utcNow = dateUtils.utcNowDateTime(); + + await sql.execute(` + UPDATE embedding_queue + SET attempts = 0, error = NULL, dateQueued = ?, utcDateQueued = ? + WHERE noteId = ?`, + [now, utcNow, noteId] + ); + success = true; + } + + // Next, check for failed chunks in labels + const note = becca.getNote(noteId); + if (note) { + // Look for any provider-specific failed chunks + const labels = note.getLabels(); + const failedChunksLabels = labels.filter(label => label.name.endsWith('FailedChunks')); + + for (const label of failedChunksLabels) { + // Remove the label - this will cause all chunks to be retried + await note.removeLabel(label.name); + success = true; + } + + // If we had chunk failures but no queue entry, we need to add one + if (failedChunksLabels.length > 0 && !exists) { + await queueNoteForEmbedding(noteId, 'UPDATE'); + } + } + + return success; +} + +/** + * Retry all failed embeddings + * + * @returns Number of notes queued for retry + */ +export async function retryAllFailedEmbeddings(): Promise { + let totalRetried = 0; + + // Get count of failed notes in queue + const failedCount = await sql.getValue( + "SELECT COUNT(*) FROM embedding_queue WHERE attempts > 0" + ) as number; + + if (failedCount > 0) { + // Reset all failed notes in the queue + const now = dateUtils.localNowDateTime(); + const utcNow = dateUtils.utcNowDateTime(); + + await sql.execute(` + UPDATE embedding_queue + SET attempts = 0, error = NULL, dateQueued = ?, utcDateQueued = ? + WHERE attempts > 0`, + [now, utcNow] + ); + + totalRetried += failedCount; + } + + // Now find notes with failed chunks + const notesWithFailedChunks = await sql.getRows(` + SELECT DISTINCT noteId + FROM attributes + WHERE type = 'label' AND name LIKE '%FailedChunks' + `) as {noteId: string}[]; + + // Process each note with failed chunks + for (const item of notesWithFailedChunks) { + const noteId = item.noteId; + const note = becca.getNote(noteId); + + if (note) { + // Get all failed chunks labels + const labels = note.getLabels(); + const failedChunksLabels = labels.filter(label => label.name.endsWith('FailedChunks')); + + for (const label of failedChunksLabels) { + // Remove the label - this will cause all chunks to be retried + await note.removeLabel(label.name); + } + + // Make sure the note is in the queue + await queueNoteForEmbedding(noteId, 'UPDATE'); + totalRetried++; + } + } + + return totalRetried; +} + /** * Process the embedding queue */ @@ -621,7 +828,10 @@ export async function processEmbeddingQueue() { const context = await getNoteEmbeddingContext(noteData.noteId); // Check if we should use chunking for large content - const useChunking = context.content.length > 5000; // Use chunking for large notes by default + const useChunking = context.content.length > 5000; + + // Track if all providers failed + let allProvidersFailed = true; // Process with each enabled provider for (const provider of enabledProviders) { @@ -642,16 +852,35 @@ export async function processEmbeddingQueue() { embedding ); } + // At least one provider succeeded + allProvidersFailed = false; } catch (providerError: any) { log.error(`Error generating embedding with provider ${provider.name} for note ${noteData.noteId}: ${providerError.message || 'Unknown error'}`); } } - // Remove from queue on success - await sql.execute( - "DELETE FROM embedding_queue WHERE noteId = ?", - [noteData.noteId] - ); + // Only remove from queue on success if at least one provider succeeded + if (!allProvidersFailed) { + await sql.execute( + "DELETE FROM embedding_queue WHERE noteId = ?", + [noteData.noteId] + ); + } else { + // If all providers failed, mark as failed but keep in queue + await sql.execute(` + UPDATE embedding_queue + SET attempts = attempts + 1, + lastAttempt = ?, + error = ? + WHERE noteId = ?`, + [dateUtils.utcNowDateTime(), "All providers failed to generate embeddings", noteData.noteId] + ); + + // Remove from queue if too many attempts + if (noteData.attempts + 1 >= 3) { + log.error(`Marked note ${noteData.noteId} as permanently failed after multiple embedding attempts`); + } + } } catch (error: any) { const noteData = note as unknown as QueueItem; @@ -667,13 +896,10 @@ export async function processEmbeddingQueue() { log.error(`Error processing embedding for note ${noteData.noteId}: ${error.message || 'Unknown error'}`); - // Remove from queue if too many attempts + // Don't remove from queue even after multiple failures, just mark as failed + // This allows manual retries later if (noteData.attempts + 1 >= 3) { - await sql.execute( - "DELETE FROM embedding_queue WHERE noteId = ?", - [noteData.noteId] - ); - log.error(`Removed note ${noteData.noteId} from embedding queue after multiple failures`); + log.error(`Marked note ${noteData.noteId} as permanently failed after multiple embedding attempts`); } } } @@ -857,40 +1083,151 @@ async function processNoteWithChunking( // Delete existing embeddings first to avoid duplicates await deleteNoteEmbeddings(noteId, provider.name, config.model); + // Track successful and failed chunks + let successfulChunks = 0; + let failedChunks = 0; + const totalChunks = chunks.length; + + // Get existing chunk failure data from the database + // We'll store this in a special attribute on the note to track per-chunk failures + const failedChunksData = await getFailedChunksData(noteId, provider.name); + // Process each chunk with a slight delay to avoid rate limits for (let i = 0; i < chunks.length; i++) { const chunk = chunks[i]; + const chunkId = `chunk_${i + 1}_of_${chunks.length}`; - // Create a modified context object with just this chunk's content - const chunkContext: NoteEmbeddingContext = { - ...context, - content: chunk - }; + // Skip chunks that have failed multiple times + if (failedChunksData[chunkId] && failedChunksData[chunkId].attempts >= 3) { + log.info(`Skipping chunk ${chunkId} for note ${noteId} after ${failedChunksData[chunkId].attempts} failed attempts`); + failedChunks++; + continue; + } - // Generate embedding for this chunk - const embedding = await provider.generateNoteEmbeddings(chunkContext); + try { + // Create a modified context object with just this chunk's content + const chunkContext: NoteEmbeddingContext = { + ...context, + content: chunk + }; - // Store with chunk information - await storeNoteEmbedding( - noteId, - provider.name, - config.model, - embedding - ); + // Generate embedding for this chunk + const embedding = await provider.generateNoteEmbeddings(chunkContext); - // Small delay between chunks to avoid rate limits - if (i < chunks.length - 1) { - await new Promise(resolve => setTimeout(resolve, 100)); + // Store with chunk information + await storeNoteEmbedding( + noteId, + provider.name, + config.model, + embedding + ); + + successfulChunks++; + + // Remove this chunk from failed chunks if it was previously failed + if (failedChunksData[chunkId]) { + delete failedChunksData[chunkId]; + await updateFailedChunksData(noteId, provider.name, failedChunksData); + } + + // Small delay between chunks to avoid rate limits + if (i < chunks.length - 1) { + await new Promise(resolve => setTimeout(resolve, 100)); + } + } catch (error: any) { + // Track the failure for this specific chunk + failedChunks++; + + if (!failedChunksData[chunkId]) { + failedChunksData[chunkId] = { + attempts: 1, + lastAttempt: dateUtils.utcNowDateTime(), + error: error.message || 'Unknown error' + }; + } else { + failedChunksData[chunkId].attempts++; + failedChunksData[chunkId].lastAttempt = dateUtils.utcNowDateTime(); + failedChunksData[chunkId].error = error.message || 'Unknown error'; + } + + // Update the failed chunks data in the database + await updateFailedChunksData(noteId, provider.name, failedChunksData); + + log.error(`Error processing chunk ${chunkId} for note ${noteId}: ${error.message || 'Unknown error'}`); } } - log.info(`Generated ${chunks.length} chunk embeddings for note ${noteId}`); + // Log information about the processed chunks + if (successfulChunks > 0) { + log.info(`Generated ${successfulChunks} chunk embeddings for note ${noteId}`); + } + + if (failedChunks > 0) { + log.info(`Failed to generate ${failedChunks} chunk embeddings for note ${noteId}`); + } + + // If all chunks failed, throw an error so the note will be marked as failed + if (successfulChunks === 0 && failedChunks > 0) { + throw new Error(`All ${failedChunks} chunks failed for note ${noteId}`); + } } catch (error: any) { log.error(`Error in chunked embedding process for note ${noteId}: ${error.message || 'Unknown error'}`); throw error; } } +/** + * Store failed chunk data for a note + * This is stored in a special attribute on the note so we can track per-chunk failures + */ +async function getFailedChunksData(noteId: string, providerId: string): Promise> { + try { + const attributeName = `${providerId}FailedChunks`; + const note = becca.getNote(noteId); + + if (!note) { + return {}; + } + + const attr = note.getLabels().find(attr => attr.name === attributeName); + + if (!attr || !attr.value) { + return {}; + } + + return JSON.parse(attr.value); + } catch (e) { + return {}; + } +} + +/** + * Update failed chunk data for a note + */ +async function updateFailedChunksData(noteId: string, providerId: string, data: Record): Promise { + try { + const attributeName = `${providerId}FailedChunks`; + const note = becca.getNote(noteId); + + if (!note) { + return; + } + + // Only store if there are failed chunks + if (Object.keys(data).length > 0) { + await note.setLabel(attributeName, JSON.stringify(data)); + } else { + // If no failed chunks, remove the attribute if it exists + const attr = note.getLabels().find(attr => attr.name === attributeName); + if (attr) { + await note.removeLabel(attributeName); + } + } + } catch (e) { + log.error(`Error updating failed chunks data for note ${noteId}: ${e}`); + } +} + export function cleanupEmbeddings() { // Cleanup function implementation } @@ -910,5 +1247,8 @@ export default { setupEmbeddingBackgroundProcessing, initEmbeddings, reprocessAllNotes, - getEmbeddingStats + getEmbeddingStats, + getFailedEmbeddingNotes, + retryFailedEmbedding, + retryAllFailedEmbeddings };