more aggressively filter notes out that don't work for us

This commit is contained in:
perf3ct 2025-03-20 19:42:38 +00:00
parent 1be70f1163
commit 915c95f7cb
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
4 changed files with 64 additions and 48 deletions

View File

@ -75,11 +75,14 @@ export class ContextFormatter {
content = this.sanitizeNoteContent(source.content, source.type, source.mime);
} else {
sourcesSkipped++;
log.info(`Skipping note with no content: ${source.title || 'Untitled'}`);
continue; // Skip invalid sources
}
if (!content || content.trim().length === 0) {
// Skip if content is empty or just whitespace/minimal
if (!content || content.trim().length <= 10) {
sourcesSkipped++;
log.info(`Skipping note with minimal content: ${source.title || 'Untitled'}`);
continue;
}

View File

@ -133,8 +133,18 @@ export class ContextService {
// Convert map to array and limit to top results
relevantNotes = Array.from(allResults.values())
.filter(note => {
// Filter out notes with no content or very minimal content (10 characters or fewer after trimming)
const hasContent = note.content && note.content.trim().length > 10;
if (!hasContent) {
log.info(`Filtering out empty/minimal note: "${note.title}" (${note.noteId})`);
}
return hasContent;
})
.sort((a, b) => b.similarity - a.similarity)
.slice(0, 20); // Increased from 8 to 20 notes
log.info(`After filtering out empty notes, ${relevantNotes.length} relevant notes remain`);
} catch (error) {
log.error(`Error finding relevant notes: ${error}`);
// Continue with empty notes list
@ -406,9 +416,17 @@ export class ContextService {
// Convert the combined Map to an array and sort by similarity
const combinedNotes = Array.from(allNotes.values())
.filter(note => {
// Filter out notes with no content or very minimal content
const hasContent = note.content && note.content.trim().length > 10;
if (!hasContent) {
log.info(`Filtering out empty/minimal note from combined results: "${note.title}" (${note.noteId})`);
}
return hasContent;
})
.sort((a, b) => b.similarity - a.similarity);
log.info(`Combined ${relevantNotes.length} notes from initial search with ${vectorSearchNotes.length} notes from vector search, resulting in ${combinedNotes.length} unique notes`);
log.info(`Combined ${relevantNotes.length} notes from initial search with ${vectorSearchNotes.length} notes from vector search, resulting in ${combinedNotes.length} unique notes after filtering out empty notes`);
// Filter for Qu-related notes
const quNotes = combinedNotes.filter(result =>

View File

@ -165,23 +165,48 @@ export class SemanticSearch {
// Get note content
const content = await this.contextExtractor.getNoteContent(result.noteId);
// Adjust similarity score based on content quality
let adjustedSimilarity = result.similarity;
// Penalize notes with empty or minimal content
if (!content || content.trim().length <= 10) {
// Reduce similarity by 80% for empty/minimal notes
adjustedSimilarity *= 0.2;
log.info(`Adjusting similarity for empty/minimal note "${note.title}" from ${Math.round(result.similarity * 100)}% to ${Math.round(adjustedSimilarity * 100)}%`);
}
// Slightly boost notes with substantial content
else if (content.length > 100) {
// Small boost of 10% for notes with substantial content
adjustedSimilarity = Math.min(1.0, adjustedSimilarity * 1.1);
}
return {
noteId: result.noteId,
title: note.title,
content,
similarity: result.similarity
similarity: adjustedSimilarity
};
})
);
// Filter out null results
const filteredResults = enrichedResults.filter(Boolean) as {
const filteredResults = enrichedResults.filter(result => {
// Filter out null results and notes with empty or minimal content
if (!result) return false;
// Instead of hard filtering by content length, now we use an adjusted
// similarity score, but we can still filter extremely low scores
return result.similarity > 0.2;
}) as {
noteId: string,
title: string,
content: string | null,
similarity: number
}[];
// Sort results by adjusted similarity
filteredResults.sort((a, b) => b.similarity - a.similarity);
// Cache results
cacheManager.storeQueryResults(cacheKey, filteredResults);
@ -224,48 +249,17 @@ export class SemanticSearch {
const model = provider.getConfig().model || '';
const providerName = provider.name;
// Check if vectorStore has the findSimilarNotesInSet method
if (typeof vectorStore.findSimilarNotesInSet === 'function') {
// Use the dedicated method if available
return await vectorStore.findSimilarNotesInSet(
embedding,
noteIds,
providerName,
model,
limit
);
}
// Fallback: Manually search through the notes in the subtree
const similarities: {noteId: string, similarity: number}[] = [];
for (const noteId of noteIds) {
try {
const noteEmbedding = await vectorStore.getEmbeddingForNote(
noteId,
providerName,
model
);
if (noteEmbedding && noteEmbedding.embedding) {
const similarity = cosineSimilarity(embedding, noteEmbedding.embedding);
if (similarity > 0.5) { // Apply a similarity threshold
similarities.push({
noteId,
similarity
});
}
}
} catch (error) {
// Skip notes that don't have embeddings
continue;
}
}
// Sort by similarity and return top results
return similarities
.sort((a, b) => b.similarity - a.similarity)
.slice(0, limit);
// Use vectorStore to find similar notes within this subset
// Ideally we'd have a method to find within a specific set, but we'll use the general findSimilarNotes
return await vectorStore.findSimilarNotes(
embedding,
providerName,
model,
limit
).then(results => {
// Filter to only include notes within our noteIds set
return results.filter(result => noteIds.includes(result.noteId));
});
} catch (error) {
log.error(`Error finding notes in branch: ${error}`);
return [];

View File

@ -10,7 +10,7 @@ You are an AI assistant integrated into TriliumNext Notes, a powerful note-takin
- Relation maps for visualizing connections between notes
- Synchronization between devices
Your primary goal is to help users find information in their notes, answer questions based on their knowledge base, and provide assistance with using TriliumNext Notes features.
Your primary goal is to help users find information in their notes, answer questions based on their knowledge base, and provide assistance with using TriliumNext Notes features. Be sure to summarize the notes and include the title of the notes when providing a summary.
When responding to queries:
1. For complex queries, decompose them into simpler parts and address each one
@ -29,4 +29,5 @@ When responding to queries:
14. When suggesting improvements to a user's note organization or structure, present these as optional enhancements rather than corrections
15. Maintain a helpful, knowledgeable tone focused on enhancing the user's knowledge management experience
16. Frame responses as collaborative assistance rather than authoritative instruction
17. Instead of telling a user on what Notes they have, provide them an overview of the notes and include the title of the notes when providing the overview.
17. Instead of simply telling a user which notes they have, summarize the notes and include each note's title in the summary.
18.