this is absolutely cursed, but query decomp works now

This commit is contained in:
perf3ct 2025-04-17 18:36:53 +00:00
parent daa56b10e8
commit a0dda48748
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232

View File

@ -99,29 +99,56 @@ Example: ["exact topic mentioned", "related concept 1", "related concept 2"]`
.replace(/[\u201C\u201D]/g, '"') // Replace smart quotes with straight quotes .replace(/[\u201C\u201D]/g, '"') // Replace smart quotes with straight quotes
.trim(); .trim();
// Check if the text might contain a JSON array (has square brackets) log.info(`Cleaned JSON string: ${jsonStr}`);
if (jsonStr.includes('[') && jsonStr.includes(']')) {
// Extract just the array part if there's explanatory text // Check if the text might contain a JSON structure (has curly braces or square brackets)
const arrayMatch = jsonStr.match(/\[[\s\S]*\]/); if ((jsonStr.includes('{') && jsonStr.includes('}')) || (jsonStr.includes('[') && jsonStr.includes(']'))) {
if (arrayMatch) { // Try to extract the JSON structure
jsonStr = arrayMatch[0]; let jsonMatch = jsonStr.match(/(\{[\s\S]*\}|\[[\s\S]*\])/);
if (jsonMatch) {
jsonStr = jsonMatch[0];
log.info(`Extracted JSON structure: ${jsonStr}`);
} }
// Try to parse the JSON // Try to parse the JSON
try { try {
const queries = JSON.parse(jsonStr); const parsed = JSON.parse(jsonStr);
if (Array.isArray(queries) && queries.length > 0) {
const result = queries.map(q => typeof q === 'string' ? q : String(q)).filter(Boolean); // Handle array format: ["query1", "query2"]
if (Array.isArray(parsed)) {
const result = parsed
.map(q => typeof q === 'string' ? q.trim() : String(q).trim())
.filter(Boolean);
cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, result); cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, result);
return result; return result;
} }
} catch (innerError) { // Handle object format: {"query1": "reason1", "query2": "reason2"} or {"query1" : "query2"}
// If parsing fails, log it and continue to the fallback else if (typeof parsed === 'object' && parsed !== null) {
log.info(`JSON parse error: ${innerError}. Will use fallback parsing for: ${jsonStr}`); // Extract both keys and values as potential queries
const keys = Object.keys(parsed);
const values = Object.values(parsed);
// Add keys as queries
const keysResult = keys
.filter(key => key && key.length > 3)
.map(key => key.trim());
// Add values as queries if they're strings and not already included
const valuesResult = values
.filter((val): val is string => typeof val === 'string' && val.length > 3)
.map(val => val.trim())
.filter(val => !keysResult.includes(val));
const result = [...keysResult, ...valuesResult];
cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, result);
return result;
}
} catch (parseError) {
log.info(`JSON parse error: ${parseError}. Will use fallback parsing.`);
} }
} }
// Fallback 1: Try to extract an array manually by splitting on commas between quotes // Fallback: Try to extract an array manually by splitting on commas between quotes
if (jsonStr.includes('[') && jsonStr.includes(']')) { if (jsonStr.includes('[') && jsonStr.includes(']')) {
const arrayContent = jsonStr.substring( const arrayContent = jsonStr.substring(
jsonStr.indexOf('[') + 1, jsonStr.indexOf('[') + 1,
@ -132,32 +159,43 @@ Example: ["exact topic mentioned", "related concept 1", "related concept 2"]`
const stringMatches = arrayContent.match(/"((?:\\.|[^"\\])*)"/g); const stringMatches = arrayContent.match(/"((?:\\.|[^"\\])*)"/g);
if (stringMatches && stringMatches.length > 0) { if (stringMatches && stringMatches.length > 0) {
const result = stringMatches const result = stringMatches
.map((m: string) => m.substring(1, m.length - 1)) // Remove surrounding quotes .map((m: string) => m.substring(1, m.length - 1).trim()) // Remove surrounding quotes
.filter((s: string) => s.length > 0); .filter((s: string) => s.length > 0);
cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, result); cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, result);
return result; return result;
} }
} }
// Fallback 2: Extract queries line by line // Fallback: Try to extract key-value pairs from object notation manually
const lines = responseText.split('\n') if (jsonStr.includes('{') && jsonStr.includes('}')) {
.map((line: string) => line.trim()) // Extract content between curly braces
.filter((line: string) => const objectContent = jsonStr.substring(
line.length > 0 && jsonStr.indexOf('{') + 1,
!line.startsWith('```') && jsonStr.lastIndexOf('}')
!line.match(/^\d+\.?\s*$/) && // Skip numbered list markers alone
!line.match(/^\[|\]$/) // Skip lines that are just brackets
); );
if (lines.length > 0) { // Split by commas that aren't inside quotes
// Remove numbering, quotes and other list markers from each line const pairs: string[] = objectContent.split(/,(?=(?:[^"]*"[^"]*")*[^"]*$)/);
const result = lines.map((line: string) => {
return line const result = pairs
.replace(/^\d+\.?\s*/, '') // Remove numbered list markers (1., 2., etc) .map(pair => {
.replace(/^[-*•]\s*/, '') // Remove bullet list markers // Split by colon that isn't inside quotes
.replace(/^["']|["']$/g, '') // Remove surrounding quotes const keyValue = pair.split(/:(?=(?:[^"]*"[^"]*")*[^"]*$)/);
.trim(); if (keyValue.length === 2) {
}).filter((s: string) => s.length > 0); const key = keyValue[0].replace(/"/g, '').trim();
const value = keyValue[1].replace(/"/g, '').trim();
if (key && key.length > 3) {
return key;
}
if (value && value.length > 3) {
return value;
}
}
return null;
})
.filter((s: string | null) => s !== null);
cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, result); cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, result);
return result; return result;
@ -269,40 +307,24 @@ Example: ["exact topic mentioned", "related concept 1", "related concept 2"]`
context?: string context?: string
): Promise<SubQuery[]> { ): Promise<SubQuery[]> {
try { try {
// Create a simple prompt for query decomposition // Use the proven prompt format that was working before
const prompt = `Decompose the following query into 3-5 specific search queries that would be effective for vector search. const prompt = `You are an AI assistant that decides what information needs to be retrieved from a user's knowledge base called TriliumNext Notes to answer the user's question.
Given the user's question, generate 3-5 specific search queries that would help find relevant information.
Your goal is to help find comprehensive information by breaking down the query into multiple search terms. Each query should be focused on a different aspect of the question.
Avoid generating queries that are too broad, vague, or about a user's entire Note database, and make sure they are relevant to the user's question.
IMPORTANT: DO NOT just reword the original query. Create MULTIPLE DISTINCT queries that explore different aspects. Format your answer as a JSON array of strings, with each string being a search query.
Example: ["exact topic mentioned", "related concept 1", "related concept 2"]`;
For example, if the query is "What are Docker containers?", good sub-queries would be:
1. "Docker container architecture and components"
2. "Docker vs virtual machines differences"
3. "Docker container use cases and benefits"
4. "Docker container deployment best practices"
Format your response as a JSON array of objects with 'text' and 'reason' properties.
Example: [
{"text": "Docker container architecture", "reason": "Understanding the technical structure"},
{"text": "Docker vs virtual machines", "reason": "Comparing with alternative technologies"},
{"text": "Docker container benefits", "reason": "Understanding advantages and use cases"},
{"text": "Docker deployment best practices", "reason": "Learning practical implementation"}
]
${context ? `\nContext: ${context}` : ''}
Query: ${query}`;
log.info(`Sending decomposition prompt to LLM for query: "${query}"`); log.info(`Sending decomposition prompt to LLM for query: "${query}"`);
const messages = [ const messages = [
{ role: "system" as const, content: prompt } { role: "system" as const, content: prompt },
{ role: "user" as const, content: query }
]; ];
const options = { const options = {
temperature: 0.7, temperature: 0.3,
maxTokens: SEARCH_CONSTANTS.LIMITS.QUERY_PROCESSOR_MAX_TOKENS, maxTokens: 300,
bypassFormatter: true, bypassFormatter: true,
expectsJsonResponse: true, expectsJsonResponse: true,
_bypassContextProcessing: true, _bypassContextProcessing: true,
@ -315,78 +337,144 @@ Query: ${query}`;
log.info(`Received LLM response for decomposition: ${responseText.substring(0, 200)}...`); log.info(`Received LLM response for decomposition: ${responseText.substring(0, 200)}...`);
// Try to parse the response as JSON // Parse the response to extract the queries
let subQueries: SubQuery[] = []; let searchQueries: string[] = [];
try { try {
// Extract the JSON from the response // Remove code blocks, quotes, and clean up the response text
const extractedJson = JsonExtractor.extract(responseText, { let jsonStr = responseText
extractArrays: true, .replace(/```(?:json)?|```/g, '') // Remove code block markers
applyFixes: true, .replace(/[\u201C\u201D]/g, '"') // Replace smart quotes with straight quotes
useFallbacks: true .trim();
});
log.info(`Extracted JSON: ${JSON.stringify(extractedJson).substring(0, 200)}...`); log.info(`Cleaned JSON string: ${jsonStr}`);
if (Array.isArray(extractedJson) && extractedJson.length > 0) { // Check if the text might contain a JSON structure (has curly braces or square brackets)
// Convert the extracted data to SubQuery objects if ((jsonStr.includes('{') && jsonStr.includes('}')) || (jsonStr.includes('[') && jsonStr.includes(']'))) {
subQueries = extractedJson // Try to extract the JSON structure
.filter(item => item && typeof item === 'object' && item.text) let jsonMatch = jsonStr.match(/(\{[\s\S]*\}|\[[\s\S]*\])/);
.map(item => ({ if (jsonMatch) {
id: this.generateSubQueryId(), jsonStr = jsonMatch[0];
text: item.text, log.info(`Extracted JSON structure: ${jsonStr}`);
reason: item.reason || "Sub-aspect of the main question", }
isAnswered: false
}));
log.info(`Successfully created ${subQueries.length} sub-queries from LLM response`); // Try to parse the JSON
} else { try {
log.info(`Failed to extract array of sub-queries from LLM response`); const parsed = JSON.parse(jsonStr);
// Handle array format: ["query1", "query2"]
if (Array.isArray(parsed)) {
searchQueries = parsed
.map(q => typeof q === 'string' ? q.trim() : String(q).trim())
.filter(Boolean);
log.info(`Extracted ${searchQueries.length} queries from JSON array`);
}
// Handle object format: {"query1": "reason1", "query2": "reason2"} or {"query1" : "query2"}
else if (typeof parsed === 'object' && parsed !== null) {
// Extract both keys and values as potential queries
const keys = Object.keys(parsed);
const values = Object.values(parsed);
// Add keys as queries
searchQueries = keys
.filter(key => key && key.length > 3)
.map(key => key.trim());
// Add values as queries if they're strings and not already included
values
.filter((val): val is string => typeof val === 'string' && val.length > 3)
.map(val => val.trim())
.forEach((val: string) => {
if (!searchQueries.includes(val)) {
searchQueries.push(val);
}
});
log.info(`Extracted ${searchQueries.length} queries from JSON object`);
}
} catch (parseError) {
log.info(`JSON parse error: ${parseError}. Will use fallback parsing.`);
}
} }
} catch (error) {
log.error(`Error parsing LLM response: ${error}`);
}
// Always include the original query // Fallback: Try to extract an array manually by splitting on commas between quotes
const hasOriginal = subQueries.some(sq => sq.text.toLowerCase() === query.toLowerCase()); if (searchQueries.length === 0 && jsonStr.includes('[') && jsonStr.includes(']')) {
if (!hasOriginal) { const arrayContent = jsonStr.substring(
subQueries.push({ jsonStr.indexOf('[') + 1,
id: this.generateSubQueryId(), jsonStr.lastIndexOf(']')
text: query, );
reason: "Original query",
isAnswered: false
});
log.info(`Added original query to sub-queries list`);
}
// Ensure we have at least 3 queries for better search coverage // Use regex to match quoted strings, handling escaped quotes
if (subQueries.length < 3) { const stringMatches = arrayContent.match(/"((?:\\.|[^"\\])*)"/g);
// Create some generic variants of the original query if (stringMatches && stringMatches.length > 0) {
const genericVariants = [ searchQueries = stringMatches
{ text: `${query} examples and use cases`, reason: "Practical applications" }, .map((m: string) => m.substring(1, m.length - 1).trim()) // Remove surrounding quotes
{ text: `${query} concepts and definitions`, reason: "Conceptual understanding" }, .filter((s: string) => s.length > 0);
{ text: `${query} best practices`, reason: "Implementation guidance" } log.info(`Extracted ${searchQueries.length} queries using regex`);
]; }
}
// Add variants until we have at least 3 queries // Fallback: Try to extract key-value pairs from object notation manually
for (let i = 0; i < genericVariants.length && subQueries.length < 3; i++) { if (searchQueries.length === 0 && jsonStr.includes('{') && jsonStr.includes('}')) {
subQueries.push({ // Extract content between curly braces
const objectContent = jsonStr.substring(
jsonStr.indexOf('{') + 1,
jsonStr.lastIndexOf('}')
);
// Split by commas that aren't inside quotes
const pairs: string[] = objectContent.split(/,(?=(?:[^"]*"[^"]*")*[^"]*$)/);
for (const pair of pairs) {
// Split by colon that isn't inside quotes
const keyValue = pair.split(/:(?=(?:[^"]*"[^"]*")*[^"]*$)/);
if (keyValue.length === 2) {
const key = keyValue[0].replace(/"/g, '').trim();
const value = keyValue[1].replace(/"/g, '').trim();
if (key && key.length > 3 && !searchQueries.includes(key)) {
searchQueries.push(key);
}
if (value && value.length > 3 && !searchQueries.includes(value)) {
searchQueries.push(value);
}
}
}
log.info(`Extracted ${searchQueries.length} queries from manual object parsing`);
}
// Convert search queries to SubQuery objects
if (searchQueries.length > 0) {
const subQueries = searchQueries.map((text, index) => ({
id: this.generateSubQueryId(), id: this.generateSubQueryId(),
text: genericVariants[i].text, text,
reason: genericVariants[i].reason, reason: `Search query ${index + 1}`,
isAnswered: false isAnswered: false
}); }));
}
log.info(`Added ${3 - subQueries.length} generic variants to ensure minimum 3 queries`); // Always include the original query if not already included
const hasOriginal = subQueries.some(sq => sq.text.toLowerCase().includes(query.toLowerCase()) || query.toLowerCase().includes(sq.text.toLowerCase()));
if (!hasOriginal) {
subQueries.unshift({
id: this.generateSubQueryId(),
text: query.trim(),
reason: "Original query",
isAnswered: false
});
log.info(`Added original query to sub-queries list`);
}
log.info(`Final sub-queries for vector search: ${subQueries.map(sq => `"${sq.text}"`).join(', ')}`);
return subQueries;
}
} catch (parseError) {
log.error(`Error parsing search queries: ${parseError}`);
} }
log.info(`Final sub-queries for vector search: ${subQueries.map(sq => `"${sq.text}"`).join(', ')}`); // Fallback if all extraction methods fail
return subQueries; log.info(`Using fallback queries`);
} catch (error) { return [
log.error(`Error in simpleQueryDecomposition: ${error}`);
// Return the original query plus some variants as fallback
const fallbackQueries = [
{ {
id: this.generateSubQueryId(), id: this.generateSubQueryId(),
text: query, text: query,
@ -395,20 +483,27 @@ Query: ${query}`;
}, },
{ {
id: this.generateSubQueryId(), id: this.generateSubQueryId(),
text: `${query} overview`, text: `${query.trim()} overview`,
reason: "General information", reason: "General information",
isAnswered: false isAnswered: false
}, },
{ {
id: this.generateSubQueryId(), id: this.generateSubQueryId(),
text: `${query} examples`, text: `${query.trim()} examples`,
reason: "Practical examples", reason: "Practical examples",
isAnswered: false isAnswered: false
} }
]; ];
} catch (error) {
log.error(`Error in simpleQueryDecomposition: ${error}`);
log.info(`Using fallback queries due to error: ${fallbackQueries.map(sq => `"${sq.text}"`).join(', ')}`); // Return the original query as fallback
return fallbackQueries; return [{
id: this.generateSubQueryId(),
text: query,
reason: "Error occurred, using original query",
isAnswered: false
}];
} }
} }