From ed52d7172969ce5ad2b88464ff42d1145b122850 Mon Sep 17 00:00:00 2001 From: perf3ct Date: Tue, 1 Apr 2025 21:42:09 +0000 Subject: [PATCH] do a better job at centralizing json extraction, and query "enhancer" search queries --- src/services/llm/ai_interface.ts | 3 + .../llm/context/modules/query_enhancer.ts | 99 ++--- .../llm/formatters/ollama_formatter.ts | 26 +- .../pipeline/interfaces/message_formatter.ts | 84 ++-- src/services/llm/providers/ollama_service.ts | 32 +- src/services/llm/utils/json_extractor.ts | 387 ++++++++++++++++++ 6 files changed, 507 insertions(+), 124 deletions(-) create mode 100644 src/services/llm/utils/json_extractor.ts diff --git a/src/services/llm/ai_interface.ts b/src/services/llm/ai_interface.ts index c539f33a8..e93824aab 100644 --- a/src/services/llm/ai_interface.ts +++ b/src/services/llm/ai_interface.ts @@ -23,6 +23,9 @@ export interface ChatCompletionOptions { presencePenalty?: number; showThinking?: boolean; systemPrompt?: string; + preserveSystemPrompt?: boolean; // Whether to preserve existing system message + bypassFormatter?: boolean; // Whether to bypass the message formatter entirely + expectsJsonResponse?: boolean; // Whether this request expects a JSON response stream?: boolean; // Whether to stream the response } diff --git a/src/services/llm/context/modules/query_enhancer.ts b/src/services/llm/context/modules/query_enhancer.ts index 56453675c..e2f2eeefe 100644 --- a/src/services/llm/context/modules/query_enhancer.ts +++ b/src/services/llm/context/modules/query_enhancer.ts @@ -4,6 +4,7 @@ import type { Message } from '../../ai_interface.js'; import { CONTEXT_PROMPTS } from '../../constants/llm_prompt_constants.js'; import type { LLMServiceInterface } from '../../interfaces/agent_tool_interfaces.js'; import type { IQueryEnhancer } from '../../interfaces/context_interfaces.js'; +import JsonExtractor from '../../utils/json_extractor.js'; /** * Provides utilities for enhancing queries and generating search queries @@ -12,6 +13,15 @@ export class QueryEnhancer implements IQueryEnhancer { // Use the centralized query enhancer prompt private metaPrompt = CONTEXT_PROMPTS.QUERY_ENHANCER; + /** + * Get enhanced prompt with JSON formatting instructions + */ + private getEnhancedPrompt(): string { + return `${this.metaPrompt} +IMPORTANT: You must respond with valid JSON arrays. Always include commas between array elements. +Format your answer as a valid JSON array without markdown code blocks, like this: ["item1", "item2", "item3"]`; + } + /** * Generate search queries to find relevant information for the user question * @@ -32,95 +42,38 @@ export class QueryEnhancer implements IQueryEnhancer { } const messages: Message[] = [ - { role: "system", content: this.metaPrompt }, + { role: "system", content: this.getEnhancedPrompt() }, { role: "user", content: userQuestion } ]; const options = { temperature: 0.3, - maxTokens: 300 + maxTokens: 300, + bypassFormatter: true, // Completely bypass formatter for query enhancement + expectsJsonResponse: true // Explicitly request JSON-formatted response }; // Get the response from the LLM const response = await llmService.generateChatCompletion(messages, options); const responseText = response.text; // Extract the text from the response object - try { - // Remove code blocks, quotes, and clean up the response text - let jsonStr = responseText - .replace(/```(?:json)?|```/g, '') // Remove code block markers - .replace(/[\u201C\u201D]/g, '"') // Replace smart quotes with straight quotes - .trim(); + // Use the JsonExtractor to parse the response + const queries = JsonExtractor.extract(responseText, { + extractArrays: true, + minStringLength: 3, + applyFixes: true, + useFallbacks: true + }); - // Check if the text might contain a JSON array (has square brackets) - if (jsonStr.includes('[') && jsonStr.includes(']')) { - // Extract just the array part if there's explanatory text - const arrayMatch = jsonStr.match(/\[[\s\S]*\]/); - if (arrayMatch) { - jsonStr = arrayMatch[0]; - } - - // Try to parse the JSON - try { - const queries = JSON.parse(jsonStr); - if (Array.isArray(queries) && queries.length > 0) { - const result = queries.map(q => typeof q === 'string' ? q : String(q)).filter(Boolean); - cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, result); - return result; - } - } catch (innerError) { - // If parsing fails, log it and continue to the fallback - log.info(`JSON parse error: ${innerError}. Will use fallback parsing for: ${jsonStr}`); - } - } - - // Fallback 1: Try to extract an array manually by splitting on commas between quotes - if (jsonStr.includes('[') && jsonStr.includes(']')) { - const arrayContent = jsonStr.substring( - jsonStr.indexOf('[') + 1, - jsonStr.lastIndexOf(']') - ); - - // Use regex to match quoted strings, handling escaped quotes - const stringMatches = arrayContent.match(/"((?:\\.|[^"\\])*)"/g); - if (stringMatches && stringMatches.length > 0) { - const result = stringMatches - .map((m: string) => m.substring(1, m.length - 1)) // Remove surrounding quotes - .filter((s: string) => s.length > 0); - cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, result); - return result; - } - } - - // Fallback 2: Extract queries line by line - const lines = responseText.split('\n') - .map((line: string) => line.trim()) - .filter((line: string) => - line.length > 0 && - !line.startsWith('```') && - !line.match(/^\d+\.?\s*$/) && // Skip numbered list markers alone - !line.match(/^\[|\]$/) // Skip lines that are just brackets - ); - - if (lines.length > 0) { - // Remove numbering, quotes and other list markers from each line - const result = lines.map((line: string) => { - return line - .replace(/^\d+\.?\s*/, '') // Remove numbered list markers (1., 2., etc) - .replace(/^[-*•]\s*/, '') // Remove bullet list markers - .replace(/^["']|["']$/g, '') // Remove surrounding quotes - .trim(); - }).filter((s: string) => s.length > 0); - - cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, result); - return result; - } - } catch (parseError) { - log.error(`Error parsing search queries: ${parseError}`); + if (queries && queries.length > 0) { + log.info(`Extracted ${queries.length} queries using JsonExtractor`); + cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, queries); + return queries; } // If all else fails, just use the original question const fallback = [userQuestion]; + log.info(`No queries extracted, using fallback: "${userQuestion}"`); cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, fallback); return fallback; } catch (error: unknown) { diff --git a/src/services/llm/formatters/ollama_formatter.ts b/src/services/llm/formatters/ollama_formatter.ts index 15216112c..91090cdd0 100644 --- a/src/services/llm/formatters/ollama_formatter.ts +++ b/src/services/llm/formatters/ollama_formatter.ts @@ -23,22 +23,30 @@ export class OllamaMessageFormatter extends BaseMessageFormatter { /** * Format messages for the Ollama API + * @param messages Messages to format + * @param systemPrompt Optional system prompt to use + * @param context Optional context to include + * @param preserveSystemPrompt When true, preserves existing system messages rather than replacing them */ - formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[] { + formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[] { const formattedMessages: Message[] = []; // First identify user and system messages const systemMessages = messages.filter(msg => msg.role === 'system'); const userMessages = messages.filter(msg => msg.role === 'user' || msg.role === 'assistant'); - // Create base system message with instructions or use default - const basePrompt = systemPrompt || PROVIDER_PROMPTS.COMMON.DEFAULT_ASSISTANT_INTRO; - - // Always add a system message with the base prompt - formattedMessages.push({ - role: 'system', - content: basePrompt - }); + // Determine if we should preserve the existing system message + if (preserveSystemPrompt && systemMessages.length > 0) { + // Preserve the existing system message + formattedMessages.push(systemMessages[0]); + } else { + // Use provided systemPrompt or default + const basePrompt = systemPrompt || PROVIDER_PROMPTS.COMMON.DEFAULT_ASSISTANT_INTRO; + formattedMessages.push({ + role: 'system', + content: basePrompt + }); + } // If we have context, inject it into the first user message if (context && userMessages.length > 0) { diff --git a/src/services/llm/pipeline/interfaces/message_formatter.ts b/src/services/llm/pipeline/interfaces/message_formatter.ts index c092eceef..9fc9f19f4 100644 --- a/src/services/llm/pipeline/interfaces/message_formatter.ts +++ b/src/services/llm/pipeline/interfaces/message_formatter.ts @@ -9,9 +9,10 @@ export interface MessageFormatter { * @param messages Original messages * @param systemPrompt Optional system prompt to override * @param context Optional context to include + * @param preserveSystemPrompt Optional flag to preserve existing system prompt * @returns Formatted messages optimized for the specific provider */ - formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[]; + formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[]; } /** @@ -22,15 +23,15 @@ export abstract class BaseMessageFormatter implements MessageFormatter { * Format messages with system prompt and context * Each provider should override this method with their specific formatting strategy */ - abstract formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[]; - + abstract formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[]; + /** * Helper method to extract existing system message from messages */ protected getSystemMessage(messages: Message[]): Message | undefined { return messages.find(msg => msg.role === 'system'); } - + /** * Helper method to create a copy of messages without system message */ @@ -44,22 +45,26 @@ export abstract class BaseMessageFormatter implements MessageFormatter { * Optimizes message format for OpenAI models (GPT-3.5, GPT-4, etc.) */ export class OpenAIMessageFormatter extends BaseMessageFormatter { - formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[] { + formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[] { const formattedMessages: Message[] = []; - + // OpenAI performs best with system message first, then context as a separate system message // or appended to the original system message - + // Handle system message const existingSystem = this.getSystemMessage(messages); - if (systemPrompt || existingSystem) { + + if (preserveSystemPrompt && existingSystem) { + // Use the existing system message + formattedMessages.push(existingSystem); + } else if (systemPrompt || existingSystem) { const systemContent = systemPrompt || existingSystem?.content || ''; formattedMessages.push({ role: 'system', content: systemContent }); } - + // Add context as a system message with clear instruction if (context) { formattedMessages.push({ @@ -67,10 +72,10 @@ export class OpenAIMessageFormatter extends BaseMessageFormatter { content: `Please use the following context to respond to the user's messages:\n\n${context}` }); } - + // Add remaining messages (excluding system) formattedMessages.push(...this.getMessagesWithoutSystem(messages)); - + return formattedMessages; } } @@ -80,24 +85,26 @@ export class OpenAIMessageFormatter extends BaseMessageFormatter { * Optimizes message format for Claude models */ export class AnthropicMessageFormatter extends BaseMessageFormatter { - formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[] { + formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[] { const formattedMessages: Message[] = []; - + // Anthropic performs best with a specific XML-like format for context and system instructions - + // Create system message with combined prompt and context if any let systemContent = ''; const existingSystem = this.getSystemMessage(messages); - - if (systemPrompt || existingSystem) { + + if (preserveSystemPrompt && existingSystem) { + systemContent = existingSystem.content; + } else if (systemPrompt || existingSystem) { systemContent = systemPrompt || existingSystem?.content || ''; } - + // For Claude, wrap context in XML tags for clear separation if (context) { systemContent += `\n\n\n${context}\n`; } - + // Add system message if we have content if (systemContent) { formattedMessages.push({ @@ -105,10 +112,10 @@ export class AnthropicMessageFormatter extends BaseMessageFormatter { content: systemContent }); } - + // Add remaining messages (excluding system) formattedMessages.push(...this.getMessagesWithoutSystem(messages)); - + return formattedMessages; } } @@ -118,25 +125,25 @@ export class AnthropicMessageFormatter extends BaseMessageFormatter { * Optimizes message format for open-source models */ export class OllamaMessageFormatter extends BaseMessageFormatter { - formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[] { + formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[] { const formattedMessages: Message[] = []; - + // Ollama format is closer to raw prompting and typically works better with // context embedded in system prompt rather than as separate messages - + // Build comprehensive system prompt let systemContent = ''; const existingSystem = this.getSystemMessage(messages); - + if (systemPrompt || existingSystem) { systemContent = systemPrompt || existingSystem?.content || ''; } - + // Add context to system prompt if (context) { systemContent += `\n\nReference information:\n${context}`; } - + // Add system message if we have content if (systemContent) { formattedMessages.push({ @@ -144,10 +151,10 @@ export class OllamaMessageFormatter extends BaseMessageFormatter { content: systemContent }); } - + // Add remaining messages (excluding system) formattedMessages.push(...this.getMessagesWithoutSystem(messages)); - + return formattedMessages; } } @@ -156,19 +163,22 @@ export class OllamaMessageFormatter extends BaseMessageFormatter { * Default message formatter when provider is unknown */ export class DefaultMessageFormatter extends BaseMessageFormatter { - formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[] { + formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[] { const formattedMessages: Message[] = []; - + // Handle system message const existingSystem = this.getSystemMessage(messages); - if (systemPrompt || existingSystem) { + + if (preserveSystemPrompt && existingSystem) { + formattedMessages.push(existingSystem); + } else if (systemPrompt || existingSystem) { const systemContent = systemPrompt || existingSystem?.content || ''; formattedMessages.push({ role: 'system', content: systemContent }); } - + // Add context as a user message if (context) { formattedMessages.push({ @@ -176,10 +186,10 @@ export class DefaultMessageFormatter extends BaseMessageFormatter { content: `Here is context to help you answer my questions: ${context}` }); } - + // Add user/assistant messages formattedMessages.push(...this.getMessagesWithoutSystem(messages)); - + return formattedMessages; } } @@ -194,7 +204,7 @@ export class MessageFormatterFactory { ollama: new OllamaMessageFormatter(), default: new DefaultMessageFormatter() }; - + /** * Get the appropriate formatter for a provider * @param provider Provider name @@ -203,7 +213,7 @@ export class MessageFormatterFactory { static getFormatter(provider: string): MessageFormatter { return this.formatters[provider] || this.formatters.default; } - + /** * Register a custom formatter for a provider * @param provider Provider name @@ -212,4 +222,4 @@ export class MessageFormatterFactory { static registerFormatter(provider: string, formatter: MessageFormatter): void { this.formatters[provider] = formatter; } -} \ No newline at end of file +} diff --git a/src/services/llm/providers/ollama_service.ts b/src/services/llm/providers/ollama_service.ts index d8aab4598..9dddc3e1b 100644 --- a/src/services/llm/providers/ollama_service.ts +++ b/src/services/llm/providers/ollama_service.ts @@ -48,10 +48,30 @@ export class OllamaService extends BaseAIService { const systemPrompt = this.getSystemPrompt(opts.systemPrompt || options.getOption('aiSystemPrompt')); try { - // Use the formatter to prepare messages - const formattedMessages = this.formatter.formatMessages(messages, systemPrompt); + // Determine whether to use the formatter or send messages directly + let messagesToSend: Message[]; - console.log(`Sending to Ollama with formatted messages:`, JSON.stringify(formattedMessages, null, 2)); + if (opts.bypassFormatter) { + // Bypass the formatter entirely - use messages as is + messagesToSend = [...messages]; + console.log(`Bypassing formatter for Ollama request with ${messages.length} messages`); + } else { + // Use the formatter to prepare messages + messagesToSend = this.formatter.formatMessages( + messages, + systemPrompt, + undefined, // context + opts.preserveSystemPrompt + ); + console.log(`Sending to Ollama with formatted messages:`, JSON.stringify(messagesToSend, null, 2)); + } + + // Check if this is a request that expects JSON response + const expectsJsonResponse = opts.expectsJsonResponse || false; + + if (expectsJsonResponse) { + console.log(`Request expects JSON response, adding response_format parameter`); + } const response = await fetch(`${apiBase}/api/chat`, { method: 'POST', @@ -60,9 +80,11 @@ export class OllamaService extends BaseAIService { }, body: JSON.stringify({ model, - messages: formattedMessages, + messages: messagesToSend, options: { - temperature + temperature, + // Add response_format for requests that expect JSON + ...(expectsJsonResponse ? { response_format: { type: "json_object" } } : {}) }, stream: false }) diff --git a/src/services/llm/utils/json_extractor.ts b/src/services/llm/utils/json_extractor.ts new file mode 100644 index 000000000..5196544c3 --- /dev/null +++ b/src/services/llm/utils/json_extractor.ts @@ -0,0 +1,387 @@ +import log from '../../log.js'; + +/** + * Options for JSON extraction + */ +export interface JsonExtractionOptions { + /** Attempt to find and extract arrays as the primary target (for query enhancers, etc.) */ + extractArrays?: boolean; + /** Minimum length for extracted strings to be considered valid */ + minStringLength?: number; + /** Apply fixes to malformed JSON before parsing */ + applyFixes?: boolean; + /** Whether to use fallback extraction methods when JSON parsing fails */ + useFallbacks?: boolean; +} + +/** + * Structure of a tool call extracted from an LLM response + */ +export interface ExtractedToolCall { + /** The name of the tool to call */ + tool_name: string; + /** Parameters for the tool call */ + parameters: Record; + /** The original JSON string that was parsed */ + originalJson?: string; +} + +/** + * Utility class for extracting and parsing JSON from LLM responses + * Handles malformed JSON, escaping issues, and provides fallback mechanisms + */ +export class JsonExtractor { + /** + * Extract JSON from an LLM response + * + * @param text - The raw text from an LLM response + * @param options - Options to control extraction behavior + * @returns The parsed JSON object or array, or null if extraction failed + */ + static extract(text: string, options: JsonExtractionOptions = {}): T | null { + const opts = { + extractArrays: false, + minStringLength: 3, + applyFixes: true, + useFallbacks: true, + ...options + }; + + try { + // Clean up the input text + let cleanedText = this.cleanMarkdownAndFormatting(text); + + // Try to extract specific JSON structures if needed + if (opts.extractArrays) { + const arrayResult = this.extractArray(cleanedText, opts); + if (arrayResult) { + return arrayResult as unknown as T; + } + } + + // Try direct JSON parsing with fixes if enabled + if (opts.applyFixes) { + const fixedResult = this.extractWithFixes(cleanedText); + if (fixedResult !== null) { + return fixedResult as T; + } + } + + // Try direct JSON parsing without fixes + try { + return JSON.parse(cleanedText) as T; + } catch (e) { + // Fall through to fallbacks + } + + // Use fallbacks if enabled + if (opts.useFallbacks) { + if (opts.extractArrays) { + const items = this.extractItemsAsFallback(text, opts.minStringLength); + if (items.length > 0) { + return items as unknown as T; + } + } + + // If it looks like a JSON object but can't be parsed, try regex extraction + if (cleanedText.includes('{') && cleanedText.includes('}')) { + const objectResult = this.extractObject(cleanedText); + if (objectResult) { + return objectResult as T; + } + } + } + + return null; + } catch (error) { + log.error(`JSON extraction error: ${error}`); + return null; + } + } + + /** + * Extract tool calls from an LLM response + * Specifically designed to handle Ollama tool call format + * + * @param text - Raw text from the LLM response + * @returns Array of tool calls or empty array if none found + */ + static extractToolCalls(text: string): ExtractedToolCall[] { + const toolCalls: ExtractedToolCall[] = []; + + try { + // Clean up the text and find all JSON objects + const cleanedText = this.cleanMarkdownAndFormatting(text); + + // Try to find complete JSON objects + const jsonObjectMatches = this.findJsonObjects(cleanedText); + + for (const jsonString of jsonObjectMatches) { + try { + // Try to fix and parse each potential JSON object + const fixedJson = this.applyJsonFixes(jsonString); + const parsedJson = JSON.parse(fixedJson); + + // Check if this looks like a tool call + if ( + parsedJson && + typeof parsedJson === 'object' && + parsedJson.tool_name && + typeof parsedJson.tool_name === 'string' && + parsedJson.parameters && + typeof parsedJson.parameters === 'object' + ) { + toolCalls.push({ + tool_name: parsedJson.tool_name, + parameters: parsedJson.parameters, + originalJson: jsonString + }); + } + } catch (e) { + // If this JSON object failed to parse, try more aggressive fixes + log.info(`Failed to parse potential tool call JSON: ${e}`); + } + } + + // If we couldn't find valid tool calls with the first approach, try regex pattern matching + if (toolCalls.length === 0) { + // Look for tool_name/parameters patterns in the text + const toolNameMatch = text.match(/["']?tool_name["']?\s*:\s*["']([^"']+)["']/); + const parametersMatch = text.match(/["']?parameters["']?\s*:\s*({[^}]+})/); + + if (toolNameMatch && parametersMatch) { + try { + const toolName = toolNameMatch[1]; + const parametersStr = this.applyJsonFixes(parametersMatch[1]); + const parameters = JSON.parse(parametersStr); + + toolCalls.push({ + tool_name: toolName, + parameters, + originalJson: `{"tool_name":"${toolName}","parameters":${parametersStr}}` + }); + } catch (e) { + log.info(`Failed to parse tool call with regex approach: ${e}`); + } + } + } + } catch (error) { + log.error(`Error extracting tool calls: ${error}`); + } + + return toolCalls; + } + + /** + * Find all potential JSON objects in a text + */ + private static findJsonObjects(text: string): string[] { + const jsonObjects: string[] = []; + let bracesCount = 0; + let currentObject = ''; + let insideObject = false; + + // Scan through text character by character + for (let i = 0; i < text.length; i++) { + const char = text[i]; + + if (char === '{') { + bracesCount++; + if (!insideObject) { + insideObject = true; + currentObject = '{'; + } else { + currentObject += char; + } + } else if (char === '}') { + bracesCount--; + currentObject += char; + + if (bracesCount === 0 && insideObject) { + jsonObjects.push(currentObject); + currentObject = ''; + insideObject = false; + } + } else if (insideObject) { + currentObject += char; + } + } + + return jsonObjects; + } + + /** + * Clean Markdown formatting and special characters from text + */ + private static cleanMarkdownAndFormatting(text: string): string { + return text + .replace(/```(?:json)?|```/g, '') // Remove code block markers + .replace(/[\u201C\u201D]/g, '"') // Replace smart quotes with straight quotes + .trim(); + } + + /** + * Extract an array from text using regex and pattern matching + */ + private static extractArray(text: string, options: JsonExtractionOptions): string[] | null { + // First attempt: Find JSON arrays via regex + const arrayPattern = /\[((?:"(?:\\.|[^"\\])*"(?:\s*,\s*)?)+)\]/g; + const matches = [...text.matchAll(arrayPattern)]; + + if (matches.length > 0) { + // Take the first complete array match + const arrayContent = matches[0][1]; + + // Extract all properly quoted strings from the array + const stringPattern = /"((?:\\.|[^"\\])*)"/g; + const stringMatches = [...arrayContent.matchAll(stringPattern)]; + + if (stringMatches.length > 0) { + const items = stringMatches + .map(m => m[1].trim()) + .filter(s => s.length >= (options.minStringLength || 3)); + + if (items.length > 0) { + log.info(`Successfully extracted ${items.length} items using regex pattern`); + return items; + } + } + } + + // Second attempt: Try to extract array via standard JSON parsing with fixes + if (text.includes('[') && text.includes(']')) { + const arrayMatch = text.match(/\[[\s\S]*\]/); + if (arrayMatch) { + const arrayText = this.applyJsonFixes(arrayMatch[0]); + + try { + const array = JSON.parse(arrayText); + if (Array.isArray(array) && array.length > 0) { + const items = array + .map(item => typeof item === 'string' ? item : String(item)) + .filter(s => s.length >= (options.minStringLength || 3)); + + if (items.length > 0) { + log.info(`Successfully parsed JSON array with ${items.length} items`); + return items; + } + } + } catch (e) { + // Fall through to fallbacks + } + } + } + + return null; + } + + /** + * Extract a JSON object using regex and pattern matching + */ + private static extractObject(text: string): Record | null { + const objectMatch = text.match(/{[\s\S]*}/); + if (!objectMatch) return null; + + const objectText = this.applyJsonFixes(objectMatch[0]); + + try { + const parsed = JSON.parse(objectText); + return parsed; + } catch (e) { + return null; + } + } + + /** + * Apply fixes to malformed JSON text + */ + private static applyJsonFixes(text: string): string { + let fixed = text; + + // Fix common JSON formatting issues - replace newlines inside the JSON + fixed = fixed.replace(/\r?\n/g, ' '); + + // Fix unclosed quotes - replace trailing commas before closing brackets + fixed = fixed.replace(/,\s*]/g, ']'); + fixed = fixed.replace(/,\s*}/g, '}'); + + // Fix quotes inside strings + fixed = fixed.replace(/"([^"]*)"([^"]*)"([^"]*)"/g, '"$1\'$2\'$3"'); + + // Fix missing commas between elements + fixed = fixed.replace(/"([^"]*)"(?:\s+)"([^"]*)"/g, '"$1", "$2"'); + + // Fix missing commas in arrays (quotes with only spaces between them) + fixed = fixed.replace(/"([^"]*)"\s+"/g, '"$1", "'); + + // Fix unclosed quotes before commas + fixed = fixed.replace(/"([^"]*),\s*(?="|])/g, '"$1", '); + + return fixed; + } + + /** + * Extract with fixes and direct JSON parsing + */ + private static extractWithFixes(text: string): any | null { + try { + const fixed = this.applyJsonFixes(text); + return JSON.parse(fixed); + } catch (e) { + return null; + } + } + + /** + * Extract items as a fallback using various patterns + */ + private static extractItemsAsFallback(text: string, minLength: number = 3): string[] { + const patterns = [ + /(?:^|\n)["'](.+?)["'](?:,|\n|$)/g, // Quoted strings + /(?:^|\n)\[["'](.+?)["']\](?:,|\n|$)/g, // Single item arrays + /(?:^|\n)(\d+\.\s*.+?)(?:\n|$)/g, // Numbered list items + /(?:^|\n)[-*•]\s*(.+?)(?:\n|$)/g // Bullet list items + ]; + + const extractedItems = new Set(); + + // Try each pattern + for (const pattern of patterns) { + const matches = [...text.matchAll(pattern)]; + for (const match of matches) { + if (match[1] && match[1].trim().length >= minLength) { + extractedItems.add(match[1].trim()); + } + } + } + + // Try line-by-line extraction as last resort + if (extractedItems.size === 0) { + const lines = text.split('\n') + .map(line => line.trim()) + .filter(line => + line.length >= minLength && + !line.startsWith('```') && + !line.match(/^\d+\.?\s*$/) && // Skip numbered list markers alone + !line.match(/^\[|\]$/) // Skip lines that are just brackets + ); + + for (const line of lines) { + // Remove common formatting + const cleaned = line + .replace(/^\d+\.?\s*/, '') // Remove numbered list markers + .replace(/^[-*•]\s*/, '') // Remove bullet list markers + .replace(/^["']|["']$/g, '') // Remove surrounding quotes + .trim(); + + if (cleaned.length >= minLength) { + extractedItems.add(cleaned); + } + } + } + + return Array.from(extractedItems); + } +} + +export default JsonExtractor;