do a better job at centralizing json extraction, and query "enhancer" search queries

This commit is contained in:
perf3ct 2025-04-01 21:42:09 +00:00
parent 5b3dca88d9
commit ed52d71729
No known key found for this signature in database
GPG Key ID: 569C4EEC436F5232
6 changed files with 507 additions and 124 deletions

View File

@ -23,6 +23,9 @@ export interface ChatCompletionOptions {
presencePenalty?: number;
showThinking?: boolean;
systemPrompt?: string;
preserveSystemPrompt?: boolean; // Whether to preserve existing system message
bypassFormatter?: boolean; // Whether to bypass the message formatter entirely
expectsJsonResponse?: boolean; // Whether this request expects a JSON response
stream?: boolean; // Whether to stream the response
}

View File

@ -4,6 +4,7 @@ import type { Message } from '../../ai_interface.js';
import { CONTEXT_PROMPTS } from '../../constants/llm_prompt_constants.js';
import type { LLMServiceInterface } from '../../interfaces/agent_tool_interfaces.js';
import type { IQueryEnhancer } from '../../interfaces/context_interfaces.js';
import JsonExtractor from '../../utils/json_extractor.js';
/**
* Provides utilities for enhancing queries and generating search queries
@ -12,6 +13,15 @@ export class QueryEnhancer implements IQueryEnhancer {
// Use the centralized query enhancer prompt
private metaPrompt = CONTEXT_PROMPTS.QUERY_ENHANCER;
/**
* Get enhanced prompt with JSON formatting instructions
*/
private getEnhancedPrompt(): string {
return `${this.metaPrompt}
IMPORTANT: You must respond with valid JSON arrays. Always include commas between array elements.
Format your answer as a valid JSON array without markdown code blocks, like this: ["item1", "item2", "item3"]`;
}
/**
* Generate search queries to find relevant information for the user question
*
@ -32,95 +42,38 @@ export class QueryEnhancer implements IQueryEnhancer {
}
const messages: Message[] = [
{ role: "system", content: this.metaPrompt },
{ role: "system", content: this.getEnhancedPrompt() },
{ role: "user", content: userQuestion }
];
const options = {
temperature: 0.3,
maxTokens: 300
maxTokens: 300,
bypassFormatter: true, // Completely bypass formatter for query enhancement
expectsJsonResponse: true // Explicitly request JSON-formatted response
};
// Get the response from the LLM
const response = await llmService.generateChatCompletion(messages, options);
const responseText = response.text; // Extract the text from the response object
try {
// Remove code blocks, quotes, and clean up the response text
let jsonStr = responseText
.replace(/```(?:json)?|```/g, '') // Remove code block markers
.replace(/[\u201C\u201D]/g, '"') // Replace smart quotes with straight quotes
.trim();
// Use the JsonExtractor to parse the response
const queries = JsonExtractor.extract<string[]>(responseText, {
extractArrays: true,
minStringLength: 3,
applyFixes: true,
useFallbacks: true
});
// Check if the text might contain a JSON array (has square brackets)
if (jsonStr.includes('[') && jsonStr.includes(']')) {
// Extract just the array part if there's explanatory text
const arrayMatch = jsonStr.match(/\[[\s\S]*\]/);
if (arrayMatch) {
jsonStr = arrayMatch[0];
}
// Try to parse the JSON
try {
const queries = JSON.parse(jsonStr);
if (Array.isArray(queries) && queries.length > 0) {
const result = queries.map(q => typeof q === 'string' ? q : String(q)).filter(Boolean);
cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, result);
return result;
}
} catch (innerError) {
// If parsing fails, log it and continue to the fallback
log.info(`JSON parse error: ${innerError}. Will use fallback parsing for: ${jsonStr}`);
}
}
// Fallback 1: Try to extract an array manually by splitting on commas between quotes
if (jsonStr.includes('[') && jsonStr.includes(']')) {
const arrayContent = jsonStr.substring(
jsonStr.indexOf('[') + 1,
jsonStr.lastIndexOf(']')
);
// Use regex to match quoted strings, handling escaped quotes
const stringMatches = arrayContent.match(/"((?:\\.|[^"\\])*)"/g);
if (stringMatches && stringMatches.length > 0) {
const result = stringMatches
.map((m: string) => m.substring(1, m.length - 1)) // Remove surrounding quotes
.filter((s: string) => s.length > 0);
cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, result);
return result;
}
}
// Fallback 2: Extract queries line by line
const lines = responseText.split('\n')
.map((line: string) => line.trim())
.filter((line: string) =>
line.length > 0 &&
!line.startsWith('```') &&
!line.match(/^\d+\.?\s*$/) && // Skip numbered list markers alone
!line.match(/^\[|\]$/) // Skip lines that are just brackets
);
if (lines.length > 0) {
// Remove numbering, quotes and other list markers from each line
const result = lines.map((line: string) => {
return line
.replace(/^\d+\.?\s*/, '') // Remove numbered list markers (1., 2., etc)
.replace(/^[-*•]\s*/, '') // Remove bullet list markers
.replace(/^["']|["']$/g, '') // Remove surrounding quotes
.trim();
}).filter((s: string) => s.length > 0);
cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, result);
return result;
}
} catch (parseError) {
log.error(`Error parsing search queries: ${parseError}`);
if (queries && queries.length > 0) {
log.info(`Extracted ${queries.length} queries using JsonExtractor`);
cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, queries);
return queries;
}
// If all else fails, just use the original question
const fallback = [userQuestion];
log.info(`No queries extracted, using fallback: "${userQuestion}"`);
cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, fallback);
return fallback;
} catch (error: unknown) {

View File

@ -23,22 +23,30 @@ export class OllamaMessageFormatter extends BaseMessageFormatter {
/**
* Format messages for the Ollama API
* @param messages Messages to format
* @param systemPrompt Optional system prompt to use
* @param context Optional context to include
* @param preserveSystemPrompt When true, preserves existing system messages rather than replacing them
*/
formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[] {
formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[] {
const formattedMessages: Message[] = [];
// First identify user and system messages
const systemMessages = messages.filter(msg => msg.role === 'system');
const userMessages = messages.filter(msg => msg.role === 'user' || msg.role === 'assistant');
// Create base system message with instructions or use default
const basePrompt = systemPrompt || PROVIDER_PROMPTS.COMMON.DEFAULT_ASSISTANT_INTRO;
// Always add a system message with the base prompt
formattedMessages.push({
role: 'system',
content: basePrompt
});
// Determine if we should preserve the existing system message
if (preserveSystemPrompt && systemMessages.length > 0) {
// Preserve the existing system message
formattedMessages.push(systemMessages[0]);
} else {
// Use provided systemPrompt or default
const basePrompt = systemPrompt || PROVIDER_PROMPTS.COMMON.DEFAULT_ASSISTANT_INTRO;
formattedMessages.push({
role: 'system',
content: basePrompt
});
}
// If we have context, inject it into the first user message
if (context && userMessages.length > 0) {

View File

@ -9,9 +9,10 @@ export interface MessageFormatter {
* @param messages Original messages
* @param systemPrompt Optional system prompt to override
* @param context Optional context to include
* @param preserveSystemPrompt Optional flag to preserve existing system prompt
* @returns Formatted messages optimized for the specific provider
*/
formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[];
formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[];
}
/**
@ -22,15 +23,15 @@ export abstract class BaseMessageFormatter implements MessageFormatter {
* Format messages with system prompt and context
* Each provider should override this method with their specific formatting strategy
*/
abstract formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[];
abstract formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[];
/**
* Helper method to extract existing system message from messages
*/
protected getSystemMessage(messages: Message[]): Message | undefined {
return messages.find(msg => msg.role === 'system');
}
/**
* Helper method to create a copy of messages without system message
*/
@ -44,22 +45,26 @@ export abstract class BaseMessageFormatter implements MessageFormatter {
* Optimizes message format for OpenAI models (GPT-3.5, GPT-4, etc.)
*/
export class OpenAIMessageFormatter extends BaseMessageFormatter {
formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[] {
formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[] {
const formattedMessages: Message[] = [];
// OpenAI performs best with system message first, then context as a separate system message
// or appended to the original system message
// Handle system message
const existingSystem = this.getSystemMessage(messages);
if (systemPrompt || existingSystem) {
if (preserveSystemPrompt && existingSystem) {
// Use the existing system message
formattedMessages.push(existingSystem);
} else if (systemPrompt || existingSystem) {
const systemContent = systemPrompt || existingSystem?.content || '';
formattedMessages.push({
role: 'system',
content: systemContent
});
}
// Add context as a system message with clear instruction
if (context) {
formattedMessages.push({
@ -67,10 +72,10 @@ export class OpenAIMessageFormatter extends BaseMessageFormatter {
content: `Please use the following context to respond to the user's messages:\n\n${context}`
});
}
// Add remaining messages (excluding system)
formattedMessages.push(...this.getMessagesWithoutSystem(messages));
return formattedMessages;
}
}
@ -80,24 +85,26 @@ export class OpenAIMessageFormatter extends BaseMessageFormatter {
* Optimizes message format for Claude models
*/
export class AnthropicMessageFormatter extends BaseMessageFormatter {
formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[] {
formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[] {
const formattedMessages: Message[] = [];
// Anthropic performs best with a specific XML-like format for context and system instructions
// Create system message with combined prompt and context if any
let systemContent = '';
const existingSystem = this.getSystemMessage(messages);
if (systemPrompt || existingSystem) {
if (preserveSystemPrompt && existingSystem) {
systemContent = existingSystem.content;
} else if (systemPrompt || existingSystem) {
systemContent = systemPrompt || existingSystem?.content || '';
}
// For Claude, wrap context in XML tags for clear separation
if (context) {
systemContent += `\n\n<context>\n${context}\n</context>`;
}
// Add system message if we have content
if (systemContent) {
formattedMessages.push({
@ -105,10 +112,10 @@ export class AnthropicMessageFormatter extends BaseMessageFormatter {
content: systemContent
});
}
// Add remaining messages (excluding system)
formattedMessages.push(...this.getMessagesWithoutSystem(messages));
return formattedMessages;
}
}
@ -118,25 +125,25 @@ export class AnthropicMessageFormatter extends BaseMessageFormatter {
* Optimizes message format for open-source models
*/
export class OllamaMessageFormatter extends BaseMessageFormatter {
formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[] {
formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[] {
const formattedMessages: Message[] = [];
// Ollama format is closer to raw prompting and typically works better with
// context embedded in system prompt rather than as separate messages
// Build comprehensive system prompt
let systemContent = '';
const existingSystem = this.getSystemMessage(messages);
if (systemPrompt || existingSystem) {
systemContent = systemPrompt || existingSystem?.content || '';
}
// Add context to system prompt
if (context) {
systemContent += `\n\nReference information:\n${context}`;
}
// Add system message if we have content
if (systemContent) {
formattedMessages.push({
@ -144,10 +151,10 @@ export class OllamaMessageFormatter extends BaseMessageFormatter {
content: systemContent
});
}
// Add remaining messages (excluding system)
formattedMessages.push(...this.getMessagesWithoutSystem(messages));
return formattedMessages;
}
}
@ -156,19 +163,22 @@ export class OllamaMessageFormatter extends BaseMessageFormatter {
* Default message formatter when provider is unknown
*/
export class DefaultMessageFormatter extends BaseMessageFormatter {
formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[] {
formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[] {
const formattedMessages: Message[] = [];
// Handle system message
const existingSystem = this.getSystemMessage(messages);
if (systemPrompt || existingSystem) {
if (preserveSystemPrompt && existingSystem) {
formattedMessages.push(existingSystem);
} else if (systemPrompt || existingSystem) {
const systemContent = systemPrompt || existingSystem?.content || '';
formattedMessages.push({
role: 'system',
content: systemContent
});
}
// Add context as a user message
if (context) {
formattedMessages.push({
@ -176,10 +186,10 @@ export class DefaultMessageFormatter extends BaseMessageFormatter {
content: `Here is context to help you answer my questions: ${context}`
});
}
// Add user/assistant messages
formattedMessages.push(...this.getMessagesWithoutSystem(messages));
return formattedMessages;
}
}
@ -194,7 +204,7 @@ export class MessageFormatterFactory {
ollama: new OllamaMessageFormatter(),
default: new DefaultMessageFormatter()
};
/**
* Get the appropriate formatter for a provider
* @param provider Provider name
@ -203,7 +213,7 @@ export class MessageFormatterFactory {
static getFormatter(provider: string): MessageFormatter {
return this.formatters[provider] || this.formatters.default;
}
/**
* Register a custom formatter for a provider
* @param provider Provider name
@ -212,4 +222,4 @@ export class MessageFormatterFactory {
static registerFormatter(provider: string, formatter: MessageFormatter): void {
this.formatters[provider] = formatter;
}
}
}

View File

@ -48,10 +48,30 @@ export class OllamaService extends BaseAIService {
const systemPrompt = this.getSystemPrompt(opts.systemPrompt || options.getOption('aiSystemPrompt'));
try {
// Use the formatter to prepare messages
const formattedMessages = this.formatter.formatMessages(messages, systemPrompt);
// Determine whether to use the formatter or send messages directly
let messagesToSend: Message[];
console.log(`Sending to Ollama with formatted messages:`, JSON.stringify(formattedMessages, null, 2));
if (opts.bypassFormatter) {
// Bypass the formatter entirely - use messages as is
messagesToSend = [...messages];
console.log(`Bypassing formatter for Ollama request with ${messages.length} messages`);
} else {
// Use the formatter to prepare messages
messagesToSend = this.formatter.formatMessages(
messages,
systemPrompt,
undefined, // context
opts.preserveSystemPrompt
);
console.log(`Sending to Ollama with formatted messages:`, JSON.stringify(messagesToSend, null, 2));
}
// Check if this is a request that expects JSON response
const expectsJsonResponse = opts.expectsJsonResponse || false;
if (expectsJsonResponse) {
console.log(`Request expects JSON response, adding response_format parameter`);
}
const response = await fetch(`${apiBase}/api/chat`, {
method: 'POST',
@ -60,9 +80,11 @@ export class OllamaService extends BaseAIService {
},
body: JSON.stringify({
model,
messages: formattedMessages,
messages: messagesToSend,
options: {
temperature
temperature,
// Add response_format for requests that expect JSON
...(expectsJsonResponse ? { response_format: { type: "json_object" } } : {})
},
stream: false
})

View File

@ -0,0 +1,387 @@
import log from '../../log.js';
/**
 * Options controlling how JsonExtractor.extract() searches an LLM response for JSON.
 * Every field is optional; extract() fills in the defaults noted below.
 */
export interface JsonExtractionOptions {
/**
 * Treat a list of strings as the primary target (used by the query enhancer).
 * When true, extract() looks for an array before anything else and may fall back
 * to list-style text extraction. Defaults to false.
 */
extractArrays?: boolean;
/** Minimum length an extracted string must have to be kept. Defaults to 3. */
minStringLength?: number;
/** Whether heuristic repairs may be applied to malformed JSON. Defaults to true. */
applyFixes?: boolean;
/** Whether to use fallback extraction methods when JSON parsing fails. Defaults to true. */
useFallbacks?: boolean;
}
/**
 * Structure of a tool call extracted from an LLM response
 * (the element type returned by JsonExtractor.extractToolCalls()).
 */
export interface ExtractedToolCall {
/** The name of the tool to call (taken from the "tool_name" key of the response JSON) */
tool_name: string;
/** Parameters for the tool call (taken from the "parameters" key of the response JSON) */
parameters: Record<string, any>;
/**
 * The JSON text the call was taken from. For calls recovered by the regex
 * fallback this is a reconstructed string, not a verbatim slice of the response.
 */
originalJson?: string;
}
/**
 * Utility class for extracting and parsing JSON from LLM responses.
 *
 * LLM output is frequently wrapped in markdown fences, surrounded by prose, or
 * slightly malformed (trailing commas, missing commas, stray quotes). The
 * extractor always tries the text as-is first and only then applies heuristic
 * repairs, so well-formed JSON is never altered by the repair step.
 */
export class JsonExtractor {
    /** Minimum string length used when the caller does not supply `minStringLength`. */
    private static readonly DEFAULT_MIN_STRING_LENGTH = 3;

    /**
     * Heuristic repair passes, ordered from least to most invasive.
     * They are applied cumulatively and the text is re-parsed after each pass, so
     * the destructive quote rewrites at the end only run when nothing gentler worked.
     */
    private static readonly JSON_FIX_STAGES: ReadonlyArray<(json: string) => string> = [
        // Raw line breaks are not allowed inside JSON strings, and trailing commas
        // are not allowed before a closing bracket or brace.
        (json) => json
            .replace(/\r?\n/g, ' ')
            .replace(/,\s*\]/g, ']')
            .replace(/,\s*\}/g, '}'),
        // Missing commas between adjacent quoted strings separated only by whitespace.
        (json) => json
            .replace(/"([^"]*)"(?:\s+)"([^"]*)"/g, '"$1", "$2"')
            .replace(/"([^"]*)"\s+"/g, '"$1", "'),
        // Unescaped double quotes inside a string: demote the inner pair to single quotes.
        (json) => json.replace(/"([^"]*)"([^"]*)"([^"]*)"/g, '"$1\'$2\'$3"'),
        // A string that was never closed before the comma separating it from the next element.
        (json) => json.replace(/"([^"]*),\s*(?="|])/g, '"$1", ')
    ];

    /**
     * Extract JSON from an LLM response.
     *
     * Order of attempts: array extraction (only when `extractArrays` is set),
     * plain `JSON.parse`, staged repairs (when `applyFixes` is set), then the
     * text-based fallbacks (when `useFallbacks` is set).
     *
     * @param text - The raw text from an LLM response
     * @param options - Options to control extraction behavior
     * @returns The parsed JSON value, or null if nothing could be extracted
     */
    static extract<T = any>(text: string, options: JsonExtractionOptions = {}): T | null {
        const opts = {
            extractArrays: false,
            minStringLength: JsonExtractor.DEFAULT_MIN_STRING_LENGTH,
            applyFixes: true,
            useFallbacks: true,
            ...options
        };

        // Nothing to extract from a missing or blank response.
        if (typeof text !== 'string' || text.trim().length === 0) {
            return null;
        }

        try {
            const cleanedText = JsonExtractor.cleanMarkdownAndFormatting(text);

            // Array-first mode: look for a list of strings before anything else.
            if (opts.extractArrays) {
                const arrayResult = JsonExtractor.extractArray(cleanedText, opts);
                if (arrayResult) {
                    return arrayResult as unknown as T;
                }
            }

            // Parse the whole text, unmodified first and repaired only if that fails.
            const parsed = JsonExtractor.parseLenient(cleanedText, opts.applyFixes);
            if (parsed) {
                return parsed.value as T;
            }

            if (opts.useFallbacks) {
                if (opts.extractArrays) {
                    // The raw text is used here so list markers and line structure are intact.
                    const items = JsonExtractor.extractItemsAsFallback(text, opts.minStringLength);
                    if (items.length > 0) {
                        return items as unknown as T;
                    }
                }

                // If it looks like a JSON object embedded in other text, try to isolate it.
                if (cleanedText.includes('{') && cleanedText.includes('}')) {
                    const objectResult = JsonExtractor.extractObject(cleanedText);
                    if (objectResult) {
                        return objectResult as T;
                    }
                }
            }

            return null;
        } catch (error: unknown) {
            log.error(`JSON extraction error: ${error}`);
            return null;
        }
    }

    /**
     * Extract tool calls from an LLM response.
     * A tool call is a JSON object with a non-empty string `tool_name` and an
     * object `parameters`.
     *
     * @param text - Raw text from the LLM response
     * @returns Array of tool calls or empty array if none found
     */
    static extractToolCalls(text: string): ExtractedToolCall[] {
        const toolCalls: ExtractedToolCall[] = [];

        try {
            const cleanedText = JsonExtractor.cleanMarkdownAndFormatting(text);

            // First approach: every balanced {...} span that parses into a tool-call shape.
            for (const jsonString of JsonExtractor.findJsonObjects(cleanedText)) {
                const parsed = JsonExtractor.parseLenient(jsonString);
                if (!parsed) {
                    // Not fatal: the regex approach below may still recover a call.
                    log.info(`Failed to parse potential tool call JSON (${jsonString.length} chars)`);
                    continue;
                }

                if (JsonExtractor.isToolCallShape(parsed.value)) {
                    toolCalls.push({
                        tool_name: parsed.value.tool_name,
                        parameters: parsed.value.parameters,
                        originalJson: jsonString
                    });
                }
            }

            // Second approach: pick the two fields out of the raw text with regexes.
            if (toolCalls.length === 0) {
                const toolNameMatch = text.match(/["']?tool_name["']?\s*:\s*["']([^"']+)["']/);
                const parametersMatch = text.match(/["']?parameters["']?\s*:\s*({[^}]+})/);

                if (toolNameMatch && parametersMatch) {
                    const toolName = toolNameMatch[1];
                    const parsedParameters = JsonExtractor.parseLenient(parametersMatch[1]);

                    if (parsedParameters) {
                        toolCalls.push({
                            tool_name: toolName,
                            parameters: parsedParameters.value as Record<string, any>,
                            // Reconstructed, since the fields were found separately.
                            originalJson: `{"tool_name":"${toolName}","parameters":${parsedParameters.json}}`
                        });
                    } else {
                        log.info(`Failed to parse tool call with regex approach`);
                    }
                }
            }
        } catch (error: unknown) {
            log.error(`Error extracting tool calls: ${error}`);
        }

        return toolCalls;
    }

    /**
     * Type guard for the tool-call shape: a non-empty string `tool_name`
     * and a non-null object `parameters`.
     */
    private static isToolCallShape(
        candidate: unknown
    ): candidate is { tool_name: string; parameters: Record<string, any> } {
        if (!candidate || typeof candidate !== 'object') {
            return false;
        }

        const record = candidate as Record<string, unknown>;
        return typeof record.tool_name === 'string'
            && record.tool_name.length > 0
            && !!record.parameters
            && typeof record.parameters === 'object';
    }

    /**
     * Parse JSON text, trying it unmodified first and then after each repair stage.
     *
     * @param text - Candidate JSON text
     * @param applyFixes - When false, only the unmodified text is tried
     * @returns The parsed value and the exact text that parsed, or null if every attempt failed
     */
    private static parseLenient(text: string, applyFixes = true): { value: unknown; json: string } | null {
        const attempt = (json: string): { value: unknown; json: string } | null => {
            try {
                return { value: JSON.parse(json) as unknown, json };
            } catch {
                return null;
            }
        };

        const direct = attempt(text);
        if (direct || !applyFixes) {
            return direct;
        }

        let candidate = text;
        for (const fix of JsonExtractor.JSON_FIX_STAGES) {
            const repaired = fix(candidate);
            if (repaired === candidate) {
                continue; // this stage changed nothing, so re-parsing would fail the same way
            }

            candidate = repaired;
            const result = attempt(candidate);
            if (result) {
                return result;
            }
        }

        return null;
    }

    /**
     * Find all top-level balanced `{...}` spans in a text.
     * Braces inside double-quoted strings are ignored. If that yields nothing
     * (e.g. because a quote is unbalanced), the text is rescanned counting every brace.
     */
    private static findJsonObjects(text: string): string[] {
        const stringAware = JsonExtractor.scanJsonObjects(text, true);
        return stringAware.length > 0 ? stringAware : JsonExtractor.scanJsonObjects(text, false);
    }

    /**
     * Single pass brace matcher behind findJsonObjects().
     *
     * @param text - Text to scan
     * @param respectStrings - When true, braces inside double-quoted strings are not counted
     */
    private static scanJsonObjects(text: string, respectStrings: boolean): string[] {
        const found: string[] = [];
        let depth = 0;
        let start = -1;
        let inString = false;
        let escaped = false;

        for (let i = 0; i < text.length; i++) {
            const char = text[i];

            if (inString) {
                if (escaped) {
                    escaped = false;
                } else if (char === '\\') {
                    escaped = true;
                } else if (char === '"') {
                    inString = false;
                }
                continue;
            }

            // Strings are only tracked inside an object; quotes in surrounding prose are irrelevant.
            if (respectStrings && depth > 0 && char === '"') {
                inString = true;
            } else if (char === '{') {
                if (depth === 0) {
                    start = i;
                }
                depth++;
            } else if (char === '}' && depth > 0) {
                // An unmatched '}' (depth 0) is skipped so it cannot unbalance later objects.
                depth--;
                if (depth === 0) {
                    found.push(text.slice(start, i + 1));
                    start = -1;
                }
            }
        }

        return found;
    }

    /**
     * Remove markdown code fence markers and normalize smart double quotes.
     */
    private static cleanMarkdownAndFormatting(text: string): string {
        return text
            .replace(/```(?:json)?/g, '') // Remove code block markers (with or without a "json" tag)
            .replace(/[\u201C\u201D]/g, '"') // Replace smart quotes with straight quotes
            .trim();
    }

    /**
     * Extract a list of strings from text that contains a JSON-like array.
     *
     * @param text - Cleaned response text
     * @param options - Only `minStringLength` is read; shorter items are dropped
     * @returns The extracted items, or null when no non-empty list was found
     */
    private static extractArray(text: string, options: JsonExtractionOptions): string[] | null {
        // `??` rather than `||` so an explicit minimum of 0 is honoured.
        const minLength = options.minStringLength ?? JsonExtractor.DEFAULT_MIN_STRING_LENGTH;

        // First attempt: an array made only of quoted strings; commas between them are optional.
        const firstArray = /\[((?:"(?:\\.|[^"\\])*"(?:\s*,\s*)?)+)\]/.exec(text);
        if (firstArray) {
            const items = [...firstArray[1].matchAll(/"((?:\\.|[^"\\])*)"/g)]
                .map(match => match[1].trim())
                .filter(item => item.length >= minLength);

            if (items.length > 0) {
                log.info(`Successfully extracted ${items.length} items using regex pattern`);
                return items;
            }
        }

        // Second attempt: parse everything from the first '[' to the last ']'.
        const bracketed = text.match(/\[[\s\S]*\]/);
        if (bracketed) {
            const parsed = JsonExtractor.parseLenient(bracketed[0]);
            if (parsed && Array.isArray(parsed.value) && parsed.value.length > 0) {
                const items = parsed.value
                    .map(item => JsonExtractor.stringifyItem(item))
                    .filter(item => item.length >= minLength);

                if (items.length > 0) {
                    log.info(`Successfully parsed JSON array with ${items.length} items`);
                    return items;
                }
            }
        }

        return null;
    }

    /**
     * Convert an array element to a string. Objects and arrays are serialized
     * as JSON instead of collapsing to "[object Object]".
     */
    private static stringifyItem(item: unknown): string {
        if (typeof item === 'string') {
            return item;
        }
        if (item !== null && typeof item === 'object') {
            return JSON.stringify(item);
        }
        return String(item);
    }

    /**
     * Extract a JSON object spanning from the first '{' to the last '}' in the text.
     *
     * @returns The parsed object, or null if the span could not be parsed
     */
    private static extractObject(text: string): Record<string, any> | null {
        const objectMatch = text.match(/\{[\s\S]*\}/);
        if (!objectMatch) {
            return null;
        }

        const parsed = JsonExtractor.parseLenient(objectMatch[0]);
        return parsed ? (parsed.value as Record<string, any>) : null;
    }

    /**
     * Extract list items from non-JSON text (quoted lines, numbered or bulleted lists).
     *
     * @param text - Raw response text
     * @param minLength - Items shorter than this (after trimming) are dropped
     * @returns Unique items in order of discovery; empty when nothing matched
     */
    private static extractItemsAsFallback(
        text: string,
        minLength: number = JsonExtractor.DEFAULT_MIN_STRING_LENGTH
    ): string[] {
        // Multiline anchors keep each match on its own line without consuming the
        // line break, so consecutive list items are all seen.
        const patterns = [
            /^["'](.+?)["'](?:,|$)/gm, // Quoted strings
            /^\[["'](.+?)["']\](?:,|$)/gm, // Single item arrays
            /^[ \t]*\d+\.[ \t]*(.+?)$/gm, // Numbered list items (number not captured)
            /^[ \t]*[-*•][ \t]*(.+?)$/gm // Bullet list items
        ];

        const extractedItems = new Set<string>();

        for (const pattern of patterns) {
            for (const match of text.matchAll(pattern)) {
                const item = match[1]?.trim();
                if (item && item.length >= minLength) {
                    extractedItems.add(item);
                }
            }
        }

        // Last resort: treat every remaining line as one item.
        if (extractedItems.size === 0) {
            for (const rawLine of text.split('\n')) {
                const line = rawLine.trim();
                const skip =
                    line.length < minLength ||
                    line.startsWith('```') ||
                    /^\d+\.?\s*$/.test(line) || // Numbered list marker with no content
                    /^\[|\]$/.test(line); // Line that starts with '[' or ends with ']'
                if (skip) {
                    continue;
                }

                const cleaned = line
                    .replace(/^\d+\.?\s*/, '') // Remove numbered list markers
                    .replace(/^[-*•]\s*/, '') // Remove bullet list markers
                    .replace(/^["']|["']$/g, '') // Remove surrounding quotes
                    .trim();

                if (cleaned.length >= minLength) {
                    extractedItems.add(cleaned);
                }
            }
        }

        return Array.from(extractedItems);
    }
}
export default JsonExtractor;