mirror of
https://github.com/TriliumNext/Notes.git
synced 2025-11-12 21:31:36 +08:00
Do a better job of centralizing JSON extraction, and of parsing the query "enhancer" search queries
This commit is contained in:
parent
5b3dca88d9
commit
ed52d71729
@ -23,6 +23,9 @@ export interface ChatCompletionOptions {
|
|||||||
presencePenalty?: number;
|
presencePenalty?: number;
|
||||||
showThinking?: boolean;
|
showThinking?: boolean;
|
||||||
systemPrompt?: string;
|
systemPrompt?: string;
|
||||||
|
preserveSystemPrompt?: boolean; // Whether to preserve existing system message
|
||||||
|
bypassFormatter?: boolean; // Whether to bypass the message formatter entirely
|
||||||
|
expectsJsonResponse?: boolean; // Whether this request expects a JSON response
|
||||||
stream?: boolean; // Whether to stream the response
|
stream?: boolean; // Whether to stream the response
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -4,6 +4,7 @@ import type { Message } from '../../ai_interface.js';
|
|||||||
import { CONTEXT_PROMPTS } from '../../constants/llm_prompt_constants.js';
|
import { CONTEXT_PROMPTS } from '../../constants/llm_prompt_constants.js';
|
||||||
import type { LLMServiceInterface } from '../../interfaces/agent_tool_interfaces.js';
|
import type { LLMServiceInterface } from '../../interfaces/agent_tool_interfaces.js';
|
||||||
import type { IQueryEnhancer } from '../../interfaces/context_interfaces.js';
|
import type { IQueryEnhancer } from '../../interfaces/context_interfaces.js';
|
||||||
|
import JsonExtractor from '../../utils/json_extractor.js';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provides utilities for enhancing queries and generating search queries
|
* Provides utilities for enhancing queries and generating search queries
|
||||||
@ -12,6 +13,15 @@ export class QueryEnhancer implements IQueryEnhancer {
|
|||||||
// Use the centralized query enhancer prompt
|
// Use the centralized query enhancer prompt
|
||||||
private metaPrompt = CONTEXT_PROMPTS.QUERY_ENHANCER;
|
private metaPrompt = CONTEXT_PROMPTS.QUERY_ENHANCER;
|
||||||
|
|
||||||
|
/**
 * Build the system prompt used for query enhancement.
 *
 * The result is the shared meta prompt followed by two extra lines that tell the
 * model to answer with a plain JSON array (no markdown code fences).
 */
private getEnhancedPrompt(): string {
    const jsonArrayRules = `IMPORTANT: You must respond with valid JSON arrays. Always include commas between array elements.
Format your answer as a valid JSON array without markdown code blocks, like this: ["item1", "item2", "item3"]`;

    return `${this.metaPrompt}
${jsonArrayRules}`;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate search queries to find relevant information for the user question
|
* Generate search queries to find relevant information for the user question
|
||||||
*
|
*
|
||||||
@ -32,95 +42,38 @@ export class QueryEnhancer implements IQueryEnhancer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const messages: Message[] = [
|
const messages: Message[] = [
|
||||||
{ role: "system", content: this.metaPrompt },
|
{ role: "system", content: this.getEnhancedPrompt() },
|
||||||
{ role: "user", content: userQuestion }
|
{ role: "user", content: userQuestion }
|
||||||
];
|
];
|
||||||
|
|
||||||
const options = {
|
const options = {
|
||||||
temperature: 0.3,
|
temperature: 0.3,
|
||||||
maxTokens: 300
|
maxTokens: 300,
|
||||||
|
bypassFormatter: true, // Completely bypass formatter for query enhancement
|
||||||
|
expectsJsonResponse: true // Explicitly request JSON-formatted response
|
||||||
};
|
};
|
||||||
|
|
||||||
// Get the response from the LLM
|
// Get the response from the LLM
|
||||||
const response = await llmService.generateChatCompletion(messages, options);
|
const response = await llmService.generateChatCompletion(messages, options);
|
||||||
const responseText = response.text; // Extract the text from the response object
|
const responseText = response.text; // Extract the text from the response object
|
||||||
|
|
||||||
try {
|
// Use the JsonExtractor to parse the response
|
||||||
// Remove code blocks, quotes, and clean up the response text
|
const queries = JsonExtractor.extract<string[]>(responseText, {
|
||||||
let jsonStr = responseText
|
extractArrays: true,
|
||||||
.replace(/```(?:json)?|```/g, '') // Remove code block markers
|
minStringLength: 3,
|
||||||
.replace(/[\u201C\u201D]/g, '"') // Replace smart quotes with straight quotes
|
applyFixes: true,
|
||||||
.trim();
|
useFallbacks: true
|
||||||
|
});
|
||||||
|
|
||||||
// Check if the text might contain a JSON array (has square brackets)
|
if (queries && queries.length > 0) {
|
||||||
if (jsonStr.includes('[') && jsonStr.includes(']')) {
|
log.info(`Extracted ${queries.length} queries using JsonExtractor`);
|
||||||
// Extract just the array part if there's explanatory text
|
cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, queries);
|
||||||
const arrayMatch = jsonStr.match(/\[[\s\S]*\]/);
|
return queries;
|
||||||
if (arrayMatch) {
|
|
||||||
jsonStr = arrayMatch[0];
|
|
||||||
}
|
|
||||||
|
|
||||||
// Try to parse the JSON
|
|
||||||
try {
|
|
||||||
const queries = JSON.parse(jsonStr);
|
|
||||||
if (Array.isArray(queries) && queries.length > 0) {
|
|
||||||
const result = queries.map(q => typeof q === 'string' ? q : String(q)).filter(Boolean);
|
|
||||||
cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, result);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
} catch (innerError) {
|
|
||||||
// If parsing fails, log it and continue to the fallback
|
|
||||||
log.info(`JSON parse error: ${innerError}. Will use fallback parsing for: ${jsonStr}`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fallback 1: Try to extract an array manually by splitting on commas between quotes
|
|
||||||
if (jsonStr.includes('[') && jsonStr.includes(']')) {
|
|
||||||
const arrayContent = jsonStr.substring(
|
|
||||||
jsonStr.indexOf('[') + 1,
|
|
||||||
jsonStr.lastIndexOf(']')
|
|
||||||
);
|
|
||||||
|
|
||||||
// Use regex to match quoted strings, handling escaped quotes
|
|
||||||
const stringMatches = arrayContent.match(/"((?:\\.|[^"\\])*)"/g);
|
|
||||||
if (stringMatches && stringMatches.length > 0) {
|
|
||||||
const result = stringMatches
|
|
||||||
.map((m: string) => m.substring(1, m.length - 1)) // Remove surrounding quotes
|
|
||||||
.filter((s: string) => s.length > 0);
|
|
||||||
cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, result);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fallback 2: Extract queries line by line
|
|
||||||
const lines = responseText.split('\n')
|
|
||||||
.map((line: string) => line.trim())
|
|
||||||
.filter((line: string) =>
|
|
||||||
line.length > 0 &&
|
|
||||||
!line.startsWith('```') &&
|
|
||||||
!line.match(/^\d+\.?\s*$/) && // Skip numbered list markers alone
|
|
||||||
!line.match(/^\[|\]$/) // Skip lines that are just brackets
|
|
||||||
);
|
|
||||||
|
|
||||||
if (lines.length > 0) {
|
|
||||||
// Remove numbering, quotes and other list markers from each line
|
|
||||||
const result = lines.map((line: string) => {
|
|
||||||
return line
|
|
||||||
.replace(/^\d+\.?\s*/, '') // Remove numbered list markers (1., 2., etc)
|
|
||||||
.replace(/^[-*•]\s*/, '') // Remove bullet list markers
|
|
||||||
.replace(/^["']|["']$/g, '') // Remove surrounding quotes
|
|
||||||
.trim();
|
|
||||||
}).filter((s: string) => s.length > 0);
|
|
||||||
|
|
||||||
cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, result);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
} catch (parseError) {
|
|
||||||
log.error(`Error parsing search queries: ${parseError}`);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// If all else fails, just use the original question
|
// If all else fails, just use the original question
|
||||||
const fallback = [userQuestion];
|
const fallback = [userQuestion];
|
||||||
|
log.info(`No queries extracted, using fallback: "${userQuestion}"`);
|
||||||
cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, fallback);
|
cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, fallback);
|
||||||
return fallback;
|
return fallback;
|
||||||
} catch (error: unknown) {
|
} catch (error: unknown) {
|
||||||
|
|||||||
@ -23,22 +23,30 @@ export class OllamaMessageFormatter extends BaseMessageFormatter {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Format messages for the Ollama API
|
* Format messages for the Ollama API
|
||||||
|
* @param messages Messages to format
|
||||||
|
* @param systemPrompt Optional system prompt to use
|
||||||
|
* @param context Optional context to include
|
||||||
|
* @param preserveSystemPrompt When true, preserves existing system messages rather than replacing them
|
||||||
*/
|
*/
|
||||||
formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[] {
|
formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[] {
|
||||||
const formattedMessages: Message[] = [];
|
const formattedMessages: Message[] = [];
|
||||||
|
|
||||||
// First identify user and system messages
|
// First identify user and system messages
|
||||||
const systemMessages = messages.filter(msg => msg.role === 'system');
|
const systemMessages = messages.filter(msg => msg.role === 'system');
|
||||||
const userMessages = messages.filter(msg => msg.role === 'user' || msg.role === 'assistant');
|
const userMessages = messages.filter(msg => msg.role === 'user' || msg.role === 'assistant');
|
||||||
|
|
||||||
// Create base system message with instructions or use default
|
// Determine if we should preserve the existing system message
|
||||||
const basePrompt = systemPrompt || PROVIDER_PROMPTS.COMMON.DEFAULT_ASSISTANT_INTRO;
|
if (preserveSystemPrompt && systemMessages.length > 0) {
|
||||||
|
// Preserve the existing system message
|
||||||
// Always add a system message with the base prompt
|
formattedMessages.push(systemMessages[0]);
|
||||||
formattedMessages.push({
|
} else {
|
||||||
role: 'system',
|
// Use provided systemPrompt or default
|
||||||
content: basePrompt
|
const basePrompt = systemPrompt || PROVIDER_PROMPTS.COMMON.DEFAULT_ASSISTANT_INTRO;
|
||||||
});
|
formattedMessages.push({
|
||||||
|
role: 'system',
|
||||||
|
content: basePrompt
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
// If we have context, inject it into the first user message
|
// If we have context, inject it into the first user message
|
||||||
if (context && userMessages.length > 0) {
|
if (context && userMessages.length > 0) {
|
||||||
|
|||||||
@ -9,9 +9,10 @@ export interface MessageFormatter {
|
|||||||
* @param messages Original messages
|
* @param messages Original messages
|
||||||
* @param systemPrompt Optional system prompt to override
|
* @param systemPrompt Optional system prompt to override
|
||||||
* @param context Optional context to include
|
* @param context Optional context to include
|
||||||
|
* @param preserveSystemPrompt Optional flag to preserve existing system prompt
|
||||||
* @returns Formatted messages optimized for the specific provider
|
* @returns Formatted messages optimized for the specific provider
|
||||||
*/
|
*/
|
||||||
formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[];
|
formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[];
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -22,7 +23,7 @@ export abstract class BaseMessageFormatter implements MessageFormatter {
|
|||||||
* Format messages with system prompt and context
|
* Format messages with system prompt and context
|
||||||
* Each provider should override this method with their specific formatting strategy
|
* Each provider should override this method with their specific formatting strategy
|
||||||
*/
|
*/
|
||||||
abstract formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[];
|
abstract formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Helper method to extract existing system message from messages
|
* Helper method to extract existing system message from messages
|
||||||
@ -44,7 +45,7 @@ export abstract class BaseMessageFormatter implements MessageFormatter {
|
|||||||
* Optimizes message format for OpenAI models (GPT-3.5, GPT-4, etc.)
|
* Optimizes message format for OpenAI models (GPT-3.5, GPT-4, etc.)
|
||||||
*/
|
*/
|
||||||
export class OpenAIMessageFormatter extends BaseMessageFormatter {
|
export class OpenAIMessageFormatter extends BaseMessageFormatter {
|
||||||
formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[] {
|
formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[] {
|
||||||
const formattedMessages: Message[] = [];
|
const formattedMessages: Message[] = [];
|
||||||
|
|
||||||
// OpenAI performs best with system message first, then context as a separate system message
|
// OpenAI performs best with system message first, then context as a separate system message
|
||||||
@ -52,7 +53,11 @@ export class OpenAIMessageFormatter extends BaseMessageFormatter {
|
|||||||
|
|
||||||
// Handle system message
|
// Handle system message
|
||||||
const existingSystem = this.getSystemMessage(messages);
|
const existingSystem = this.getSystemMessage(messages);
|
||||||
if (systemPrompt || existingSystem) {
|
|
||||||
|
if (preserveSystemPrompt && existingSystem) {
|
||||||
|
// Use the existing system message
|
||||||
|
formattedMessages.push(existingSystem);
|
||||||
|
} else if (systemPrompt || existingSystem) {
|
||||||
const systemContent = systemPrompt || existingSystem?.content || '';
|
const systemContent = systemPrompt || existingSystem?.content || '';
|
||||||
formattedMessages.push({
|
formattedMessages.push({
|
||||||
role: 'system',
|
role: 'system',
|
||||||
@ -80,7 +85,7 @@ export class OpenAIMessageFormatter extends BaseMessageFormatter {
|
|||||||
* Optimizes message format for Claude models
|
* Optimizes message format for Claude models
|
||||||
*/
|
*/
|
||||||
export class AnthropicMessageFormatter extends BaseMessageFormatter {
|
export class AnthropicMessageFormatter extends BaseMessageFormatter {
|
||||||
formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[] {
|
formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[] {
|
||||||
const formattedMessages: Message[] = [];
|
const formattedMessages: Message[] = [];
|
||||||
|
|
||||||
// Anthropic performs best with a specific XML-like format for context and system instructions
|
// Anthropic performs best with a specific XML-like format for context and system instructions
|
||||||
@ -89,7 +94,9 @@ export class AnthropicMessageFormatter extends BaseMessageFormatter {
|
|||||||
let systemContent = '';
|
let systemContent = '';
|
||||||
const existingSystem = this.getSystemMessage(messages);
|
const existingSystem = this.getSystemMessage(messages);
|
||||||
|
|
||||||
if (systemPrompt || existingSystem) {
|
if (preserveSystemPrompt && existingSystem) {
|
||||||
|
systemContent = existingSystem.content;
|
||||||
|
} else if (systemPrompt || existingSystem) {
|
||||||
systemContent = systemPrompt || existingSystem?.content || '';
|
systemContent = systemPrompt || existingSystem?.content || '';
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -118,7 +125,7 @@ export class AnthropicMessageFormatter extends BaseMessageFormatter {
|
|||||||
* Optimizes message format for open-source models
|
* Optimizes message format for open-source models
|
||||||
*/
|
*/
|
||||||
export class OllamaMessageFormatter extends BaseMessageFormatter {
|
export class OllamaMessageFormatter extends BaseMessageFormatter {
|
||||||
formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[] {
|
formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[] {
|
||||||
const formattedMessages: Message[] = [];
|
const formattedMessages: Message[] = [];
|
||||||
|
|
||||||
// Ollama format is closer to raw prompting and typically works better with
|
// Ollama format is closer to raw prompting and typically works better with
|
||||||
@ -156,12 +163,15 @@ export class OllamaMessageFormatter extends BaseMessageFormatter {
|
|||||||
* Default message formatter when provider is unknown
|
* Default message formatter when provider is unknown
|
||||||
*/
|
*/
|
||||||
export class DefaultMessageFormatter extends BaseMessageFormatter {
|
export class DefaultMessageFormatter extends BaseMessageFormatter {
|
||||||
formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[] {
|
formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[] {
|
||||||
const formattedMessages: Message[] = [];
|
const formattedMessages: Message[] = [];
|
||||||
|
|
||||||
// Handle system message
|
// Handle system message
|
||||||
const existingSystem = this.getSystemMessage(messages);
|
const existingSystem = this.getSystemMessage(messages);
|
||||||
if (systemPrompt || existingSystem) {
|
|
||||||
|
if (preserveSystemPrompt && existingSystem) {
|
||||||
|
formattedMessages.push(existingSystem);
|
||||||
|
} else if (systemPrompt || existingSystem) {
|
||||||
const systemContent = systemPrompt || existingSystem?.content || '';
|
const systemContent = systemPrompt || existingSystem?.content || '';
|
||||||
formattedMessages.push({
|
formattedMessages.push({
|
||||||
role: 'system',
|
role: 'system',
|
||||||
|
|||||||
@ -48,10 +48,30 @@ export class OllamaService extends BaseAIService {
|
|||||||
const systemPrompt = this.getSystemPrompt(opts.systemPrompt || options.getOption('aiSystemPrompt'));
|
const systemPrompt = this.getSystemPrompt(opts.systemPrompt || options.getOption('aiSystemPrompt'));
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Use the formatter to prepare messages
|
// Determine whether to use the formatter or send messages directly
|
||||||
const formattedMessages = this.formatter.formatMessages(messages, systemPrompt);
|
let messagesToSend: Message[];
|
||||||
|
|
||||||
console.log(`Sending to Ollama with formatted messages:`, JSON.stringify(formattedMessages, null, 2));
|
if (opts.bypassFormatter) {
|
||||||
|
// Bypass the formatter entirely - use messages as is
|
||||||
|
messagesToSend = [...messages];
|
||||||
|
console.log(`Bypassing formatter for Ollama request with ${messages.length} messages`);
|
||||||
|
} else {
|
||||||
|
// Use the formatter to prepare messages
|
||||||
|
messagesToSend = this.formatter.formatMessages(
|
||||||
|
messages,
|
||||||
|
systemPrompt,
|
||||||
|
undefined, // context
|
||||||
|
opts.preserveSystemPrompt
|
||||||
|
);
|
||||||
|
console.log(`Sending to Ollama with formatted messages:`, JSON.stringify(messagesToSend, null, 2));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if this is a request that expects JSON response
|
||||||
|
const expectsJsonResponse = opts.expectsJsonResponse || false;
|
||||||
|
|
||||||
|
if (expectsJsonResponse) {
|
||||||
|
console.log(`Request expects JSON response, adding response_format parameter`);
|
||||||
|
}
|
||||||
|
|
||||||
const response = await fetch(`${apiBase}/api/chat`, {
|
const response = await fetch(`${apiBase}/api/chat`, {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
@ -60,9 +80,11 @@ export class OllamaService extends BaseAIService {
|
|||||||
},
|
},
|
||||||
body: JSON.stringify({
|
body: JSON.stringify({
|
||||||
model,
|
model,
|
||||||
messages: formattedMessages,
|
messages: messagesToSend,
|
||||||
options: {
|
options: {
|
||||||
temperature
|
temperature,
|
||||||
|
// Add response_format for requests that expect JSON
|
||||||
|
...(expectsJsonResponse ? { response_format: { type: "json_object" } } : {})
|
||||||
},
|
},
|
||||||
stream: false
|
stream: false
|
||||||
})
|
})
|
||||||
|
|||||||
387
src/services/llm/utils/json_extractor.ts
Normal file
387
src/services/llm/utils/json_extractor.ts
Normal file
@ -0,0 +1,387 @@
|
|||||||
|
import log from '../../log.js';
|
||||||
|
|
||||||
|
/**
 * Options for JSON extraction
 */
export interface JsonExtractionOptions {
    /** Attempt to find and extract arrays as the primary target (for query enhancers, etc.) */
    extractArrays?: boolean;
    /** Minimum length for extracted strings to be considered valid (defaults to 3; 0 is honoured) */
    minStringLength?: number;
    /** Repair malformed JSON when strict parsing fails (defaults to true) */
    applyFixes?: boolean;
    /** Whether to use fallback extraction methods when JSON parsing fails (defaults to true) */
    useFallbacks?: boolean;
}

/**
 * Structure of a tool call extracted from an LLM response
 */
export interface ExtractedToolCall {
    /** The name of the tool to call */
    tool_name: string;
    /** Parameters for the tool call */
    parameters: Record<string, any>;
    /** The JSON text the tool call was parsed from */
    originalJson?: string;
}

/**
 * Outcome of a successful parse: the decoded value plus the exact text that produced it
 * (which differs from the input when repairs were needed).
 */
interface ParsedJson {
    value: unknown;
    json: string;
}

/**
 * Utility class for extracting and parsing JSON from LLM responses.
 *
 * Parsing is always attempted strictly first. Repairs for malformed JSON are heuristic and
 * can damage well-formed input, so they are only applied after a strict parse has failed,
 * one step at a time, stopping at the first candidate that parses.
 */
export class JsonExtractor {
    /**
     * Repair steps, ordered from least to most invasive. Each step receives the output of the
     * previous one; `parseLenient` re-parses after every step that changed the text.
     */
    private static readonly REPAIR_STEPS: ReadonlyArray<(json: string) => string> = [
        // Raw line breaks are illegal inside JSON strings and insignificant everywhere else
        json => json.replace(/\r?\n/g, ' '),
        // Trailing commas before a closing bracket or brace
        json => json.replace(/,\s*([\]}])/g, '$1'),
        // Missing commas between adjacent string literals: "a" "b" -> "a", "b"
        json => json
            .replace(/"([^"]*)"(?:\s+)"([^"]*)"/g, '"$1", "$2"')
            .replace(/"([^"]*)"\s+"/g, '"$1", "'),
        // Unclosed string before a comma: ["a, "b"] -> ["a", "b"]. The lookbehind anchors the
        // match to a quote that opens an element, so closing quotes are never matched.
        json => json
            .replace(/(?<=[\[,]\s*)"([^",]+),\s*(?="|\])/g, '"$1", ')
            .replace(/,\s*([\]}])/g, '$1'),
        // Unescaped double quotes inside a string: "say "hi" now" -> "say 'hi' now"
        json => JsonExtractor.repairInnerQuotes(json)
    ];

    /**
     * Extract JSON from an LLM response
     *
     * @param text - The raw text from an LLM response
     * @param options - Options to control extraction behavior
     * @returns The parsed JSON value, or null if the input is empty or nothing could be extracted
     */
    static extract<T = any>(text: string, options: JsonExtractionOptions = {}): T | null {
        // `??` rather than `||` / spread so that explicit `undefined` falls back to the default
        // while an explicit 0 or false is respected.
        const extractArrays = options.extractArrays ?? false;
        const minStringLength = options.minStringLength ?? 3;
        const applyFixes = options.applyFixes ?? true;
        const useFallbacks = options.useFallbacks ?? true;

        if (typeof text !== 'string' || text.trim().length === 0) {
            return null;
        }

        try {
            const cleanedText = this.cleanMarkdownAndFormatting(text);

            // Try to extract an array of strings first when that is what the caller wants
            if (extractArrays) {
                const arrayResult = this.extractArray(cleanedText, { minStringLength, applyFixes });
                if (arrayResult) {
                    return arrayResult as unknown as T;
                }
            }

            // Parse the whole text: strictly, then (optionally) with progressive repairs
            const parsed = applyFixes
                ? this.parseLenient(cleanedText)
                : this.parseStrict(cleanedText);
            if (parsed && parsed.value !== null) {
                return parsed.value as T;
            }

            if (useFallbacks) {
                if (extractArrays) {
                    // The raw text is used here so that list formatting is still intact
                    const items = this.extractItemsAsFallback(text, minStringLength);
                    if (items.length > 0) {
                        return items as unknown as T;
                    }
                }

                // If it looks like a JSON object surrounded by other text, try to isolate it
                if (cleanedText.includes('{') && cleanedText.includes('}')) {
                    const objectResult = this.extractObject(cleanedText);
                    if (objectResult) {
                        return objectResult as T;
                    }
                }
            }

            return null;
        } catch (error) {
            log.error(`JSON extraction error: ${error}`);
            return null;
        }
    }

    /**
     * Extract tool calls from an LLM response.
     *
     * Looks for JSON objects shaped like `{"tool_name": "...", "parameters": {...}}`. If none
     * is found, falls back to matching `tool_name` / `parameters` fragments with regular
     * expressions.
     *
     * @param text - Raw text from the LLM response
     * @returns Array of tool calls or empty array if none found
     */
    static extractToolCalls(text: string): ExtractedToolCall[] {
        const toolCalls: ExtractedToolCall[] = [];

        if (typeof text !== 'string' || text.length === 0) {
            return toolCalls;
        }

        try {
            const cleanedText = this.cleanMarkdownAndFormatting(text);

            for (const jsonString of this.findJsonObjects(cleanedText)) {
                const parsed = this.parseLenient(jsonString);
                if (!parsed) {
                    log.info(`Failed to parse potential tool call JSON (${jsonString.length} chars)`);
                    continue;
                }

                const toolCall = this.asToolCall(parsed.value, jsonString);
                if (toolCall) {
                    toolCalls.push(toolCall);
                }
            }

            // No complete tool call object found: look for the two fragments separately
            if (toolCalls.length === 0) {
                const toolNameMatch = cleanedText.match(/["']?tool_name["']?\s*:\s*["']([^"']+)["']/);
                const parametersMatch = cleanedText.match(/["']?parameters["']?\s*:\s*({[^}]*})/);

                if (toolNameMatch && parametersMatch) {
                    const toolName = toolNameMatch[1];
                    const parsed = this.parseLenient(parametersMatch[1]);

                    if (parsed && typeof parsed.value === 'object' && parsed.value !== null) {
                        toolCalls.push({
                            tool_name: toolName,
                            parameters: parsed.value as Record<string, any>,
                            originalJson: `{"tool_name":${JSON.stringify(toolName)},"parameters":${parsed.json}}`
                        });
                    } else {
                        log.info(`Failed to parse tool call with regex approach`);
                    }
                }
            }
        } catch (error) {
            log.error(`Error extracting tool calls: ${error}`);
        }

        return toolCalls;
    }

    /**
     * Convert a parsed value into a tool call if it has a non-empty string `tool_name`
     * and an object `parameters`; otherwise return null.
     */
    private static asToolCall(value: unknown, originalJson: string): ExtractedToolCall | null {
        if (typeof value !== 'object' || value === null) {
            return null;
        }

        const { tool_name: toolName, parameters } = value as Record<string, unknown>;
        if (typeof toolName !== 'string' || toolName.length === 0) {
            return null;
        }
        if (typeof parameters !== 'object' || parameters === null) {
            return null;
        }

        return {
            tool_name: toolName,
            parameters: parameters as Record<string, any>,
            originalJson
        };
    }

    /**
     * Find all top-level `{...}` spans in a text.
     *
     * Braces inside string literals are ignored. If that finds nothing (for instance because
     * of an unbalanced quote), the scan is repeated counting every brace.
     */
    private static findJsonObjects(text: string): string[] {
        const stringAware = this.scanBalancedObjects(text, true);
        return stringAware.length > 0 ? stringAware : this.scanBalancedObjects(text, false);
    }

    /**
     * Single pass brace matcher used by findJsonObjects.
     *
     * @param honourStrings - when true, braces between double quotes inside an object are skipped
     */
    private static scanBalancedObjects(text: string, honourStrings: boolean): string[] {
        const objects: string[] = [];
        let depth = 0;
        let start = -1;
        let inString = false;

        for (let i = 0; i < text.length; i++) {
            const ch = text[i];

            if (inString) {
                if (ch === '\\') {
                    i++; // skip the escaped character
                } else if (ch === '"') {
                    inString = false;
                }
                continue;
            }

            if (ch === '"') {
                // Quotes in the prose around an object must not start a string
                inString = honourStrings && depth > 0;
            } else if (ch === '{') {
                if (depth === 0) {
                    start = i;
                }
                depth++;
            } else if (ch === '}' && depth > 0) {
                // A stray '}' outside any object is ignored instead of driving depth negative
                depth--;
                if (depth === 0) {
                    objects.push(text.slice(start, i + 1));
                }
            }
        }

        return objects;
    }

    /**
     * Remove Markdown code fence markers, normalise smart double quotes and trim
     */
    private static cleanMarkdownAndFormatting(text: string): string {
        return text
            .replace(/```(?:json)?/g, '')       // Remove code block markers
            .replace(/[\u201C\u201D]/g, '"')    // Replace smart quotes with straight quotes
            .trim();
    }

    /**
     * Extract an array of strings from text.
     *
     * First looks for a bracketed list of string literals; then parses the span from the
     * first `[` to the last `]` as JSON (with repairs unless `options.applyFixes` is false).
     *
     * @returns The items that are at least `options.minStringLength` long, or null if none
     */
    private static extractArray(text: string, options: JsonExtractionOptions): string[] | null {
        const minLength = options.minStringLength ?? 3;
        const allowFixes = options.applyFixes ?? true;

        // First attempt: arrays consisting only of string literals
        const arrayPattern = /\[((?:"(?:\\.|[^"\\])*"(?:\s*,\s*)?)+)\]/g;
        const stringPattern = /"((?:\\.|[^"\\])*)"/g;

        for (const arrayMatch of text.matchAll(arrayPattern)) {
            const items = [...arrayMatch[1].matchAll(stringPattern)]
                .map(m => this.decodeStringLiteral(m[0], m[1]).trim())
                .filter(s => s.length >= minLength);

            if (items.length > 0) {
                log.info(`Successfully extracted ${items.length} items using regex pattern`);
                return items;
            }
        }

        // Second attempt: parse everything between the first '[' and the last ']'
        const bracketed = text.match(/\[[\s\S]*\]/);
        if (bracketed) {
            const parsed = allowFixes
                ? this.parseLenient(bracketed[0])
                : this.parseStrict(bracketed[0]);

            if (parsed && Array.isArray(parsed.value) && parsed.value.length > 0) {
                const items = parsed.value
                    .map(item => this.itemToString(item))
                    .filter(s => s.length >= minLength);

                if (items.length > 0) {
                    log.info(`Successfully parsed JSON array with ${items.length} items`);
                    return items;
                }
            }
        }

        return null;
    }

    /**
     * Decode a quoted JSON string literal (resolving escapes such as `\"`).
     * Falls back to the raw body when the literal is not valid JSON.
     */
    private static decodeStringLiteral(literal: string, rawBody: string): string {
        try {
            const decoded: unknown = JSON.parse(literal);
            return typeof decoded === 'string' ? decoded : rawBody;
        } catch {
            return rawBody;
        }
    }

    /**
     * Render an array element as a string; objects become JSON text rather than
     * "[object Object]".
     */
    private static itemToString(item: unknown): string {
        if (typeof item === 'string') {
            return item;
        }
        if (typeof item === 'object' && item !== null) {
            return JSON.stringify(item);
        }
        return String(item);
    }

    /**
     * Extract a JSON object from the span between the first `{` and the last `}`
     */
    private static extractObject(text: string): Record<string, any> | null {
        const objectMatch = text.match(/{[\s\S]*}/);
        if (!objectMatch) {
            return null;
        }

        const parsed = this.parseLenient(objectMatch[0]);
        if (!parsed || typeof parsed.value !== 'object' || parsed.value === null) {
            return null;
        }

        return parsed.value as Record<string, any>;
    }

    /**
     * Strict JSON.parse that reports failure as undefined instead of throwing
     */
    private static parseStrict(text: string): ParsedJson | undefined {
        try {
            return { value: JSON.parse(text), json: text };
        } catch {
            return undefined;
        }
    }

    /**
     * Parse strictly; on failure apply the repair steps one after another and return the
     * first result that parses. Well-formed JSON is therefore never modified.
     */
    private static parseLenient(text: string): ParsedJson | undefined {
        let candidate = text;
        let outcome = this.parseStrict(candidate);

        for (const repair of this.REPAIR_STEPS) {
            if (outcome) {
                break;
            }

            const repaired = repair(candidate);
            if (repaired !== candidate) {
                candidate = repaired;
                outcome = this.parseStrict(candidate);
            }
        }

        return outcome;
    }

    /**
     * Replace unescaped double quotes that appear inside a string with single quotes.
     *
     * Heuristic: inside a string, a double quote closes the string only when the next
     * non-whitespace character is `,`, `:`, `]`, `}` or the end of the text.
     */
    private static repairInnerQuotes(text: string): string {
        let out = '';
        let inString = false;

        for (let i = 0; i < text.length; i++) {
            const ch = text[i];

            if (!inString) {
                if (ch === '"') {
                    inString = true;
                }
                out += ch;
                continue;
            }

            if (ch === '\\') {
                // Keep escape sequences untouched
                out += ch + (text[i + 1] ?? '');
                i++;
                continue;
            }

            if (ch !== '"') {
                out += ch;
                continue;
            }

            let j = i + 1;
            while (j < text.length && /\s/.test(text[j])) {
                j++;
            }
            const next = text[j];

            if (next === undefined || ',:]}'.includes(next)) {
                inString = false;
                out += '"';
            } else {
                out += "'";
            }
        }

        return out;
    }

    /**
     * Extract list-like items from free text when no JSON could be parsed.
     *
     * Recognises, per line: quoted strings, single-item arrays, numbered list items and
     * bullet list items. If none of these match, every remaining non-trivial line is used.
     *
     * @param minLength - Minimum length of an item after trimming
     * @returns Unique items in order of first appearance
     */
    private static extractItemsAsFallback(text: string, minLength: number = 3): string[] {
        // Multiline anchors are used so that matching one line never consumes the line break
        // the next line needs, which previously caused every second item to be skipped.
        const patterns = [
            /^[ \t]*["'](.+?)["'](?:,|[ \t]*$)/gm,          // Quoted strings
            /^[ \t]*\[["'](.+?)["']\](?:,|[ \t]*$)/gm,      // Single item arrays
            /^[ \t]*\d+\.[ \t]*(.+?)$/gm,                   // Numbered list items (number not captured)
            /^[ \t]*[-*•][ \t]*(.+?)$/gm                    // Bullet list items
        ];

        const extractedItems = new Set<string>();

        for (const pattern of patterns) {
            for (const match of text.matchAll(pattern)) {
                const item = match[1]?.trim();
                if (item && item.length >= minLength) {
                    extractedItems.add(item);
                }
            }
        }

        // Try line-by-line extraction as last resort
        if (extractedItems.size === 0) {
            const lines = text.split('\n')
                .map(line => line.trim())
                .filter(line =>
                    line.length >= minLength &&
                    !line.startsWith('```') &&
                    !line.match(/^\d+\.?\s*$/) &&  // Skip numbered list markers alone
                    !line.match(/^\[|\]$/)         // Skip lines that start with '[' or end with ']'
                );

            for (const line of lines) {
                const cleaned = line
                    .replace(/^\d+\.?\s*/, '')      // Remove numbered list markers
                    .replace(/^[-*•]\s*/, '')       // Remove bullet list markers
                    .replace(/^["']|["']$/g, '')    // Remove surrounding quotes
                    .trim();

                if (cleaned.length >= minLength) {
                    extractedItems.add(cleaned);
                }
            }
        }

        return Array.from(extractedItems);
    }
}
|
||||||
|
|
||||||
|
export default JsonExtractor;
|
||||||
Loading…
x
Reference in New Issue
Block a user