mirror of
https://github.com/TriliumNext/Notes.git
synced 2025-11-02 22:21:41 +08:00
do a better job at centralizing json extraction, and query "enhancer" search queries
This commit is contained in:
parent
5b3dca88d9
commit
ed52d71729
@ -23,6 +23,9 @@ export interface ChatCompletionOptions {
|
||||
presencePenalty?: number;
|
||||
showThinking?: boolean;
|
||||
systemPrompt?: string;
|
||||
preserveSystemPrompt?: boolean; // Whether to preserve existing system message
|
||||
bypassFormatter?: boolean; // Whether to bypass the message formatter entirely
|
||||
expectsJsonResponse?: boolean; // Whether this request expects a JSON response
|
||||
stream?: boolean; // Whether to stream the response
|
||||
}
|
||||
|
||||
|
||||
@ -4,6 +4,7 @@ import type { Message } from '../../ai_interface.js';
|
||||
import { CONTEXT_PROMPTS } from '../../constants/llm_prompt_constants.js';
|
||||
import type { LLMServiceInterface } from '../../interfaces/agent_tool_interfaces.js';
|
||||
import type { IQueryEnhancer } from '../../interfaces/context_interfaces.js';
|
||||
import JsonExtractor from '../../utils/json_extractor.js';
|
||||
|
||||
/**
|
||||
* Provides utilities for enhancing queries and generating search queries
|
||||
@ -12,6 +13,15 @@ export class QueryEnhancer implements IQueryEnhancer {
|
||||
// Use the centralized query enhancer prompt
|
||||
private metaPrompt = CONTEXT_PROMPTS.QUERY_ENHANCER;
|
||||
|
||||
/**
|
||||
* Get enhanced prompt with JSON formatting instructions
|
||||
*/
|
||||
private getEnhancedPrompt(): string {
|
||||
return `${this.metaPrompt}
|
||||
IMPORTANT: You must respond with valid JSON arrays. Always include commas between array elements.
|
||||
Format your answer as a valid JSON array without markdown code blocks, like this: ["item1", "item2", "item3"]`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate search queries to find relevant information for the user question
|
||||
*
|
||||
@ -32,95 +42,38 @@ export class QueryEnhancer implements IQueryEnhancer {
|
||||
}
|
||||
|
||||
const messages: Message[] = [
|
||||
{ role: "system", content: this.metaPrompt },
|
||||
{ role: "system", content: this.getEnhancedPrompt() },
|
||||
{ role: "user", content: userQuestion }
|
||||
];
|
||||
|
||||
const options = {
|
||||
temperature: 0.3,
|
||||
maxTokens: 300
|
||||
maxTokens: 300,
|
||||
bypassFormatter: true, // Completely bypass formatter for query enhancement
|
||||
expectsJsonResponse: true // Explicitly request JSON-formatted response
|
||||
};
|
||||
|
||||
// Get the response from the LLM
|
||||
const response = await llmService.generateChatCompletion(messages, options);
|
||||
const responseText = response.text; // Extract the text from the response object
|
||||
|
||||
try {
|
||||
// Remove code blocks, quotes, and clean up the response text
|
||||
let jsonStr = responseText
|
||||
.replace(/```(?:json)?|```/g, '') // Remove code block markers
|
||||
.replace(/[\u201C\u201D]/g, '"') // Replace smart quotes with straight quotes
|
||||
.trim();
|
||||
// Use the JsonExtractor to parse the response
|
||||
const queries = JsonExtractor.extract<string[]>(responseText, {
|
||||
extractArrays: true,
|
||||
minStringLength: 3,
|
||||
applyFixes: true,
|
||||
useFallbacks: true
|
||||
});
|
||||
|
||||
// Check if the text might contain a JSON array (has square brackets)
|
||||
if (jsonStr.includes('[') && jsonStr.includes(']')) {
|
||||
// Extract just the array part if there's explanatory text
|
||||
const arrayMatch = jsonStr.match(/\[[\s\S]*\]/);
|
||||
if (arrayMatch) {
|
||||
jsonStr = arrayMatch[0];
|
||||
}
|
||||
|
||||
// Try to parse the JSON
|
||||
try {
|
||||
const queries = JSON.parse(jsonStr);
|
||||
if (Array.isArray(queries) && queries.length > 0) {
|
||||
const result = queries.map(q => typeof q === 'string' ? q : String(q)).filter(Boolean);
|
||||
cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, result);
|
||||
return result;
|
||||
}
|
||||
} catch (innerError) {
|
||||
// If parsing fails, log it and continue to the fallback
|
||||
log.info(`JSON parse error: ${innerError}. Will use fallback parsing for: ${jsonStr}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback 1: Try to extract an array manually by splitting on commas between quotes
|
||||
if (jsonStr.includes('[') && jsonStr.includes(']')) {
|
||||
const arrayContent = jsonStr.substring(
|
||||
jsonStr.indexOf('[') + 1,
|
||||
jsonStr.lastIndexOf(']')
|
||||
);
|
||||
|
||||
// Use regex to match quoted strings, handling escaped quotes
|
||||
const stringMatches = arrayContent.match(/"((?:\\.|[^"\\])*)"/g);
|
||||
if (stringMatches && stringMatches.length > 0) {
|
||||
const result = stringMatches
|
||||
.map((m: string) => m.substring(1, m.length - 1)) // Remove surrounding quotes
|
||||
.filter((s: string) => s.length > 0);
|
||||
cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback 2: Extract queries line by line
|
||||
const lines = responseText.split('\n')
|
||||
.map((line: string) => line.trim())
|
||||
.filter((line: string) =>
|
||||
line.length > 0 &&
|
||||
!line.startsWith('```') &&
|
||||
!line.match(/^\d+\.?\s*$/) && // Skip numbered list markers alone
|
||||
!line.match(/^\[|\]$/) // Skip lines that are just brackets
|
||||
);
|
||||
|
||||
if (lines.length > 0) {
|
||||
// Remove numbering, quotes and other list markers from each line
|
||||
const result = lines.map((line: string) => {
|
||||
return line
|
||||
.replace(/^\d+\.?\s*/, '') // Remove numbered list markers (1., 2., etc)
|
||||
.replace(/^[-*•]\s*/, '') // Remove bullet list markers
|
||||
.replace(/^["']|["']$/g, '') // Remove surrounding quotes
|
||||
.trim();
|
||||
}).filter((s: string) => s.length > 0);
|
||||
|
||||
cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, result);
|
||||
return result;
|
||||
}
|
||||
} catch (parseError) {
|
||||
log.error(`Error parsing search queries: ${parseError}`);
|
||||
if (queries && queries.length > 0) {
|
||||
log.info(`Extracted ${queries.length} queries using JsonExtractor`);
|
||||
cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, queries);
|
||||
return queries;
|
||||
}
|
||||
|
||||
// If all else fails, just use the original question
|
||||
const fallback = [userQuestion];
|
||||
log.info(`No queries extracted, using fallback: "${userQuestion}"`);
|
||||
cacheManager.storeQueryResults(`searchQueries:${userQuestion}`, fallback);
|
||||
return fallback;
|
||||
} catch (error: unknown) {
|
||||
|
||||
@ -23,22 +23,30 @@ export class OllamaMessageFormatter extends BaseMessageFormatter {
|
||||
|
||||
/**
|
||||
* Format messages for the Ollama API
|
||||
* @param messages Messages to format
|
||||
* @param systemPrompt Optional system prompt to use
|
||||
* @param context Optional context to include
|
||||
* @param preserveSystemPrompt When true, preserves existing system messages rather than replacing them
|
||||
*/
|
||||
formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[] {
|
||||
formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[] {
|
||||
const formattedMessages: Message[] = [];
|
||||
|
||||
// First identify user and system messages
|
||||
const systemMessages = messages.filter(msg => msg.role === 'system');
|
||||
const userMessages = messages.filter(msg => msg.role === 'user' || msg.role === 'assistant');
|
||||
|
||||
// Create base system message with instructions or use default
|
||||
const basePrompt = systemPrompt || PROVIDER_PROMPTS.COMMON.DEFAULT_ASSISTANT_INTRO;
|
||||
|
||||
// Always add a system message with the base prompt
|
||||
formattedMessages.push({
|
||||
role: 'system',
|
||||
content: basePrompt
|
||||
});
|
||||
// Determine if we should preserve the existing system message
|
||||
if (preserveSystemPrompt && systemMessages.length > 0) {
|
||||
// Preserve the existing system message
|
||||
formattedMessages.push(systemMessages[0]);
|
||||
} else {
|
||||
// Use provided systemPrompt or default
|
||||
const basePrompt = systemPrompt || PROVIDER_PROMPTS.COMMON.DEFAULT_ASSISTANT_INTRO;
|
||||
formattedMessages.push({
|
||||
role: 'system',
|
||||
content: basePrompt
|
||||
});
|
||||
}
|
||||
|
||||
// If we have context, inject it into the first user message
|
||||
if (context && userMessages.length > 0) {
|
||||
|
||||
@ -9,9 +9,10 @@ export interface MessageFormatter {
|
||||
* @param messages Original messages
|
||||
* @param systemPrompt Optional system prompt to override
|
||||
* @param context Optional context to include
|
||||
* @param preserveSystemPrompt Optional flag to preserve existing system prompt
|
||||
* @returns Formatted messages optimized for the specific provider
|
||||
*/
|
||||
formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[];
|
||||
formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[];
|
||||
}
|
||||
|
||||
/**
|
||||
@ -22,15 +23,15 @@ export abstract class BaseMessageFormatter implements MessageFormatter {
|
||||
* Format messages with system prompt and context
|
||||
* Each provider should override this method with their specific formatting strategy
|
||||
*/
|
||||
abstract formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[];
|
||||
|
||||
abstract formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[];
|
||||
|
||||
/**
|
||||
* Helper method to extract existing system message from messages
|
||||
*/
|
||||
protected getSystemMessage(messages: Message[]): Message | undefined {
|
||||
return messages.find(msg => msg.role === 'system');
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Helper method to create a copy of messages without system message
|
||||
*/
|
||||
@ -44,22 +45,26 @@ export abstract class BaseMessageFormatter implements MessageFormatter {
|
||||
* Optimizes message format for OpenAI models (GPT-3.5, GPT-4, etc.)
|
||||
*/
|
||||
export class OpenAIMessageFormatter extends BaseMessageFormatter {
|
||||
formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[] {
|
||||
formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[] {
|
||||
const formattedMessages: Message[] = [];
|
||||
|
||||
|
||||
// OpenAI performs best with system message first, then context as a separate system message
|
||||
// or appended to the original system message
|
||||
|
||||
|
||||
// Handle system message
|
||||
const existingSystem = this.getSystemMessage(messages);
|
||||
if (systemPrompt || existingSystem) {
|
||||
|
||||
if (preserveSystemPrompt && existingSystem) {
|
||||
// Use the existing system message
|
||||
formattedMessages.push(existingSystem);
|
||||
} else if (systemPrompt || existingSystem) {
|
||||
const systemContent = systemPrompt || existingSystem?.content || '';
|
||||
formattedMessages.push({
|
||||
role: 'system',
|
||||
content: systemContent
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
// Add context as a system message with clear instruction
|
||||
if (context) {
|
||||
formattedMessages.push({
|
||||
@ -67,10 +72,10 @@ export class OpenAIMessageFormatter extends BaseMessageFormatter {
|
||||
content: `Please use the following context to respond to the user's messages:\n\n${context}`
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
// Add remaining messages (excluding system)
|
||||
formattedMessages.push(...this.getMessagesWithoutSystem(messages));
|
||||
|
||||
|
||||
return formattedMessages;
|
||||
}
|
||||
}
|
||||
@ -80,24 +85,26 @@ export class OpenAIMessageFormatter extends BaseMessageFormatter {
|
||||
* Optimizes message format for Claude models
|
||||
*/
|
||||
export class AnthropicMessageFormatter extends BaseMessageFormatter {
|
||||
formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[] {
|
||||
formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[] {
|
||||
const formattedMessages: Message[] = [];
|
||||
|
||||
|
||||
// Anthropic performs best with a specific XML-like format for context and system instructions
|
||||
|
||||
|
||||
// Create system message with combined prompt and context if any
|
||||
let systemContent = '';
|
||||
const existingSystem = this.getSystemMessage(messages);
|
||||
|
||||
if (systemPrompt || existingSystem) {
|
||||
|
||||
if (preserveSystemPrompt && existingSystem) {
|
||||
systemContent = existingSystem.content;
|
||||
} else if (systemPrompt || existingSystem) {
|
||||
systemContent = systemPrompt || existingSystem?.content || '';
|
||||
}
|
||||
|
||||
|
||||
// For Claude, wrap context in XML tags for clear separation
|
||||
if (context) {
|
||||
systemContent += `\n\n<context>\n${context}\n</context>`;
|
||||
}
|
||||
|
||||
|
||||
// Add system message if we have content
|
||||
if (systemContent) {
|
||||
formattedMessages.push({
|
||||
@ -105,10 +112,10 @@ export class AnthropicMessageFormatter extends BaseMessageFormatter {
|
||||
content: systemContent
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
// Add remaining messages (excluding system)
|
||||
formattedMessages.push(...this.getMessagesWithoutSystem(messages));
|
||||
|
||||
|
||||
return formattedMessages;
|
||||
}
|
||||
}
|
||||
@ -118,25 +125,25 @@ export class AnthropicMessageFormatter extends BaseMessageFormatter {
|
||||
* Optimizes message format for open-source models
|
||||
*/
|
||||
export class OllamaMessageFormatter extends BaseMessageFormatter {
|
||||
formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[] {
|
||||
formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[] {
|
||||
const formattedMessages: Message[] = [];
|
||||
|
||||
|
||||
// Ollama format is closer to raw prompting and typically works better with
|
||||
// context embedded in system prompt rather than as separate messages
|
||||
|
||||
|
||||
// Build comprehensive system prompt
|
||||
let systemContent = '';
|
||||
const existingSystem = this.getSystemMessage(messages);
|
||||
|
||||
|
||||
if (systemPrompt || existingSystem) {
|
||||
systemContent = systemPrompt || existingSystem?.content || '';
|
||||
}
|
||||
|
||||
|
||||
// Add context to system prompt
|
||||
if (context) {
|
||||
systemContent += `\n\nReference information:\n${context}`;
|
||||
}
|
||||
|
||||
|
||||
// Add system message if we have content
|
||||
if (systemContent) {
|
||||
formattedMessages.push({
|
||||
@ -144,10 +151,10 @@ export class OllamaMessageFormatter extends BaseMessageFormatter {
|
||||
content: systemContent
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
// Add remaining messages (excluding system)
|
||||
formattedMessages.push(...this.getMessagesWithoutSystem(messages));
|
||||
|
||||
|
||||
return formattedMessages;
|
||||
}
|
||||
}
|
||||
@ -156,19 +163,22 @@ export class OllamaMessageFormatter extends BaseMessageFormatter {
|
||||
* Default message formatter when provider is unknown
|
||||
*/
|
||||
export class DefaultMessageFormatter extends BaseMessageFormatter {
|
||||
formatMessages(messages: Message[], systemPrompt?: string, context?: string): Message[] {
|
||||
formatMessages(messages: Message[], systemPrompt?: string, context?: string, preserveSystemPrompt?: boolean): Message[] {
|
||||
const formattedMessages: Message[] = [];
|
||||
|
||||
|
||||
// Handle system message
|
||||
const existingSystem = this.getSystemMessage(messages);
|
||||
if (systemPrompt || existingSystem) {
|
||||
|
||||
if (preserveSystemPrompt && existingSystem) {
|
||||
formattedMessages.push(existingSystem);
|
||||
} else if (systemPrompt || existingSystem) {
|
||||
const systemContent = systemPrompt || existingSystem?.content || '';
|
||||
formattedMessages.push({
|
||||
role: 'system',
|
||||
content: systemContent
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
// Add context as a user message
|
||||
if (context) {
|
||||
formattedMessages.push({
|
||||
@ -176,10 +186,10 @@ export class DefaultMessageFormatter extends BaseMessageFormatter {
|
||||
content: `Here is context to help you answer my questions: ${context}`
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
// Add user/assistant messages
|
||||
formattedMessages.push(...this.getMessagesWithoutSystem(messages));
|
||||
|
||||
|
||||
return formattedMessages;
|
||||
}
|
||||
}
|
||||
@ -194,7 +204,7 @@ export class MessageFormatterFactory {
|
||||
ollama: new OllamaMessageFormatter(),
|
||||
default: new DefaultMessageFormatter()
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* Get the appropriate formatter for a provider
|
||||
* @param provider Provider name
|
||||
@ -203,7 +213,7 @@ export class MessageFormatterFactory {
|
||||
static getFormatter(provider: string): MessageFormatter {
|
||||
return this.formatters[provider] || this.formatters.default;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Register a custom formatter for a provider
|
||||
* @param provider Provider name
|
||||
@ -212,4 +222,4 @@ export class MessageFormatterFactory {
|
||||
static registerFormatter(provider: string, formatter: MessageFormatter): void {
|
||||
this.formatters[provider] = formatter;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -48,10 +48,30 @@ export class OllamaService extends BaseAIService {
|
||||
const systemPrompt = this.getSystemPrompt(opts.systemPrompt || options.getOption('aiSystemPrompt'));
|
||||
|
||||
try {
|
||||
// Use the formatter to prepare messages
|
||||
const formattedMessages = this.formatter.formatMessages(messages, systemPrompt);
|
||||
// Determine whether to use the formatter or send messages directly
|
||||
let messagesToSend: Message[];
|
||||
|
||||
console.log(`Sending to Ollama with formatted messages:`, JSON.stringify(formattedMessages, null, 2));
|
||||
if (opts.bypassFormatter) {
|
||||
// Bypass the formatter entirely - use messages as is
|
||||
messagesToSend = [...messages];
|
||||
console.log(`Bypassing formatter for Ollama request with ${messages.length} messages`);
|
||||
} else {
|
||||
// Use the formatter to prepare messages
|
||||
messagesToSend = this.formatter.formatMessages(
|
||||
messages,
|
||||
systemPrompt,
|
||||
undefined, // context
|
||||
opts.preserveSystemPrompt
|
||||
);
|
||||
console.log(`Sending to Ollama with formatted messages:`, JSON.stringify(messagesToSend, null, 2));
|
||||
}
|
||||
|
||||
// Check if this is a request that expects JSON response
|
||||
const expectsJsonResponse = opts.expectsJsonResponse || false;
|
||||
|
||||
if (expectsJsonResponse) {
|
||||
console.log(`Request expects JSON response, adding response_format parameter`);
|
||||
}
|
||||
|
||||
const response = await fetch(`${apiBase}/api/chat`, {
|
||||
method: 'POST',
|
||||
@ -60,9 +80,11 @@ export class OllamaService extends BaseAIService {
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model,
|
||||
messages: formattedMessages,
|
||||
messages: messagesToSend,
|
||||
options: {
|
||||
temperature
|
||||
temperature,
|
||||
// Add response_format for requests that expect JSON
|
||||
...(expectsJsonResponse ? { response_format: { type: "json_object" } } : {})
|
||||
},
|
||||
stream: false
|
||||
})
|
||||
|
||||
387
src/services/llm/utils/json_extractor.ts
Normal file
387
src/services/llm/utils/json_extractor.ts
Normal file
@ -0,0 +1,387 @@
|
||||
import log from '../../log.js';
|
||||
|
||||
/**
 * Options controlling how {@link JsonExtractor.extract} pulls JSON out of raw LLM text.
 *
 * Every field is optional; the defaults noted below are the ones applied by
 * `JsonExtractor.extract` when a field is omitted.
 */
export interface JsonExtractionOptions {
    /**
     * Attempt to find and extract arrays as the primary target
     * (for query enhancers, etc.). Defaults to `false`.
     */
    extractArrays?: boolean;

    /** Minimum length for extracted strings to be considered valid. Defaults to `3`. */
    minStringLength?: number;

    /** Apply fixes to malformed JSON before parsing. Defaults to `true`. */
    applyFixes?: boolean;

    /** Whether to use fallback extraction methods when JSON parsing fails. Defaults to `true`. */
    useFallbacks?: boolean;
}
|
||||
|
||||
/**
 * Structure of a tool call extracted from an LLM response.
 */
export interface ExtractedToolCall {
    /** The name of the tool to call */
    tool_name: string;

    /** Parameters for the tool call, as parsed from the response JSON */
    parameters: Record<string, any>;

    /** The JSON string the call was parsed from, when available */
    originalJson?: string;
}
|
||||
|
||||
/**
 * Utility class for extracting and parsing JSON from LLM responses.
 *
 * Parsing is always attempted strictly first; heuristic repairs are applied
 * only to text that does not parse as-is, so well-formed JSON is never
 * rewritten. When nothing parses, optional text fallbacks (quoted lines,
 * numbered/bulleted lists, plain lines) are used.
 */
export class JsonExtractor {
    /** Minimum item length used when the caller does not supply one. */
    private static readonly DEFAULT_MIN_STRING_LENGTH = 3;

    /** Matches a single whitespace character (hoisted out of the scanning loops). */
    private static readonly WHITESPACE = /\s/;

    /**
     * Extract JSON from an LLM response.
     *
     * Order of attempts:
     *  1. when `extractArrays` is set, look for an array and return its items as strings;
     *  2. parse the cleaned text (strictly, then with repairs if `applyFixes` is set);
     *  3. when `useFallbacks` is set, extract list-like items (only with
     *     `extractArrays`) and then try any embedded object.
     *
     * @param text - The raw text from an LLM response
     * @param options - Options to control extraction behavior
     * @returns The parsed JSON object or array, or null if extraction failed
     */
    static extract<T = any>(text: string, options: JsonExtractionOptions = {}): T | null {
        // `??` (not `||`) so that an explicit 0 / false from the caller is honoured.
        const extractArrays = options.extractArrays ?? false;
        const minStringLength = options.minStringLength ?? JsonExtractor.DEFAULT_MIN_STRING_LENGTH;
        const applyFixes = options.applyFixes ?? true;
        const useFallbacks = options.useFallbacks ?? true;

        try {
            const cleanedText = this.cleanMarkdownAndFormatting(text);

            if (extractArrays) {
                const arrayResult = this.extractArray(cleanedText, { minStringLength, applyFixes });
                if (arrayResult) {
                    return arrayResult as unknown as T;
                }
            }

            const parsed = this.parseLenient(cleanedText, applyFixes);
            if (parsed) {
                return parsed.value as T;
            }

            if (useFallbacks) {
                if (extractArrays) {
                    // The raw text is used on purpose: fence lines are skipped line-wise there.
                    const items = this.extractItemsAsFallback(text, minStringLength);
                    if (items.length > 0) {
                        return items as unknown as T;
                    }
                }

                if (cleanedText.includes('{') && cleanedText.includes('}')) {
                    const objectResult = this.extractObject(cleanedText);
                    if (objectResult) {
                        return objectResult as T;
                    }
                }
            }

            return null;
        } catch (error) {
            log.error(`JSON extraction error: ${error}`);
            return null;
        }
    }

    /**
     * Extract tool calls from an LLM response.
     *
     * A tool call is any JSON object with a non-empty string `tool_name` and an
     * object `parameters`. If no such object can be parsed, a looser search for
     * `tool_name: "..."` and `parameters: {...}` anywhere in the text is tried.
     *
     * @param text - Raw text from the LLM response
     * @returns Array of tool calls or empty array if none found
     */
    static extractToolCalls(text: string): ExtractedToolCall[] {
        const toolCalls: ExtractedToolCall[] = [];

        try {
            const cleanedText = this.cleanMarkdownAndFormatting(text);

            for (const candidate of this.findJsonObjects(cleanedText)) {
                const parsed = this.parseLenient(candidate);
                if (!parsed) {
                    log.info(`Failed to parse potential tool call JSON: ${candidate}`);
                    continue;
                }

                if (this.isToolCall(parsed.value)) {
                    toolCalls.push({
                        tool_name: parsed.value.tool_name,
                        parameters: parsed.value.parameters,
                        originalJson: candidate
                    });
                }
            }

            // Nothing usable as a whole object: look for the two keys independently.
            if (toolCalls.length === 0) {
                const nameMatch = cleanedText.match(/["']?tool_name["']?\s*:\s*["']([^"']+)["']/);
                const paramsKey = cleanedText.match(/["']?parameters["']?\s*:\s*(?={)/);
                const toolName = nameMatch?.[1];

                if (toolName && paramsKey && paramsKey.index !== undefined) {
                    // Balanced scan from the opening brace, so nested objects survive.
                    const afterKey = cleanedText.slice(paramsKey.index + paramsKey[0].length);
                    const paramsText = this.findJsonObjects(afterKey)[0];
                    const params = paramsText ? this.parseLenient(paramsText) : null;

                    if (params && typeof params.value === 'object' && params.value !== null) {
                        const parameters = params.value as Record<string, any>;
                        toolCalls.push({
                            tool_name: toolName,
                            parameters,
                            originalJson: JSON.stringify({ tool_name: toolName, parameters })
                        });
                    } else {
                        log.info(`Failed to parse tool call with regex approach: ${paramsText ?? ''}`);
                    }
                }
            }
        } catch (error) {
            log.error(`Error extracting tool calls: ${error}`);
        }

        return toolCalls;
    }

    /**
     * Type guard for the tool-call shape: a non-empty string `tool_name`
     * plus a non-null object `parameters`.
     */
    private static isToolCall(value: unknown): value is { tool_name: string; parameters: Record<string, any> } {
        if (typeof value !== 'object' || value === null) {
            return false;
        }
        const candidate = value as Record<string, unknown>;
        return typeof candidate.tool_name === 'string'
            && candidate.tool_name.length > 0
            && typeof candidate.parameters === 'object'
            && candidate.parameters !== null;
    }

    /**
     * Find all top-level `{...}` spans in a text.
     *
     * Braces inside double-quoted strings are ignored. If that string-aware
     * pass finds nothing (e.g. because of an unbalanced quote), a plain
     * brace-counting pass is used instead.
     */
    private static findJsonObjects(text: string): string[] {
        const stringAware = this.scanBalancedObjects(text, true);
        return stringAware.length > 0 ? stringAware : this.scanBalancedObjects(text, false);
    }

    /**
     * Single left-to-right scan collecting balanced top-level `{...}` spans.
     * A `}` seen before any `{` is ignored rather than unbalancing the count.
     */
    private static scanBalancedObjects(text: string, respectStrings: boolean): string[] {
        const found: string[] = [];
        let depth = 0;
        let start = -1;
        let inString = false;

        for (let i = 0; i < text.length; i++) {
            const ch = text.charAt(i);

            if (start < 0) {
                if (ch === '{') {
                    start = i;
                    depth = 1;
                }
                continue;
            }

            if (inString) {
                if (ch === '\\') {
                    i++; // skip the escaped character
                } else if (ch === '"') {
                    inString = false;
                }
                continue;
            }

            if (ch === '"' && respectStrings) {
                inString = true;
            } else if (ch === '{') {
                depth++;
            } else if (ch === '}') {
                depth--;
                if (depth === 0) {
                    found.push(text.slice(start, i + 1));
                    start = -1;
                }
            }
        }

        return found;
    }

    /**
     * Strip Markdown code-fence markers, normalize curly double quotes to
     * straight ones and trim the result.
     */
    private static cleanMarkdownAndFormatting(text: string): string {
        return text
            .replace(/```(?:json)?/g, '') // Remove code block markers
            .replace(/[\u201C\u201D]/g, '"') // Replace smart quotes with straight quotes
            .trim();
    }

    /**
     * Extract an array of strings from text.
     *
     * First looks for an array consisting only of string literals; failing
     * that, takes the span from the first `[` to the last `]` and parses it
     * (with repairs when `options.applyFixes` is not `false`). Non-string
     * elements are converted with `String()`. Items are trimmed and those
     * shorter than `options.minStringLength` are dropped.
     *
     * @returns the items, or null when no non-empty list could be produced
     */
    private static extractArray(text: string, options: JsonExtractionOptions): string[] | null {
        const minLength = options.minStringLength ?? JsonExtractor.DEFAULT_MIN_STRING_LENGTH;
        const applyFixes = options.applyFixes ?? true;

        // First attempt: an array made only of string literals.
        const literal = text.match(/\[((?:"(?:\\.|[^"\\])*"(?:\s*,\s*)?)+)\]/);
        if (literal) {
            const parsed = this.tryParse(literal[0]);
            // Prefer a real parse (decodes escapes); otherwise take the quoted pieces verbatim.
            const values: unknown[] = parsed && Array.isArray(parsed.value)
                ? parsed.value
                : [...(literal[1] ?? '').matchAll(/"((?:\\.|[^"\\])*)"/g)].map(m => m[1] ?? '');
            const items = this.toItems(values, minLength);

            if (items.length > 0) {
                log.info(`Successfully extracted ${items.length} items using regex pattern`);
                return items;
            }
        }

        // Second attempt: widest bracketed span, parsed strictly and then with repairs.
        const span = text.match(/\[[\s\S]*\]/);
        if (span) {
            const parsed = this.parseLenient(span[0], applyFixes);
            if (parsed && Array.isArray(parsed.value) && parsed.value.length > 0) {
                const items = this.toItems(parsed.value, minLength);
                if (items.length > 0) {
                    log.info(`Successfully parsed JSON array with ${items.length} items`);
                    return items;
                }
            }
        }

        return null;
    }

    /** Stringify, trim and length-filter raw array elements. */
    private static toItems(values: readonly unknown[], minLength: number): string[] {
        return values
            .map(value => (typeof value === 'string' ? value : String(value)).trim())
            .filter(item => item.length >= minLength);
    }

    /**
     * Extract a JSON object embedded in surrounding text.
     *
     * Tries the span from the first `{` to the last `}` first, then each
     * balanced top-level object in order, returning the first that parses.
     */
    private static extractObject(text: string): Record<string, any> | null {
        const widest = text.match(/{[\s\S]*}/);
        if (!widest) return null;

        for (const candidate of [widest[0], ...this.findJsonObjects(text)]) {
            const parsed = this.parseLenient(candidate);
            if (parsed && typeof parsed.value === 'object' && parsed.value !== null) {
                return parsed.value as Record<string, any>;
            }
        }

        return null;
    }

    /**
     * Strict `JSON.parse` wrapped so that a legitimately parsed `null` can be
     * told apart from a parse failure.
     *
     * @returns `{ value }` on success, `null` when the text is not valid JSON
     */
    private static tryParse(text: string): { value: unknown } | null {
        try {
            return { value: JSON.parse(text) };
        } catch {
            return null;
        }
    }

    /**
     * Parse JSON, repairing common LLM formatting mistakes only if needed.
     *
     * The text is parsed strictly first. If that fails and `applyFixes` is
     * true, structural repairs (newlines, trailing commas, missing commas
     * between adjacent strings) are applied, and then two mutually exclusive
     * quote repairs are tried in turn on top of them.
     *
     * @returns `{ value }` for the first variant that parses, otherwise `null`
     */
    private static parseLenient(text: string, applyFixes: boolean = true): { value: unknown } | null {
        const strict = this.tryParse(text);
        if (strict || !applyFixes) {
            return strict;
        }

        const base = text
            .replace(/\r?\n/g, ' ') // raw newlines are not allowed inside JSON strings
            .replace(/,\s*([\]}])/g, '$1') // trailing commas before a closing bracket
            .replace(/"([^"]*)"\s+(?=")/g, '"$1", '); // adjacent strings with no comma

        const candidates = [
            base,
            this.closeUnterminatedStrings(base),
            this.escapeEmbeddedQuotes(base)
        ];

        for (const candidate of candidates) {
            const parsed = this.tryParse(candidate);
            if (parsed) {
                return parsed;
            }
        }

        return null;
    }

    /**
     * Add the missing closing quote in `"abc, "def"` style input, i.e. where a
     * string runs straight into a comma that is followed by the next string
     * or by the end of the array.
     */
    private static closeUnterminatedStrings(json: string): string {
        return json.replace(/"([^"]*),\s*(?="|])/g, '"$1", ');
    }

    /**
     * Backslash-escape double quotes that appear inside a string value.
     *
     * While inside a string, a `"` is treated as the closing quote only when
     * the next non-whitespace character is `,`, `:`, `]`, `}` or the end of
     * the text; any other `"` is assumed to belong to the value.
     */
    private static escapeEmbeddedQuotes(json: string): string {
        let result = '';
        let inString = false;

        for (let i = 0; i < json.length; i++) {
            const ch = json.charAt(i);

            if (ch !== '"') {
                result += ch;
                // Keep existing escape sequences intact.
                if (inString && ch === '\\' && i + 1 < json.length) {
                    result += json.charAt(++i);
                }
                continue;
            }

            if (!inString) {
                inString = true;
                result += ch;
                continue;
            }

            let next = i + 1;
            while (next < json.length && JsonExtractor.WHITESPACE.test(json.charAt(next))) {
                next++;
            }
            const follower = json.charAt(next); // '' at end of text

            if (follower === '' || ',:]}'.includes(follower)) {
                inString = false;
                result += ch;
            } else {
                result += '\\"';
            }
        }

        return result;
    }

    /**
     * Extract list-like items from free text when no JSON could be parsed.
     *
     * Collects, per line: quoted strings, single-item arrays, numbered list
     * entries (without the number) and bullet entries. If none of those match,
     * every remaining non-trivial line is used after stripping list markers
     * and surrounding quotes. Results are de-duplicated in order of discovery.
     *
     * @param text - text to scan
     * @param minLength - items shorter than this (after trimming) are dropped
     */
    private static extractItemsAsFallback(text: string, minLength: number = 3): string[] {
        // Line-anchored with the `m` flag and lookaheads, so consecutive lines all match.
        const patterns = [
            /^[ \t]*["'](.+?)["'](?=,|[ \t]*$)/gm, // Quoted strings
            /^[ \t]*\[["'](.+?)["']\](?=,|[ \t]*$)/gm, // Single item arrays
            /^[ \t]*\d+\.[ \t]*(.+?)[ \t]*$/gm, // Numbered list items
            /^[ \t]*[-*•][ \t]*(.+?)[ \t]*$/gm // Bullet list items
        ];

        const extractedItems = new Set<string>();

        for (const pattern of patterns) {
            for (const match of text.matchAll(pattern)) {
                const item = (match[1] ?? '').trim();
                if (item.length >= minLength) {
                    extractedItems.add(item);
                }
            }
        }

        // Try line-by-line extraction as last resort
        if (extractedItems.size === 0) {
            const lines = text.split('\n')
                .map(line => line.trim())
                .filter(line =>
                    line.length >= minLength &&
                    !line.startsWith('```') &&
                    !/^\d+\.?\s*$/.test(line) && // Skip numbered list markers alone
                    !/^\[|\]$/.test(line) // Skip lines that open with '[' or close with ']'
                );

            for (const line of lines) {
                const cleaned = line
                    .replace(/^\d+\.?\s*/, '') // Remove numbered list markers
                    .replace(/^[-*•]\s*/, '') // Remove bullet list markers
                    .replace(/^["']|["']$/g, '') // Remove surrounding quotes
                    .trim();

                if (cleaned.length >= minLength) {
                    extractedItems.add(cleaned);
                }
            }
        }

        return Array.from(extractedItems);
    }
}
|
||||
|
||||
export default JsonExtractor;
|
||||
Loading…
x
Reference in New Issue
Block a user