feat: Add dedicated condensation provider/model configuration

- Add 'condensation' section to providers.json for specifying a dedicated provider/model
- Add CondensationConfig model to config.py
- Add _load_condensation() and get_condensation() methods
- Update ContextManager to use dedicated condensation handler when configured
- Update handlers to pass condensation config to ContextManager
- Allow using a smaller/faster model for context condensation operations

This addresses the issue where conversational and semantic condensation
methods were using the same model as the main request, which was
inefficient. Now users can configure a dedicated provider and model
for condensation operations, typically using a smaller/faster model to reduce
costs and improve performance.
parent acce04f1
...@@ -308,130 +308,120 @@ class GoogleProviderHandler(BaseProviderHandler): ...@@ -308,130 +308,120 @@ class GoogleProviderHandler(BaseProviderHandler):
logging.info(f"GoogleProviderHandler: Streaming response received (total chunks: {len(chunks)})") logging.info(f"GoogleProviderHandler: Streaming response received (total chunks: {len(chunks)})")
self.record_success() self.record_success()
# Parse the complete streaming response for tool calls # Now yield chunks asynchronously with proper OpenAI-compatible parsing
# Accumulate all chunks and parse the complete response async def async_generator():
response_text = ""
tool_calls = None
finish_reason = "stop"
for chunk in chunks:
if hasattr(chunk, 'candidates') and chunk.candidates:
candidate = chunk.candidates[0]
if hasattr(candidate, 'content') and candidate.content:
if hasattr(candidate.content, 'parts'):
for part in candidate.content.parts:
if hasattr(part, 'text') and part.text:
response_text += part.text
# Check if the accumulated response contains tool calls
if response_text and not tool_calls:
import json import json
try: chunk_id = 0
# Try to parse as JSON accumulated_text = ""
parsed_json = json.loads(response_text.strip()) created_time = int(time.time())
if isinstance(parsed_json, dict): response_id = f"google-{model}-{created_time}"
# Check if it looks like a tool call
if 'action' in parsed_json or 'function' in parsed_json or 'name' in parsed_json: # Track completion tokens for Google responses
# This appears to be a tool call in JSON format completion_tokens = 0
# Convert to OpenAI tool_calls format accumulated_response_text = ""
call_id = 0
openai_tool_calls = [] total_chunks = len(chunks)
if 'action' in parsed_json: chunk_idx = 0
# Google-style tool call
openai_tool_call = { for chunk in chunks:
"id": f"call_{call_id}", try:
"type": "function", # Extract text from Google chunk
"function": { chunk_text = ""
"name": parsed_json.get('action', 'unknown'), finish_reason = None
"arguments": {k: v for k, v in parsed_json.items() if k != 'action'} try:
} if hasattr(chunk, 'candidates') and chunk.candidates:
} candidate = chunk.candidates[0] if chunk.candidates else None
openai_tool_calls.append(openai_tool_call) if candidate and hasattr(candidate, 'content') and candidate.content:
call_id += 1 if hasattr(candidate.content, 'parts') and candidate.content.parts:
logging.info(f"Detected tool call in streaming response: {parsed_json}") for part in candidate.content.parts:
# Clear response_text since we're using tool_calls instead if hasattr(part, 'text') and part.text:
response_text = "" chunk_text += part.text
elif 'function' in parsed_json or 'name' in parsed_json: # Check for finish reason in candidate
# OpenAI-style tool call if hasattr(candidate, 'finish_reason'):
openai_tool_call = { google_finish = str(candidate.finish_reason)
"id": f"call_{call_id}", if google_finish in ('STOP', 'END_TURN', 'FINISH_REASON_UNSPECIFIED'):
"type": "function", finish_reason = "stop"
"function": { elif google_finish == 'MAX_TOKENS':
"name": parsed_json.get('name', parsed_json.get('function', 'unknown')), finish_reason = "length"
"arguments": parsed_json.get('arguments', parsed_json.get('parameters', {})) except Exception as e:
} logging.error(f"Error extracting text from Google chunk: {e}")
}
openai_tool_calls.append(openai_tool_call) # Calculate delta (only new text since last chunk)
call_id += 1 delta_text = chunk_text[len(accumulated_text):] if chunk_text.startswith(accumulated_text) else chunk_text
logging.info(f"Detected tool call in streaming response: {parsed_json}") accumulated_text = chunk_text
# Clear response_text since we're using tool_calls instead
response_text = "" # Check if this is the last chunk
tool_calls = openai_tool_calls is_last_chunk = (chunk_idx == total_chunks - 1)
except (json.JSONDecodeError, Exception) as e: chunk_finish_reason = finish_reason if is_last_chunk else None
logging.debug(f"Streaming response text is not valid JSON: {e}")
# Only send if there's new content or it's the last chunk with finish_reason
# Extract usage metadata from the last chunk if delta_text or is_last_chunk:
prompt_tokens = 0 # Create OpenAI-compatible chunk
completion_tokens = 0 openai_chunk = {
total_tokens = 0 "id": response_id,
"object": "chat.completion.chunk",
if chunks: "created": created_time,
last_chunk = chunks[-1] "model": model,
if hasattr(last_chunk, 'usage_metadata') and last_chunk.usage_metadata: "choices": [{
usage_metadata = last_chunk.usage_metadata "index": 0,
prompt_tokens = getattr(usage_metadata, 'prompt_token_count', 0) "delta": {
completion_tokens = getattr(usage_metadata, 'candidates_token_count', 0) "content": delta_text if delta_text else "",
total_tokens = getattr(usage_metadata, 'total_token_count', 0) "refusal": None,
logging.info(f"GoogleProviderHandler: Usage metadata - prompt: {prompt_tokens}, completion: {completion_tokens}, total: {total_tokens}") "role": "assistant",
"tool_calls": None
# Build the OpenAI-style response },
openai_response = { "finish_reason": chunk_finish_reason,
"id": f"google-{model}-{int(time.time())}", "logprobs": None,
"object": "chat.completion", "native_finish_reason": chunk_finish_reason
"created": int(time.time()), }]
"model": model, }
"choices": [{
"index": 0, chunk_id += 1
"message": {
"role": "assistant", # Track completion tokens for Google responses
"content": response_text if response_text else None if delta_text:
}, accumulated_response_text += delta_text
"finish_reason": finish_reason
}], # Yield as JSON string
"usage": { yield f"data: {json.dumps(openai_chunk)}\n\n".encode('utf-8')
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens, chunk_idx += 1
"total_tokens": total_tokens except Exception as chunk_error:
logging.error(f"Error processing Google chunk: {str(chunk_error)}")
chunk_idx += 1
continue
# Send final chunk with usage statistics
if accumulated_response_text:
completion_tokens = count_messages_tokens([{"role": "assistant", "content": accumulated_response_text}], model)
total_tokens = completion_tokens # Google doesn't provide prompt tokens in streaming
final_chunk = {
"id": response_id,
"object": "chat.completion.chunk",
"created": created_time,
"model": model,
"choices": [{
"index": 0,
"delta": {
"content": "",
"refusal": None,
"role": "assistant",
"tool_calls": None
},
"finish_reason": None,
"logprobs": None,
"native_finish_reason": None
}],
"usage": {
"prompt_tokens": None,
"completion_tokens": completion_tokens,
"total_tokens": total_tokens
}
} }
} yield f"data: {json.dumps(final_chunk)}\n\n".encode('utf-8')
# Add tool_calls to the message if present return async_generator()
if tool_calls:
openai_response["choices"][0]["message"]["tool_calls"] = tool_calls
# If there are tool calls, content should be None (OpenAI convention)
openai_response["choices"][0]["message"]["content"] = None
logging.info(f"Added tool_calls to streaming response message")
# Log the final response structure
logging.info(f"=== FINAL OPENAI STREAMING RESPONSE STRUCTURE ===")
logging.info(f"Response type: {type(openai_response)}")
logging.info(f"Response keys: {openai_response.keys()}")
logging.info(f"Response id: {openai_response['id']}")
logging.info(f"Response object: {openai_response['object']}")
logging.info(f"Response created: {openai_response['created']}")
logging.info(f"Response model: {openai_response['model']}")
logging.info(f"Response choices count: {len(openai_response['choices'])}")
logging.info(f"Response choices[0] index: {openai_response['choices'][0]['index']}")
logging.info(f"Response choices[0] message role: {openai_response['choices'][0]['message']['role']}")
logging.info(f"Response choices[0] message content length: {len(openai_response['choices'][0]['message']['content'])}")
logging.info(f"Response choices[0] message content (first 200 chars): {openai_response['choices'][0]['message']['content'][:200]}")
logging.info(f"Response choices[0] finish_reason: {openai_response['choices'][0]['finish_reason']}")
logging.info(f"Response usage: {openai_response['usage']}")
logging.info(f"=== END FINAL OPENAI STREAMING RESPONSE STRUCTURE ===")
# Return the response dict directly
logging.info(f"GoogleProviderHandler: Returning streaming response dict")
return openai_response
else: else:
# Non-streaming request # Non-streaming request
# Generate content using the google-genai client # Generate content using the google-genai client
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment