Commit 7d5d9e73 authored by Your Name's avatar Your Name

Add condensation enhancements and server configuration

- Add max_context field to CondensationConfig
- Support 'internal' keyword for local HuggingFace model in condensation
- Add internal model initialization with temperature=0.3, top_p=0.8, repeat_penalty=1.1
- Create condensation system prompts (conversational, semantic)
- Add aisbf.json for server configuration (host, port, dashboard auth)
- Update main.py to read server config from aisbf.json
- Update providers.json with max_context example for condensation
parent 1ea22adb
......@@ -41,6 +41,7 @@ class CondensationConfig(BaseModel):
model: Optional[str] = None
rotation_id: Optional[str] = None
enabled: bool = True
max_context: Optional[int] = None # Maximum context size for condensation model
class ProviderConfig(BaseModel):
......
......@@ -54,6 +54,15 @@ class ContextManager:
self.condensation_model = None
self._rotation_handler = None
self._rotation_id = None
self._internal_model = None
self._internal_tokenizer = None
self._internal_model_lock = None
self._use_internal_model = False
# Get max_context for condensation model
self.condensation_max_context = None
if self.condensation_config and hasattr(self.condensation_config, 'max_context'):
self.condensation_max_context = self.condensation_config.max_context
if (self.condensation_config and
self.condensation_config.enabled):
......@@ -61,6 +70,15 @@ class ContextManager:
# Check if model is a rotation ID or direct model name
model_value = self.condensation_config.model
# Check for "internal" keyword
if model_value == "internal":
logger = logging.getLogger(__name__)
logger.info(f"Condensation model is 'internal' - will use local HuggingFace model")
self._use_internal_model = True
# Set default max_context for internal model if not specified
if not self.condensation_max_context:
self.condensation_max_context = 4000 # Conservative default for small models
else:
# Check if this model value is a rotation ID (exists in rotations config)
is_rotation = False
if model_value:
......@@ -121,6 +139,114 @@ class ContextManager:
logger.info(f" context_size: {self.context_size}")
logger.info(f" condense_context: {self.condense_context}%")
logger.info(f" condense_method: {self.condense_method}")
logger.info(f" condensation_max_context: {self.condensation_max_context}")
logger.info(f" use_internal_model: {self._use_internal_model}")
def _initialize_internal_model(self):
    """Lazily load the local HuggingFace condensation model and tokenizer.

    Idempotent: returns immediately when the model is already loaded.
    Populates ``self._internal_model``, ``self._internal_tokenizer`` and
    ``self._internal_model_lock``. Raises ImportError when torch/transformers
    are unavailable; re-raises any other initialization failure after logging.
    """
    import logging
    logger = logging.getLogger(__name__)

    # Already initialized — nothing to do.
    if self._internal_model is not None:
        return

    try:
        import threading

        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer

        logger.info("=== INITIALIZING INTERNAL CONDENSATION MODEL ===")
        model_name = "huihui-ai/Qwen2.5-0.5B-Instruct-abliterated-v3"
        logger.info(f"Model: {model_name}")

        # Prefer the GPU when one is available.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        use_cuda = device == "cuda"
        logger.info(f"Device: {device}")

        logger.info("Loading tokenizer...")
        self._internal_tokenizer = AutoTokenizer.from_pretrained(model_name)
        logger.info("Tokenizer loaded")

        logger.info("Loading model...")
        # fp16 with automatic device placement on GPU; full-precision
        # tensors moved to the CPU otherwise.
        self._internal_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if use_cuda else torch.float32,
            device_map="auto" if use_cuda else None,
        )
        if not use_cuda:
            self._internal_model = self._internal_model.to(device)
        logger.info("Model loaded successfully")

        # Lock that serializes inference access to the model across threads.
        self._internal_model_lock = threading.Lock()
        logger.info("=== INTERNAL CONDENSATION MODEL READY ===")
    except ImportError as e:
        logger.error(f"Failed to import required libraries for internal model: {e}")
        logger.error("Please install: pip install torch transformers")
        raise
    except Exception as e:
        logger.error(f"Failed to initialize internal model: {e}", exc_info=True)
        raise
async def _run_internal_model_condensation(self, prompt: str) -> str:
    """Run the internal condensation model on *prompt* without blocking the loop.

    The blocking HuggingFace inference is executed on a worker thread via
    ``asyncio.to_thread`` (the previous ``asyncio.get_event_loop()`` call is
    deprecated inside coroutines, and a fresh ``ThreadPoolExecutor`` was being
    created on every invocation).

    Args:
        prompt: Fully formatted condensation prompt.

    Returns:
        The generated text (prompt excluded), or None if inference failed.
    """
    import logging
    import asyncio
    logger = logging.getLogger(__name__)

    # Lazily initialize the model on first use.
    if self._internal_model is None:
        self._initialize_internal_model()

    def run_inference():
        """Run inference in a worker thread (model access is serialized)."""
        with self._internal_model_lock:
            try:
                import torch

                # Tokenize input and move tensors to the model's device.
                inputs = self._internal_tokenizer(prompt, return_tensors="pt")
                device = next(self._internal_model.parameters()).device
                inputs = {k: v.to(device) for k, v in inputs.items()}

                # Generate response
                with torch.no_grad():
                    outputs = self._internal_model.generate(
                        **inputs,
                        max_new_tokens=500,
                        temperature=0.3,
                        top_p=0.8,
                        repetition_penalty=1.1,
                        do_sample=True,
                        pad_token_id=self._internal_tokenizer.eos_token_id
                    )

                # Decode only the newly generated tokens. Stripping the prompt
                # with response.startswith(prompt) is fragile because decoding
                # can normalize whitespace/special tokens so the decoded text
                # need not start with the literal prompt string.
                prompt_len = inputs["input_ids"].shape[1]
                generated_ids = outputs[0][prompt_len:]
                return self._internal_tokenizer.decode(
                    generated_ids, skip_special_tokens=True
                ).strip()
            except Exception as e:
                logger.error(f"Error during internal model inference: {e}", exc_info=True)
                return None

    # Off-load the blocking work; keeps the event loop responsive.
    return await asyncio.to_thread(run_inference)
def should_condense(self, messages: List[Dict], model: str) -> bool:
"""
......
{
"server": {
"host": "0.0.0.0",
"port": 8000
},
"dashboard": {
"enabled": true,
"username": "admin",
"password": "admin"
},
"internal_model": {
"model_id": "huihui-ai/Qwen2.5-0.5B-Instruct-abliterated-v3"
}
}
# Conversational Context Condensation
You are a specialized AI assistant for context condensation. Your task is to create a concise, high-density summary of conversation history while preserving all critical information.
## Your Role
You will receive a conversation history between a user and an AI assistant. Your job is to:
1. **Identify Key Information**: Extract facts, decisions, goals, and important context
2. **Preserve Continuity**: Maintain the logical flow and relationships between topics
3. **Compress Efficiently**: Remove redundancy while keeping essential details
4. **Maintain Accuracy**: Never invent or hallucinate information
## Guidelines
- Focus on **actionable information** and **decisions made**
- Include **technical details** that may be referenced later
- Preserve **user preferences** and **constraints** mentioned
- Keep **error messages** and **solutions** that were discussed
- Maintain **chronological order** when relevant
- Use **clear, concise language**
## Output Format
Provide a structured summary that includes:
- Current goal or task
- Key facts and context
- Decisions made
- Important technical details
- Any constraints or preferences
Keep the summary comprehensive but concise. Aim for maximum information density.
# Semantic Context Pruning
You are a specialized AI assistant for semantic context pruning. Your task is to extract only the information that is directly relevant to the current query or task.
## Your Role
You will receive:
1. A conversation history
2. A current query or task description
Your job is to identify and extract ONLY the information from the conversation that is relevant to answering or completing the current query/task.
## Guidelines
- **Be Selective**: Remove all information that doesn't directly relate to the current query
- **Preserve Dependencies**: Keep information that provides necessary context for understanding relevant parts
- **Maintain Accuracy**: Never modify or invent information
- **Focus on Recency**: Prioritize recent information over older information when both are relevant
- **Keep Technical Details**: Preserve specific technical information (code, commands, configurations) that may be needed
## What to Keep
- Facts directly related to the current query
- Technical details needed to answer the query
- Recent decisions that affect the current task
- Error messages or issues being addressed
- Constraints or requirements mentioned
## What to Remove
- Unrelated conversations or topics
- Resolved issues that don't affect current task
- Redundant information
- Off-topic discussions
- Historical context not needed for current query
## Output Format
Provide a concise extraction of relevant information. Structure it logically, grouping related facts together. Be ruthlessly efficient - if information isn't needed for the current query, don't include it.
......@@ -2,7 +2,8 @@
"condensation": {
"provider_id": "gemini",
"model": "gemini-1.5-flash",
"enabled": true
"enabled": true,
"max_context": 8000
},
"providers": {
"gemini": {
......
......@@ -38,6 +38,51 @@ from logging.handlers import RotatingFileHandler
from datetime import datetime, timedelta
from collections import defaultdict
from pathlib import Path
import json
def load_server_config():
    """Load server configuration from aisbf.json.

    Search order:
      1. ``~/.aisbf/aisbf.json`` (user config)
      2. ``/usr/share/aisbf/aisbf.json``, then ``~/.local/share/aisbf/aisbf.json``
      3. ``<source dir>/config/aisbf.json`` (source-tree fallback)

    Falls back to host ``0.0.0.0`` / port ``8000`` when no file is found or
    the file cannot be read or parsed.

    Returns:
        dict with 'host' (str) and 'port' (int) keys.
    """
    # Try user config first
    config_path = Path.home() / '.aisbf' / 'aisbf.json'
    if not config_path.exists():
        # Try installed locations
        installed_dirs = [
            Path('/usr/share/aisbf'),
            Path.home() / '.local' / 'share' / 'aisbf',
        ]
        for installed_dir in installed_dirs:
            test_path = installed_dir / 'aisbf.json'
            if test_path.exists():
                config_path = test_path
                break
        else:
            # No installed copy found: fall back to the source tree config dir.
            test_path = Path(__file__).parent / 'config' / 'aisbf.json'
            if test_path.exists():
                config_path = test_path

    # Load config or use defaults
    if config_path.exists():
        try:
            # Explicit encoding: the JSON config must be read as UTF-8
            # regardless of the platform's default locale encoding.
            with open(config_path, encoding='utf-8') as f:
                config_data = json.load(f)
            server_config = config_data.get('server', {})
            return {
                'host': server_config.get('host', '0.0.0.0'),
                # Coerce to int so a quoted port in JSON still works; a
                # non-numeric value raises and falls through to defaults.
                'port': int(server_config.get('port', 8000)),
            }
        except Exception as e:
            logger = logging.getLogger(__name__)
            logger.warning(f"Error loading aisbf.json: {e}, using defaults")

    # Return defaults
    return {
        'host': '0.0.0.0',
        'port': 8000,
    }
class BrokenPipeFilter(logging.Filter):
"""Filter to suppress BrokenPipeError logging errors"""
......@@ -533,8 +578,14 @@ async def catch_all_post(provider_id: str, request: Request):
def main():
    """Main entry point for the AISBF server"""
    import uvicorn

    # Resolve bind address from aisbf.json (falls back to built-in defaults).
    server_config = load_server_config()
    bind_host = server_config['host']
    bind_port = server_config['port']

    logger.info(f"Starting AI Proxy Server on http://{bind_host}:{bind_port}")
    uvicorn.run(app, host=bind_host, port=bind_port)


if __name__ == "__main__":
    main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment