Commit 7d5d9e73 authored by Your Name's avatar Your Name

Add condensation enhancements and server configuration

- Add max_context field to CondensationConfig
- Support 'internal' keyword for local HuggingFace model in condensation
- Add internal model initialization with temperature=0.3, top_p=0.8, repetition_penalty=1.1
- Create condensation system prompts (conversational, semantic)
- Add aisbf.json for server configuration (host, port, dashboard auth)
- Update main.py to read server config from aisbf.json
- Update providers.json with max_context example for condensation
parent 1ea22adb
...@@ -41,6 +41,7 @@ class CondensationConfig(BaseModel): ...@@ -41,6 +41,7 @@ class CondensationConfig(BaseModel):
model: Optional[str] = None model: Optional[str] = None
rotation_id: Optional[str] = None rotation_id: Optional[str] = None
enabled: bool = True enabled: bool = True
max_context: Optional[int] = None # Maximum context size for condensation model
class ProviderConfig(BaseModel): class ProviderConfig(BaseModel):
......
...@@ -54,6 +54,15 @@ class ContextManager: ...@@ -54,6 +54,15 @@ class ContextManager:
self.condensation_model = None self.condensation_model = None
self._rotation_handler = None self._rotation_handler = None
self._rotation_id = None self._rotation_id = None
self._internal_model = None
self._internal_tokenizer = None
self._internal_model_lock = None
self._use_internal_model = False
# Get max_context for condensation model
self.condensation_max_context = None
if self.condensation_config and hasattr(self.condensation_config, 'max_context'):
self.condensation_max_context = self.condensation_config.max_context
if (self.condensation_config and if (self.condensation_config and
self.condensation_config.enabled): self.condensation_config.enabled):
...@@ -61,6 +70,15 @@ class ContextManager: ...@@ -61,6 +70,15 @@ class ContextManager:
# Check if model is a rotation ID or direct model name # Check if model is a rotation ID or direct model name
model_value = self.condensation_config.model model_value = self.condensation_config.model
# Check for "internal" keyword
if model_value == "internal":
logger = logging.getLogger(__name__)
logger.info(f"Condensation model is 'internal' - will use local HuggingFace model")
self._use_internal_model = True
# Set default max_context for internal model if not specified
if not self.condensation_max_context:
self.condensation_max_context = 4000 # Conservative default for small models
else:
# Check if this model value is a rotation ID (exists in rotations config) # Check if this model value is a rotation ID (exists in rotations config)
is_rotation = False is_rotation = False
if model_value: if model_value:
...@@ -121,6 +139,114 @@ class ContextManager: ...@@ -121,6 +139,114 @@ class ContextManager:
logger.info(f" context_size: {self.context_size}") logger.info(f" context_size: {self.context_size}")
logger.info(f" condense_context: {self.condense_context}%") logger.info(f" condense_context: {self.condense_context}%")
logger.info(f" condense_method: {self.condense_method}") logger.info(f" condense_method: {self.condense_method}")
logger.info(f" condensation_max_context: {self.condensation_max_context}")
logger.info(f" use_internal_model: {self._use_internal_model}")
def _initialize_internal_model(self):
    """Lazily load the local HuggingFace model used for condensation.

    Loads the tokenizer and causal-LM weights (FP16 on CUDA, FP32 on CPU)
    and creates the thread lock that serializes access to the model.
    Safe to call repeatedly: returns immediately once the model is loaded.

    Raises:
        ImportError: torch / transformers are not installed.
        Exception: any failure during model download or loading; partial
            state is cleared first so a later call retries a clean load.
    """
    import logging
    logger = logging.getLogger(__name__)

    if self._internal_model is not None:
        return  # Already initialized

    try:
        import torch
        from transformers import AutoTokenizer, AutoModelForCausalLM
        import threading

        logger.info("=== INITIALIZING INTERNAL CONDENSATION MODEL ===")
        # NOTE(review): the model id is hard-coded here although aisbf.json
        # exposes internal_model.model_id — confirm which should win.
        model_name = "huihui-ai/Qwen2.5-0.5B-Instruct-abliterated-v3"
        logger.info(f"Model: {model_name}")

        # Check for GPU availability
        device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Device: {device}")

        # Create the lock BEFORE the slow load: a concurrent caller that
        # observes a partially-initialized object must still find a lock.
        self._internal_model_lock = threading.Lock()

        # Load tokenizer
        logger.info("Loading tokenizer...")
        self._internal_tokenizer = AutoTokenizer.from_pretrained(model_name)
        logger.info("Tokenizer loaded")

        # Load model
        logger.info("Loading model...")
        self._internal_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            device_map="auto" if device == "cuda" else None
        )
        if device == "cpu":
            self._internal_model = self._internal_model.to(device)
        logger.info("Model loaded successfully")
        logger.info("=== INTERNAL CONDENSATION MODEL READY ===")
    except ImportError as e:
        logger.error(f"Failed to import required libraries for internal model: {e}")
        logger.error("Please install: pip install torch transformers")
        raise
    except Exception as e:
        # Clear partial state (e.g. tokenizer loaded but model failed)
        # so the next initialization attempt starts from scratch.
        self._internal_model = None
        self._internal_tokenizer = None
        logger.error(f"Failed to initialize internal model: {e}", exc_info=True)
        raise
async def _run_internal_model_condensation(self, prompt: str) -> Optional[str]:
    """Run the internal condensation model on *prompt* without blocking the loop.

    Lazily initializes the model on first use, then performs generation in
    a worker thread (serialized by the model lock) so the asyncio event
    loop stays responsive during inference.

    Args:
        prompt: Full text prompt to feed the model.

    Returns:
        The generated text with the echoed prompt stripped, or None if
        inference failed (the error is logged, not raised).
    """
    import logging
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    logger = logging.getLogger(__name__)

    # Initialize model if needed
    if self._internal_model is None:
        self._initialize_internal_model()

    def run_inference():
        """Tokenize, generate and decode while holding the model lock."""
        with self._internal_model_lock:
            try:
                import torch

                # Tokenize input
                inputs = self._internal_tokenizer(prompt, return_tensors="pt")
                # Keep input tensors on the same device as the model weights
                device = next(self._internal_model.parameters()).device
                inputs = {k: v.to(device) for k, v in inputs.items()}

                # Generate response
                with torch.no_grad():
                    outputs = self._internal_model.generate(
                        **inputs,
                        max_new_tokens=500,
                        temperature=0.3,
                        top_p=0.8,
                        repetition_penalty=1.1,
                        do_sample=True,
                        pad_token_id=self._internal_tokenizer.eos_token_id
                    )

                # Decode response
                response = self._internal_tokenizer.decode(outputs[0], skip_special_tokens=True)
                # generate() echoes the prompt; return only the new text
                if response.startswith(prompt):
                    response = response[len(prompt):].strip()
                return response
            except Exception as e:
                logger.error(f"Error during internal model inference: {e}", exc_info=True)
                return None

    # get_running_loop() is the supported API inside a coroutine
    # (get_event_loop() is deprecated in this context since Python 3.10).
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor(max_workers=1) as executor:
        return await loop.run_in_executor(executor, run_inference)
def should_condense(self, messages: List[Dict], model: str) -> bool: def should_condense(self, messages: List[Dict], model: str) -> bool:
""" """
......
{
"server": {
"host": "0.0.0.0",
"port": 8000
},
"dashboard": {
"enabled": true,
"username": "admin",
"password": "admin"
},
"internal_model": {
"model_id": "huihui-ai/Qwen2.5-0.5B-Instruct-abliterated-v3"
}
}
# Conversational Context Condensation
You are a specialized AI assistant for context condensation. Your task is to create a concise, high-density summary of conversation history while preserving all critical information.
## Your Role
You will receive a conversation history between a user and an AI assistant. Your job is to:
1. **Identify Key Information**: Extract facts, decisions, goals, and important context
2. **Preserve Continuity**: Maintain the logical flow and relationships between topics
3. **Compress Efficiently**: Remove redundancy while keeping essential details
4. **Maintain Accuracy**: Never invent or hallucinate information
## Guidelines
- Focus on **actionable information** and **decisions made**
- Include **technical details** that may be referenced later
- Preserve **user preferences** and **constraints** mentioned
- Keep **error messages** and **solutions** that were discussed
- Maintain **chronological order** when relevant
- Use **clear, concise language**
## Output Format
Provide a structured summary that includes:
- Current goal or task
- Key facts and context
- Decisions made
- Important technical details
- Any constraints or preferences
Keep the summary comprehensive but concise. Aim for maximum information density.
# Semantic Context Pruning
You are a specialized AI assistant for semantic context pruning. Your task is to extract only the information that is directly relevant to the current query or task.
## Your Role
You will receive:
1. A conversation history
2. A current query or task description
Your job is to identify and extract ONLY the information from the conversation that is relevant to answering or completing the current query/task.
## Guidelines
- **Be Selective**: Remove all information that doesn't directly relate to the current query
- **Preserve Dependencies**: Keep information that provides necessary context for understanding relevant parts
- **Maintain Accuracy**: Never modify or invent information
- **Focus on Recency**: Prioritize recent information over older information when both are relevant
- **Keep Technical Details**: Preserve specific technical information (code, commands, configurations) that may be needed
## What to Keep
- Facts directly related to the current query
- Technical details needed to answer the query
- Recent decisions that affect the current task
- Error messages or issues being addressed
- Constraints or requirements mentioned
## What to Remove
- Unrelated conversations or topics
- Resolved issues that don't affect current task
- Redundant information
- Off-topic discussions
- Historical context not needed for current query
## Output Format
Provide a concise extraction of relevant information. Structure it logically, grouping related facts together. Be ruthlessly efficient - if information isn't needed for the current query, don't include it.
...@@ -2,7 +2,8 @@ ...@@ -2,7 +2,8 @@
"condensation": { "condensation": {
"provider_id": "gemini", "provider_id": "gemini",
"model": "gemini-1.5-flash", "model": "gemini-1.5-flash",
"enabled": true "enabled": true,
"max_context": 8000
}, },
"providers": { "providers": {
"gemini": { "gemini": {
......
...@@ -38,6 +38,51 @@ from logging.handlers import RotatingFileHandler ...@@ -38,6 +38,51 @@ from logging.handlers import RotatingFileHandler
from datetime import datetime, timedelta from datetime import datetime, timedelta
from collections import defaultdict from collections import defaultdict
from pathlib import Path from pathlib import Path
import json
def load_server_config():
    """Load server host/port configuration from aisbf.json.

    Search order: ~/.aisbf/aisbf.json, /usr/share/aisbf/aisbf.json,
    ~/.local/share/aisbf/aisbf.json, then the source tree's config/
    directory. Falls back to 0.0.0.0:8000 when no file is found or the
    file cannot be parsed.

    Returns:
        dict: {'host': str, 'port': int}
    """
    defaults = {'host': '0.0.0.0', 'port': 8000}

    # Try user config first
    config_path = Path.home() / '.aisbf' / 'aisbf.json'
    if not config_path.exists():
        # Try installed locations
        installed_dirs = [
            Path('/usr/share/aisbf'),
            Path.home() / '.local' / 'share' / 'aisbf',
        ]
        for installed_dir in installed_dirs:
            test_path = installed_dir / 'aisbf.json'
            if test_path.exists():
                config_path = test_path
                break
        else:
            # Fallback to source tree config directory
            source_dir = Path(__file__).parent / 'config'
            test_path = source_dir / 'aisbf.json'
            if test_path.exists():
                config_path = test_path

    # Load config or use defaults
    if config_path.exists():
        try:
            # Explicit encoding: config may be edited on any platform.
            with open(config_path, encoding='utf-8') as f:
                config_data = json.load(f)
            server_config = config_data.get('server', {})
            return {
                'host': server_config.get('host', defaults['host']),
                # Coerce so a string port in JSON doesn't break uvicorn;
                # a non-numeric value raises here and falls back to defaults.
                'port': int(server_config.get('port', defaults['port'])),
            }
        except Exception as e:
            logger = logging.getLogger(__name__)
            logger.warning(f"Error loading aisbf.json: {e}, using defaults")

    # Return defaults
    return dict(defaults)
class BrokenPipeFilter(logging.Filter): class BrokenPipeFilter(logging.Filter):
"""Filter to suppress BrokenPipeError logging errors""" """Filter to suppress BrokenPipeError logging errors"""
...@@ -533,8 +578,14 @@ async def catch_all_post(provider_id: str, request: Request): ...@@ -533,8 +578,14 @@ async def catch_all_post(provider_id: str, request: Request):
def main(): def main():
"""Main entry point for the AISBF server""" """Main entry point for the AISBF server"""
import uvicorn import uvicorn
logger.info("Starting AI Proxy Server on http://127.0.0.1:17765")
uvicorn.run(app, host="127.0.0.1", port=17765) # Load server configuration
server_config = load_server_config()
host = server_config['host']
port = server_config['port']
logger.info(f"Starting AI Proxy Server on http://{host}:{port}")
uvicorn.run(app, host=host, port=port)
if __name__ == "__main__": if __name__ == "__main__":
main() main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment