Commit 7d5d9e73 authored by Your Name's avatar Your Name

Add condensation enhancements and server configuration

- Add max_context field to CondensationConfig
- Support 'internal' keyword for local HuggingFace model in condensation
- Add internal model initialization with temperature=0.3, top_p=0.8, repeat_penalty=1.1
- Create condensation system prompts (conversational, semantic)
- Add aisbf.json for server configuration (host, port, dashboard auth)
- Update main.py to read server config from aisbf.json
- Update providers.json with max_context example for condensation
parent 1ea22adb
......@@ -41,6 +41,7 @@ class CondensationConfig(BaseModel):
model: Optional[str] = None
rotation_id: Optional[str] = None
enabled: bool = True
max_context: Optional[int] = None # Maximum context size for condensation model
class ProviderConfig(BaseModel):
......
......@@ -54,6 +54,15 @@ class ContextManager:
self.condensation_model = None
self._rotation_handler = None
self._rotation_id = None
self._internal_model = None
self._internal_tokenizer = None
self._internal_model_lock = None
self._use_internal_model = False
# Get max_context for condensation model
self.condensation_max_context = None
if self.condensation_config and hasattr(self.condensation_config, 'max_context'):
self.condensation_max_context = self.condensation_config.max_context
if (self.condensation_config and
self.condensation_config.enabled):
......@@ -61,6 +70,15 @@ class ContextManager:
# Check if model is a rotation ID or direct model name
model_value = self.condensation_config.model
# Check for "internal" keyword
if model_value == "internal":
logger = logging.getLogger(__name__)
logger.info(f"Condensation model is 'internal' - will use local HuggingFace model")
self._use_internal_model = True
# Set default max_context for internal model if not specified
if not self.condensation_max_context:
self.condensation_max_context = 4000 # Conservative default for small models
else:
# Check if this model value is a rotation ID (exists in rotations config)
is_rotation = False
if model_value:
......@@ -121,6 +139,114 @@ class ContextManager:
logger.info(f" context_size: {self.context_size}")
logger.info(f" condense_context: {self.condense_context}%")
logger.info(f" condense_method: {self.condense_method}")
logger.info(f" condensation_max_context: {self.condensation_max_context}")
logger.info(f" use_internal_model: {self._use_internal_model}")
def _initialize_internal_model(self):
    """Lazily load the local HuggingFace condensation model and tokenizer.

    Idempotent: returns immediately when the model is already loaded.
    Populates ``self._internal_model``, ``self._internal_tokenizer`` and
    ``self._internal_model_lock``. Raises ImportError when torch/transformers
    are unavailable; re-raises any other initialization failure after logging.
    """
    import logging
    logger = logging.getLogger(__name__)

    # Already initialized — nothing to do.
    if self._internal_model is not None:
        return

    try:
        import threading

        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer

        logger.info("=== INITIALIZING INTERNAL CONDENSATION MODEL ===")
        model_name = "huihui-ai/Qwen2.5-0.5B-Instruct-abliterated-v3"
        logger.info(f"Model: {model_name}")

        # Prefer the GPU when one is available.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        use_cuda = device == "cuda"
        logger.info(f"Device: {device}")

        logger.info("Loading tokenizer...")
        self._internal_tokenizer = AutoTokenizer.from_pretrained(model_name)
        logger.info("Tokenizer loaded")

        logger.info("Loading model...")
        # fp16 with automatic device placement on GPU; full-precision
        # tensors moved to the CPU otherwise.
        self._internal_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if use_cuda else torch.float32,
            device_map="auto" if use_cuda else None,
        )
        if not use_cuda:
            self._internal_model = self._internal_model.to(device)
        logger.info("Model loaded successfully")

        # Lock that serializes inference access to the model across threads.
        self._internal_model_lock = threading.Lock()
        logger.info("=== INTERNAL CONDENSATION MODEL READY ===")
    except ImportError as e:
        logger.error(f"Failed to import required libraries for internal model: {e}")
        logger.error("Please install: pip install torch transformers")
        raise
    except Exception as e:
        logger.error(f"Failed to initialize internal model: {e}", exc_info=True)
        raise
async def _run_internal_model_condensation(self, prompt: str) -> str:
    """Run the internal condensation model on *prompt* without blocking the loop.

    The blocking HuggingFace inference is executed on a worker thread via
    ``asyncio.to_thread`` (the previous ``asyncio.get_event_loop()`` call is
    deprecated inside coroutines, and a fresh ``ThreadPoolExecutor`` was being
    created on every invocation).

    Args:
        prompt: Fully formatted condensation prompt.

    Returns:
        The generated text (prompt excluded), or None if inference failed.
    """
    import logging
    import asyncio
    logger = logging.getLogger(__name__)

    # Lazily initialize the model on first use.
    if self._internal_model is None:
        self._initialize_internal_model()

    def run_inference():
        """Run inference in a worker thread (model access is serialized)."""
        with self._internal_model_lock:
            try:
                import torch

                # Tokenize input and move tensors to the model's device.
                inputs = self._internal_tokenizer(prompt, return_tensors="pt")
                device = next(self._internal_model.parameters()).device
                inputs = {k: v.to(device) for k, v in inputs.items()}

                # Generate response
                with torch.no_grad():
                    outputs = self._internal_model.generate(
                        **inputs,
                        max_new_tokens=500,
                        temperature=0.3,
                        top_p=0.8,
                        repetition_penalty=1.1,
                        do_sample=True,
                        pad_token_id=self._internal_tokenizer.eos_token_id
                    )

                # Decode only the newly generated tokens. Stripping the prompt
                # with response.startswith(prompt) is fragile because decoding
                # can normalize whitespace/special tokens so the decoded text
                # need not start with the literal prompt string.
                prompt_len = inputs["input_ids"].shape[1]
                generated_ids = outputs[0][prompt_len:]
                return self._internal_tokenizer.decode(
                    generated_ids, skip_special_tokens=True
                ).strip()
            except Exception as e:
                logger.error(f"Error during internal model inference: {e}", exc_info=True)
                return None

    # Off-load the blocking work; keeps the event loop responsive.
    return await asyncio.to_thread(run_inference)
def should_condense(self, messages: List[Dict], model: str) -> bool:
"""
......
{
"server": {
"host": "0.0.0.0",
"port": 8000
},
"dashboard": {
"enabled": true,
"username": "admin",
"password": "admin"
},
"internal_model": {
"model_id": "huihui-ai/Qwen2.5-0.5B-Instruct-abliterated-v3"
}
}
# Conversational Context Condensation
You are a specialized AI assistant for context condensation. Your task is to create a concise, high-density summary of conversation history while preserving all critical information.
## Your Role
You will receive a conversation history between a user and an AI assistant. Your job is to:
1. **Identify Key Information**: Extract facts, decisions, goals, and important context
2. **Preserve Continuity**: Maintain the logical flow and relationships between topics
3. **Compress Efficiently**: Remove redundancy while keeping essential details
4. **Maintain Accuracy**: Never invent or hallucinate information
## Guidelines
- Focus on **actionable information** and **decisions made**
- Include **technical details** that may be referenced later
- Preserve **user preferences** and **constraints** mentioned
- Keep **error messages** and **solutions** that were discussed
- Maintain **chronological order** when relevant
- Use **clear, concise language**
## Output Format
Provide a structured summary that includes:
- Current goal or task
- Key facts and context
- Decisions made
- Important technical details
- Any constraints or preferences
Keep the summary comprehensive but concise. Aim for maximum information density.
# Semantic Context Pruning
You are a specialized AI assistant for semantic context pruning. Your task is to extract only the information that is directly relevant to the current query or task.
## Your Role
You will receive:
1. A conversation history
2. A current query or task description
Your job is to identify and extract ONLY the information from the conversation that is relevant to answering or completing the current query/task.
## Guidelines
- **Be Selective**: Remove all information that doesn't directly relate to the current query
- **Preserve Dependencies**: Keep information that provides necessary context for understanding relevant parts
- **Maintain Accuracy**: Never modify or invent information
- **Focus on Recency**: Prioritize recent information over older information when both are relevant
- **Keep Technical Details**: Preserve specific technical information (code, commands, configurations) that may be needed
## What to Keep
- Facts directly related to the current query
- Technical details needed to answer the query
- Recent decisions that affect the current task
- Error messages or issues being addressed
- Constraints or requirements mentioned
## What to Remove
- Unrelated conversations or topics
- Resolved issues that don't affect current task
- Redundant information
- Off-topic discussions
- Historical context not needed for current query
## Output Format
Provide a concise extraction of relevant information. Structure it logically, grouping related facts together. Be ruthlessly efficient - if information isn't needed for the current query, don't include it.
......@@ -2,7 +2,8 @@
"condensation": {
"provider_id": "gemini",
"model": "gemini-1.5-flash",
"enabled": true
"enabled": true,
"max_context": 8000
},
"providers": {
"gemini": {
......
......@@ -38,6 +38,51 @@ from logging.handlers import RotatingFileHandler
from datetime import datetime, timedelta
from collections import defaultdict
from pathlib import Path
import json
def load_server_config():
    """Load server configuration from aisbf.json.

    Search order:
      1. ``~/.aisbf/aisbf.json`` (user config)
      2. ``/usr/share/aisbf/aisbf.json``, then ``~/.local/share/aisbf/aisbf.json``
      3. ``<source dir>/config/aisbf.json`` (source-tree fallback)

    Falls back to host ``0.0.0.0`` / port ``8000`` when no file is found or
    the file cannot be read or parsed.

    Returns:
        dict with 'host' (str) and 'port' (int) keys.
    """
    # Try user config first
    config_path = Path.home() / '.aisbf' / 'aisbf.json'
    if not config_path.exists():
        # Try installed locations
        installed_dirs = [
            Path('/usr/share/aisbf'),
            Path.home() / '.local' / 'share' / 'aisbf',
        ]
        for installed_dir in installed_dirs:
            test_path = installed_dir / 'aisbf.json'
            if test_path.exists():
                config_path = test_path
                break
        else:
            # No installed copy found: fall back to the source tree config dir.
            test_path = Path(__file__).parent / 'config' / 'aisbf.json'
            if test_path.exists():
                config_path = test_path

    # Load config or use defaults
    if config_path.exists():
        try:
            # Explicit encoding: the JSON config must be read as UTF-8
            # regardless of the platform's default locale encoding.
            with open(config_path, encoding='utf-8') as f:
                config_data = json.load(f)
            server_config = config_data.get('server', {})
            return {
                'host': server_config.get('host', '0.0.0.0'),
                # Coerce to int so a quoted port in JSON still works; a
                # non-numeric value raises and falls through to defaults.
                'port': int(server_config.get('port', 8000)),
            }
        except Exception as e:
            logger = logging.getLogger(__name__)
            logger.warning(f"Error loading aisbf.json: {e}, using defaults")

    # Return defaults
    return {
        'host': '0.0.0.0',
        'port': 8000,
    }
class BrokenPipeFilter(logging.Filter):
"""Filter to suppress BrokenPipeError logging errors"""
......@@ -533,8 +578,14 @@ async def catch_all_post(provider_id: str, request: Request):
def main():
    """Main entry point for the AISBF server"""
    import uvicorn

    # Resolve bind address from aisbf.json (falls back to built-in defaults).
    server_config = load_server_config()
    bind_host = server_config['host']
    bind_port = server_config['port']

    logger.info(f"Starting AI Proxy Server on http://{bind_host}:{bind_port}")
    uvicorn.run(app, host=bind_host, port=bind_port)


if __name__ == "__main__":
    main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment