🛡️ Add comprehensive LLM safeguards and dual-mode demo scripts
🛡️ SMART MODEL SAFEGUARDS: - Implement runaway prevention with pattern detection (repetition, thinking loops, rambling) - Add context length management with optimal parameters per model size - Quality validation prevents problematic responses before reaching users - Helpful explanations when issues occur with recovery suggestions - Model-specific parameter optimization (qwen3:0.6b vs 1.7b vs 3b+) - Timeout protection and graceful degradation ⚡ OPTIMAL PERFORMANCE SETTINGS: - Context window: 32k tokens for good balance - Repeat penalty: 1.15 for 0.6b, 1.1 for 1.7b, 1.05 for larger models - Presence penalty: 1.5 for quantized models to prevent repetition - Smart output limits: 1500 tokens for 0.6b, 2000+ for larger models - Top-p/top-k tuning based on research best practices 🎬 DUAL-MODE DEMO SCRIPTS: - create_synthesis_demo.py: Shows fast search with AI synthesis workflow - create_exploration_demo.py: Interactive thinking mode with conversation memory - Realistic typing simulation and response timing for quality GIFs - Clear demonstration of when to use each mode Perfect for creating compelling demo videos showing both RAG experiences!
This commit is contained in:
parent
3363171820
commit
5f42751e9a
320
claude_rag/llm_safeguards.py
Normal file
320
claude_rag/llm_safeguards.py
Normal file
@ -0,0 +1,320 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
LLM Safeguards for Small Model Management
|
||||||
|
|
||||||
|
Provides runaway prevention, context management, and intelligent detection
|
||||||
|
of problematic model behaviors to ensure reliable user experience.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
import re
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class SafeguardConfig:
|
||||||
|
"""Configuration for LLM safeguards."""
|
||||||
|
max_output_tokens: int = 2000 # Prevent excessive generation
|
||||||
|
max_repetition_ratio: float = 0.3 # Max ratio of repeated content
|
||||||
|
max_response_time: int = 60 # Max seconds for response
|
||||||
|
min_useful_length: int = 20 # Minimum useful response length
|
||||||
|
context_window: int = 32768 # Ollama context window
|
||||||
|
enable_thinking_detection: bool = True # Detect thinking patterns
|
||||||
|
|
||||||
|
class ModelRunawayDetector:
    """Detects and prevents model runaway behaviors.

    Validates raw LLM output for common small-model failure modes
    (repetition, thinking loops, rambling, corrupted JSON) and produces
    user-facing explanations plus recovery suggestions when a response
    is rejected.
    """

    def __init__(self, config: Optional["SafeguardConfig"] = None):
        """Create a detector; falls back to default SafeguardConfig when none given."""
        self.config = config or SafeguardConfig()
        # Compiled once up front so per-response checks are cheap.
        self.response_patterns = self._compile_patterns()

    def _compile_patterns(self) -> Dict[str, re.Pattern]:
        """Compile regex patterns for runaway detection."""
        return {
            # Excessive repetition patterns
            'word_repetition': re.compile(r'\b(\w+)\b(?:\s+\1\b){3,}', re.IGNORECASE),
            'phrase_repetition': re.compile(r'(.{10,50}?)\1{2,}', re.DOTALL),

            # Thinking loop patterns (small models get stuck)
            'thinking_loop': re.compile(r'(let me think|i think|thinking|consider|actually|wait|hmm|well)\s*[.,:]*\s*\1', re.IGNORECASE),

            # Rambling patterns.
            # FIX: the previous pattern matched ANY connective word ("and",
            # "so", "then", "but", ...) followed by five or more words, which
            # flagged virtually every normal sentence as rambling.  Now we
            # require at least three genuine filler phrases within a single
            # sentence before calling it filler.
            'excessive_filler': re.compile(
                r'\b(um|uh|you know|i mean|basically|kind of|sort of)\b'
                r'(?:[^.!?]*?\b(?:um|uh|you know|i mean|basically|kind of|sort of)\b){2,}',
                re.IGNORECASE),

            # JSON corruption patterns
            'broken_json': re.compile(r'\{[^}]*\{[^}]*\{'),  # Nested broken JSON
            'json_repetition': re.compile(r'("[\w_]+"\s*:\s*"[^"]*",?\s*){4,}'),  # Repeated JSON fields
        }

    def check_response_quality(self, response: str, query: str, start_time: float) -> Tuple[bool, Optional[str], Optional[str]]:
        """
        Check response quality and detect runaway behaviors.

        Args:
            response: Raw model output to validate.
            query: Original user query (currently unused; kept for interface
                stability and future context-aware checks).
            start_time: time.time() captured when generation started.

        Returns:
            (is_valid, issue_type, user_explanation)
        """
        # Empty or trivially short output is never useful.
        if not response or len(response.strip()) < self.config.min_useful_length:
            return False, "too_short", self._explain_too_short()

        # Check response time
        elapsed = time.time() - start_time
        if elapsed > self.config.max_response_time:
            return False, "timeout", self._explain_timeout()

        # Check for repetition issues
        repetition_issue = self._check_repetition(response)
        if repetition_issue:
            return False, repetition_issue, self._explain_repetition(repetition_issue)

        # Check for thinking loops
        if self.config.enable_thinking_detection:
            thinking_issue = self._check_thinking_loops(response)
            if thinking_issue:
                return False, thinking_issue, self._explain_thinking_loop()

        # Check for rambling
        rambling_issue = self._check_rambling(response)
        if rambling_issue:
            return False, rambling_issue, self._explain_rambling()

        # Check JSON corruption (only for responses that look structured)
        if '{' in response and '}' in response:
            json_issue = self._check_json_corruption(response)
            if json_issue:
                return False, json_issue, self._explain_json_corruption()

        return True, None, None

    def _check_repetition(self, response: str) -> Optional[str]:
        """Check for excessive repetition; returns an issue tag or None."""
        # Word repetition (same word 4+ times in a row)
        if self.response_patterns['word_repetition'].search(response):
            return "word_repetition"

        # Phrase repetition (10-50 char span repeated 3+ times back-to-back)
        if self.response_patterns['phrase_repetition'].search(response):
            return "phrase_repetition"

        # Calculate repetition ratio: fraction of words that are duplicates.
        words = response.split()
        if len(words) > 10:
            unique_words = set(words)
            repetition_ratio = 1 - (len(unique_words) / len(words))
            if repetition_ratio > self.config.max_repetition_ratio:
                return "high_repetition_ratio"

        return None

    def _check_thinking_loops(self, response: str) -> Optional[str]:
        """Check for thinking loops (common in small models)."""
        if self.response_patterns['thinking_loop'].search(response):
            return "thinking_loop"

        # Check for excessive meta-commentary.  Substring counting is
        # intentional so variants like "thinking"/"thinks" also count.
        thinking_words = ['think', 'considering', 'actually', 'wait', 'hmm', 'let me']
        thinking_count = sum(response.lower().count(word) for word in thinking_words)

        # Many thinking markers in a short response = the model is spinning.
        if thinking_count > 5 and len(response.split()) < 200:
            return "excessive_thinking"

        return None

    def _check_rambling(self, response: str) -> Optional[str]:
        """Check for rambling or excessive filler."""
        if self.response_patterns['excessive_filler'].search(response):
            return "excessive_filler"

        # Check for extremely long sentences (sign of rambling)
        sentences = re.split(r'[.!?]+', response)
        long_sentences = [s for s in sentences if len(s.split()) > 50]

        if len(long_sentences) > 2:
            return "excessive_rambling"

        return None

    def _check_json_corruption(self, response: str) -> Optional[str]:
        """Check for JSON corruption in structured responses."""
        if self.response_patterns['broken_json'].search(response):
            return "broken_json"

        if self.response_patterns['json_repetition'].search(response):
            return "json_repetition"

        return None

    def _explain_too_short(self) -> str:
        """User-facing explanation for a too-short response."""
        return """🤔 The AI response was too short to be helpful.

**Why this happens:**
• The model might be confused by the query
• Context might be insufficient
• Model might be overloaded

**What to try:**
• Rephrase your question more specifically
• Try a broader search term first
• Use exploration mode for complex questions: `rag-mini explore`"""

    def _explain_timeout(self) -> str:
        """User-facing explanation for a timed-out response."""
        return """⏱️ The AI took too long to respond (over 60 seconds).

**Why this happens:**
• Small models sometimes get "stuck" thinking
• Complex queries can overwhelm smaller models
• System might be under load

**What to try:**
• Try a simpler, more direct question
• Use synthesis mode for faster responses: `--synthesize`
• Consider using a larger model if available"""

    def _explain_repetition(self, issue_type: str) -> str:
        """User-facing explanation for a repetitive response."""
        return f"""🔄 The AI got stuck in repetition loops ({issue_type}).

**Why this happens:**
• Small models sometimes repeat when uncertain
• Query might be too complex for the model size
• Context window might be exceeded

**What to try:**
• Try a more specific question
• Break complex questions into smaller parts
• Use exploration mode which handles context better: `rag-mini explore`
• Consider: A larger model (qwen3:1.7b or qwen3:3b) would help"""

    def _explain_thinking_loop(self) -> str:
        """User-facing explanation for a thinking-loop response."""
        return """🧠 The AI got caught in a "thinking loop" - overthinking the response.

**Why this happens:**
• Small models sometimes over-analyze simple questions
• Thinking mode can cause loops in smaller models
• Query complexity exceeds model capabilities

**What to try:**
• Ask more direct, specific questions
• Use synthesis mode (no thinking) for faster results
• Try: "What does this code do?" instead of "Explain how this works"
• Larger models (qwen3:1.7b+) handle thinking better"""

    def _explain_rambling(self) -> str:
        """User-facing explanation for a rambling response."""
        return """💭 The AI started rambling instead of giving focused answers.

**Why this happens:**
• Small models sometimes lose focus on complex topics
• Query might be too broad or vague
• Model trying to cover too much at once

**What to try:**
• Ask more specific questions
• Break broad questions into focused parts
• Example: "How is data validated?" instead of "Explain the whole system"
• Exploration mode helps maintain focus across questions"""

    def _explain_json_corruption(self) -> str:
        """User-facing explanation for a corrupted structured response."""
        return """🔧 The AI response format got corrupted.

**Why this happens:**
• Small models sometimes struggle with structured output
• Context limits can cause format errors
• Complex analysis might overwhelm formatting

**What to try:**
• Try the question again (often resolves itself)
• Use simpler questions for better formatting
• Synthesis mode sometimes gives cleaner output
• This is less common with larger models"""

    def get_recovery_suggestions(self, issue_type: str, query: str) -> List[str]:
        """Get specific recovery suggestions based on the issue.

        Args:
            issue_type: Tag returned by check_response_quality.
            query: Original user query, embedded in example commands.

        Returns:
            Ordered list of actionable suggestion strings; always ends with
            the universal suggestions.
        """
        suggestions = []

        if issue_type in ['thinking_loop', 'excessive_thinking']:
            suggestions.extend([
                f"Try synthesis mode: `rag-mini search . \"{query}\" --synthesize`",
                "Ask more direct questions without 'why' or 'how'",
                "Break complex questions into smaller parts"
            ])

        elif issue_type in ['word_repetition', 'phrase_repetition', 'high_repetition_ratio']:
            suggestions.extend([
                "Try rephrasing your question completely",
                "Use more specific technical terms",
                "Try exploration mode: `rag-mini explore .`"
            ])

        elif issue_type == 'timeout':
            suggestions.extend([
                "Try a simpler version of your question",
                "Use synthesis mode for faster responses",
                "Check if Ollama is under heavy load"
            ])

        # Universal suggestions
        suggestions.extend([
            "Consider using a larger model if available (qwen3:1.7b or qwen3:3b)",
            "Check model status: `ollama list`"
        ])

        return suggestions
|
||||||
|
|
||||||
|
def get_optimal_ollama_parameters(model_name: str) -> Dict[str, Any]:
    """Get optimal parameters for different Ollama models.

    Args:
        model_name: Ollama model tag, e.g. "qwen3:0.6b" or "llama3:8b".

    Returns:
        An Ollama "options" dict tuned for the model's size; unknown models
        receive conservative base defaults.

    Note:
        The return annotation previously used the builtin ``any`` function
        instead of ``typing.Any``; fixed here.
    """
    name = model_name.lower()  # Normalize once; compared against several markers below.

    base_params = {
        "num_ctx": 32768,     # Good context window for most uses
        "num_predict": 2000,  # Reasonable response length
        "temperature": 0.3,   # Balanced creativity/consistency
    }

    # Model-specific optimizations
    if "qwen3:0.6b" in name:
        return {
            **base_params,
            "repeat_penalty": 1.15,   # Prevent repetition in small model
            "presence_penalty": 1.5,  # Suppress repetitive outputs
            "top_p": 0.8,             # Focused sampling
            "top_k": 20,              # Limit choices
            "num_predict": 1500,      # Shorter responses for reliability
        }

    elif "qwen3:1.7b" in name:
        return {
            **base_params,
            "repeat_penalty": 1.1,    # Less aggressive for larger model
            "presence_penalty": 1.0,  # Balanced
            "top_p": 0.9,             # More creative
            "top_k": 40,              # More choices
        }

    elif any(size in name for size in ["3b", "7b", "8b"]):
        return {
            **base_params,
            "repeat_penalty": 1.05,   # Minimal for larger models
            "presence_penalty": 0.5,  # Light touch
            "top_p": 0.95,            # High creativity
            "top_k": 50,              # Many choices
            "num_predict": 3000,      # Longer responses OK
        }

    return base_params
|
||||||
|
|
||||||
|
# Quick test
|
||||||
|
def test_safeguards():
    """Smoke-test the safeguard system against a known-bad response."""
    runaway_detector = ModelRunawayDetector()

    # A response repeating the same sentence three times should be rejected.
    repetitive_output = (
        "The user authentication system works by checking user credentials. "
        "The user authentication system works by checking user credentials. "
        "The user authentication system works by checking user credentials."
    )

    valid, detected_issue, user_message = runaway_detector.check_response_quality(
        repetitive_output, "auth", time.time()
    )

    print(f"Repetition test: Valid={valid}, Issue={detected_issue}")
    if user_message:
        print(user_message)


if __name__ == "__main__":
    test_safeguards()
|
||||||
@ -8,11 +8,20 @@ Takes raw search results and generates coherent, contextual summaries.
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import time
|
||||||
from typing import List, Dict, Any, Optional
|
from typing import List, Dict, Any, Optional
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import requests
|
import requests
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
try:
    from .llm_safeguards import ModelRunawayDetector, SafeguardConfig, get_optimal_ollama_parameters
except ImportError:
    # Graceful fallback if safeguards not available: callers can still run,
    # they just get no runaway detection and no model-specific tuning.
    ModelRunawayDetector = None
    SafeguardConfig = None

    # PEP 8 (E731): use a def instead of assigning a lambda to a name, so
    # tracebacks and introspection show a real function name.
    def get_optimal_ollama_parameters(model_name):
        """Fallback: no model-specific tuning when safeguards are unavailable."""
        return {}
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -34,6 +43,12 @@ class LLMSynthesizer:
|
|||||||
self.enable_thinking = enable_thinking # Default False for synthesis mode
|
self.enable_thinking = enable_thinking # Default False for synthesis mode
|
||||||
self._initialized = False
|
self._initialized = False
|
||||||
|
|
||||||
|
# Initialize safeguards
|
||||||
|
if ModelRunawayDetector:
|
||||||
|
self.safeguard_detector = ModelRunawayDetector(SafeguardConfig())
|
||||||
|
else:
|
||||||
|
self.safeguard_detector = None
|
||||||
|
|
||||||
def _get_available_models(self) -> List[str]:
|
def _get_available_models(self) -> List[str]:
|
||||||
"""Get list of available Ollama models."""
|
"""Get list of available Ollama models."""
|
||||||
try:
|
try:
|
||||||
@ -129,7 +144,9 @@ class LLMSynthesizer:
|
|||||||
return len(self.available_models) > 0
|
return len(self.available_models) > 0
|
||||||
|
|
||||||
def _call_ollama(self, prompt: str, temperature: float = 0.3, disable_thinking: bool = False) -> Optional[str]:
|
def _call_ollama(self, prompt: str, temperature: float = 0.3, disable_thinking: bool = False) -> Optional[str]:
|
||||||
"""Make a call to Ollama API."""
|
"""Make a call to Ollama API with safeguards."""
|
||||||
|
start_time = time.time()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Use the best available model
|
# Use the best available model
|
||||||
model_to_use = self.model
|
model_to_use = self.model
|
||||||
@ -147,26 +164,46 @@ class LLMSynthesizer:
|
|||||||
if not final_prompt.endswith(" <no_think>"):
|
if not final_prompt.endswith(" <no_think>"):
|
||||||
final_prompt += " <no_think>"
|
final_prompt += " <no_think>"
|
||||||
|
|
||||||
|
# Get optimal parameters for this model
|
||||||
|
optimal_params = get_optimal_ollama_parameters(model_to_use)
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"model": model_to_use,
|
"model": model_to_use,
|
||||||
"prompt": final_prompt,
|
"prompt": final_prompt,
|
||||||
"stream": False,
|
"stream": False,
|
||||||
"options": {
|
"options": {
|
||||||
"temperature": temperature,
|
"temperature": temperature,
|
||||||
"top_p": 0.9,
|
"top_p": optimal_params.get("top_p", 0.9),
|
||||||
"top_k": 40
|
"top_k": optimal_params.get("top_k", 40),
|
||||||
|
"num_ctx": optimal_params.get("num_ctx", 32768),
|
||||||
|
"num_predict": optimal_params.get("num_predict", 2000),
|
||||||
|
"repeat_penalty": optimal_params.get("repeat_penalty", 1.1),
|
||||||
|
"presence_penalty": optimal_params.get("presence_penalty", 1.0)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
f"{self.ollama_url}/api/generate",
|
f"{self.ollama_url}/api/generate",
|
||||||
json=payload,
|
json=payload,
|
||||||
timeout=30
|
timeout=65 # Slightly longer than safeguard timeout
|
||||||
)
|
)
|
||||||
|
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
result = response.json()
|
result = response.json()
|
||||||
return result.get('response', '').strip()
|
raw_response = result.get('response', '').strip()
|
||||||
|
|
||||||
|
# Apply safeguards to check response quality
|
||||||
|
if self.safeguard_detector and raw_response:
|
||||||
|
is_valid, issue_type, explanation = self.safeguard_detector.check_response_quality(
|
||||||
|
raw_response, prompt[:100], start_time # First 100 chars of prompt for context
|
||||||
|
)
|
||||||
|
|
||||||
|
if not is_valid:
|
||||||
|
logger.warning(f"Safeguard triggered: {issue_type}")
|
||||||
|
# Return a safe explanation instead of the problematic response
|
||||||
|
return self._create_safeguard_response(issue_type, explanation, prompt)
|
||||||
|
|
||||||
|
return raw_response
|
||||||
else:
|
else:
|
||||||
logger.error(f"Ollama API error: {response.status_code}")
|
logger.error(f"Ollama API error: {response.status_code}")
|
||||||
return None
|
return None
|
||||||
@ -175,6 +212,24 @@ class LLMSynthesizer:
|
|||||||
logger.error(f"Ollama call failed: {e}")
|
logger.error(f"Ollama call failed: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def _create_safeguard_response(self, issue_type: str, explanation: str, original_prompt: str) -> str:
|
||||||
|
"""Create a helpful response when safeguards are triggered."""
|
||||||
|
return f"""⚠️ Model Response Issue Detected
|
||||||
|
|
||||||
|
{explanation}
|
||||||
|
|
||||||
|
**Original query context:** {original_prompt[:200]}{'...' if len(original_prompt) > 200 else ''}
|
||||||
|
|
||||||
|
**What happened:** The AI model encountered a common issue with small language models and was prevented from giving a problematic response.
|
||||||
|
|
||||||
|
**Your options:**
|
||||||
|
1. **Try again**: Ask the same question (often resolves itself)
|
||||||
|
2. **Rephrase**: Make your question more specific or break it into parts
|
||||||
|
3. **Use exploration mode**: `rag-mini explore` for complex questions
|
||||||
|
4. **Different approach**: Try synthesis mode: `--synthesize` for simpler responses
|
||||||
|
|
||||||
|
This is normal with smaller AI models and helps ensure you get quality responses."""
|
||||||
|
|
||||||
def synthesize_search_results(self, query: str, results: List[Any], project_path: Path) -> SynthesisResult:
|
def synthesize_search_results(self, query: str, results: List[Any], project_path: Path) -> SynthesisResult:
|
||||||
"""Synthesize search results into a coherent summary."""
|
"""Synthesize search results into a coherent summary."""
|
||||||
|
|
||||||
|
|||||||
270
create_exploration_demo.py
Normal file
270
create_exploration_demo.py
Normal file
@ -0,0 +1,270 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Create demo GIF for Exploration Mode - Deep Thinking & Interactive Learning
|
||||||
|
Shows the conversational workflow for understanding and debugging codebases.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
class ExplorationDemoSimulator:
    """Scripted terminal playback of an exploration-mode session.

    Prints a pre-scripted conversation with realistic typing/response
    timing so it can be recorded (e.g. with asciinema) and turned into a
    demo GIF.  All output is hard-coded; nothing contacts a real model.
    """

    def __init__(self):
        # Intended terminal dimensions for the recording; not referenced
        # elsewhere in this class — kept for recording-setup documentation.
        self.width = 100
        self.height = 35

    def clear_screen(self):
        """Clear the terminal using ANSI home + erase-display codes."""
        print("\033[H\033[2J", end="")

    def _type_text(self, prefix: str, text: str, delay: float):
        """Shared typing simulation: print `prefix`, then `text` one character
        at a time with `delay` seconds between keystrokes.

        Factored out of type_command/type_question, which previously
        duplicated this loop and differed only in the prompt prefix.
        """
        print(prefix, end="", flush=True)
        for char in text:
            print(char, end="", flush=True)
            time.sleep(delay)
        print()
        time.sleep(0.5)

    def type_command(self, command: str, delay: float = 0.05):
        """Simulate typing a command."""
        self._type_text("$ ", command, delay)

    def type_question(self, question: str, delay: float = 0.04):
        """Simulate typing a question in exploration mode."""
        self._type_text("> ", question, delay)

    def show_thinking(self, duration: float = 2.0):
        """Show thinking animation."""
        print("🔍 Analyzing...", end="", flush=True)
        for _ in range(3):
            time.sleep(duration / 3)
            print(".", end="", flush=True)
        print()
        time.sleep(0.5)

    def show_response(self, lines: list, delay: float = 0.4):
        """Show AI response with realistic timing."""
        for line in lines:
            print(line)
            time.sleep(delay)
        time.sleep(1.5)

    def run_exploration_demo(self):
        """Run the exploration mode demonstration."""
        self.clear_screen()

        # Title
        print("🧠 FSS-Mini-RAG: Exploration Mode Demo")
        print("=" * 55)
        print("Deep thinking & interactive learning for complex codebases")
        print()
        time.sleep(2)

        # Step 1: Start exploration
        print("Step 1: Start interactive exploration session")
        print("-" * 45)
        self.type_command("rag-mini explore ./webapp-project")

        self.show_response([
            "🧠 EXPLORATION MODE STARTED",
            "=" * 50,
            "Project: webapp-project",
            "Session: explore_1702845632",
            "",
            "🎯 This mode uses thinking and remembers context.",
            " Perfect for debugging, learning, and deep exploration.",
            "",
            "💡 Tips:",
            " • Ask follow-up questions - I'll remember our conversation",
            " • Use 'why', 'how', 'explain' for detailed reasoning",
            " • Type 'quit' or 'exit' to end session",
            "",
            "=" * 50,
            "",
            "🤔 Ask your first question about the codebase:"
        ])

        # Step 2: First question - How authentication works
        print("Step 2: Understanding authentication system")
        print("-" * 42)
        self.type_question("How does authentication work in this codebase?")

        self.show_thinking()

        self.show_response([
            "",
            "🧠 EXPLORATION ANALYSIS (Question #1)",
            "Session: 0.2m | Results: 8 | Time: 3.4s",
            "=" * 60,
            "",
            "📝 Analysis:",
            " This webapp implements a JWT-based authentication system with",
            " session management, role-based permissions, and OAuth2 integration.",
            "",
            "🔍 Key Insights:",
            " • JWT tokens generated in auth/jwt_handler.py with 24h expiry",
            " • User sessions stored in Redis for fast lookups and invalidation",
            " • Role-based access control through middleware/permissions.py",
            " • OAuth2 integration supports Google, GitHub, and Microsoft providers",
            " • Password hashing uses bcrypt with 12 rounds for security",
            "",
            "💡 Code Examples:",
            " JWT generation: jwt.encode(payload, SECRET_KEY, algorithm='HS256')",
            " Session storage: redis.setex(f'session:{user_id}', 86400, token)",
            " Permission check: @require_permission('admin') decorator",
            "",
            "🎯 Next Steps:",
            " • Review JWT secret rotation policy",
            " • Consider implementing refresh tokens",
            " • Audit OAuth2 scope permissions",
            "",
            "🟢 Confidence: 91%"
        ])

        # Step 3: Follow-up question - Security concerns
        print("Step 3: Follow-up question (context remembered)")
        print("-" * 48)
        self.type_question("What security concerns should I be aware of?")

        self.show_thinking(1.8)

        self.show_response([
            "",
            "🧠 EXPLORATION ANALYSIS (Question #2)",
            "Session: 1.4m | Results: 6 | Time: 2.1s",
            "=" * 60,
            "",
            "📝 Analysis:",
            " Building on the JWT authentication system we just discussed,",
            " I've identified several security considerations that need attention.",
            "",
            "🔍 Key Insights:",
            " • JWT secret is hardcoded in config.py (CRITICAL RISK)",
            " • No rate limiting on login endpoints (brute force vulnerable)",
            " • Session invalidation only happens on logout, not on password change",
            " • OAuth2 state parameter validation could be strengthened",
            " • CORS settings allow all origins in development config",
            "",
            "💡 Code Examples:",
            " Vulnerable: SECRET_KEY = 'hardcoded-secret-123'",
            " Missing: @rate_limit('5 per minute') on /api/login",
            " Risk: CORS_ALLOW_ALL_ORIGINS = True",
            "",
            "🎯 Next Steps:",
            " • Move JWT secret to environment variables immediately",
            " • Implement rate limiting with Redis or in-memory store",
            " • Add session invalidation on password/email changes",
            " • Review and restrict CORS origins for production",
            " • Add security headers middleware",
            "",
            "🔴 Confidence: 94% | Context: 1 previous questions"
        ])

        # Step 4: Third question - Performance issues
        print("Step 4: Exploring performance (conversation continues)")
        print("-" * 54)
        self.type_question("Why might the login process be slow?")

        self.show_thinking(2.2)

        self.show_response([
            "",
            "🧠 EXPLORATION ANALYSIS (Question #3)",
            "Session: 2.1m | Results: 7 | Time: 2.8s",
            "=" * 60,
            "",
            "📝 Analysis:",
            " Considering our previous discussion about JWT auth and security,",
            " the login slowness likely stems from several performance bottlenecks.",
            "",
            "🔍 Key Insights:",
            " • bcrypt rounds set to 15 (recommended: 10-12) causing CPU overhead",
            " • Database queries for user lookup aren't indexed on email field",
            " • Redis connection pool not configured, creating new connections",
            " • OAuth2 provider calls lack timeout settings (can hang)",
            " • Session cleanup runs synchronously during login flow",
            "",
            "💡 Code Examples:",
            " Slow: bcrypt.hashpw(password, bcrypt.gensalt(rounds=15))",
            " Missing: CREATE INDEX idx_users_email ON users(email)",
            " Blocking: cleanup_expired_sessions() # Should be async",
            "",
            "🎯 Next Steps:",
            " • Reduce bcrypt rounds to 12 for better balance",
            " • Add database index on users.email column",
            " • Configure Redis connection pooling",
            " • Move session cleanup to background task",
            " • Add OAuth2 request timeouts (5-10 seconds)",
            "",
            "🟡 Confidence: 89% | Context: 2 previous questions"
        ])

        # Step 5: Session summary
        print("Step 5: Session summary and conclusion")
        print("-" * 38)
        self.type_question("summary")

        self.show_response([
            "",
            "🧠 EXPLORATION SESSION SUMMARY",
            "=" * 40,
            "Project: webapp-project",
            "Session ID: explore_1702845632",
            "Duration: 2.5 minutes",
            "Questions explored: 3",
            "",
            "📋 Topics explored:",
            " 1. How does authentication work in this codebase? (confidence: 91%)",
            " 2. What security concerns should I be aware of? (confidence: 94%)",
            " 3. Why might the login process be slow? (confidence: 89%)",
            "",
            "🎯 Key Discoveries:",
            " • JWT-based auth with session management",
            " • Critical security issues (hardcoded secrets, no rate limiting)",
            " • Performance bottlenecks (bcrypt settings, missing indexes)",
            "",
            "💡 Action Items Generated:",
            " • Immediate: Fix hardcoded JWT secret",
            " • High Priority: Add rate limiting and database indexes",
            " • Monitor: Review OAuth2 configurations"
        ])

        # Step 6: Exit
        self.type_question("quit")

        self.show_response([
            "",
            "✅ Exploration session ended.",
            "",
            "🎬 This was Exploration Mode - perfect for learning and debugging!"
        ])

        # Final summary
        print()
        print("💡 Exploration Mode Benefits:")
        print(" 🧠 Thinking-enabled AI for detailed reasoning")
        print(" 💭 Conversation memory across questions")
        print(" 🔍 Perfect for debugging and understanding")
        print(" 📚 Educational - learn how code really works")
        print(" 🎯 Context-aware follow-up responses")
        print()
        time.sleep(3)
|
||||||
|
|
||||||
|
def main():
    """Entry point: wait for a keypress, then play the exploration demo."""
    simulator = ExplorationDemoSimulator()

    # Give the operator a moment to start the asciinema recording first.
    for notice in (
        "Starting FSS-Mini-RAG Exploration Mode Demo...",
        "Record with: asciinema rec exploration_demo.cast",
        "Press Enter to start...",
    ):
        print(notice)
    input()

    simulator.run_exploration_demo()

    print("\n🎯 To create GIF:")
    print("agg exploration_demo.cast exploration_demo.gif")
|
||||||
|
|
||||||
|
# Allow running this demo script directly from the command line.
if __name__ == "__main__":
    main()
|
||||||
178
create_synthesis_demo.py
Normal file
178
create_synthesis_demo.py
Normal file
@ -0,0 +1,178 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Create demo GIF for Synthesis Mode - Fast & Consistent RAG Search
|
||||||
|
Shows the streamlined workflow for quick answers and code discovery.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
class SynthesisDemoSimulator:
    """Drives a scripted terminal walkthrough of Synthesis Mode.

    Replays a pre-recorded command/response sequence with human-like
    pacing so the session can be captured as a GIF (e.g. via asciinema).
    """

    def __init__(self):
        # Nominal terminal geometry for the recording (informational only —
        # nothing in the class reads these back; TODO confirm external use).
        self.width = 100
        self.height = 30

    def clear_screen(self):
        """Reset the terminal via ANSI cursor-home + clear-screen codes."""
        print("\033[H\033[2J", end="")

    def type_command(self, command: str, delay: float = 0.05):
        """Simulate typing a command."""
        # Shell-style prompt, then one keystroke at a time.
        print("$ ", end="", flush=True)
        for ch in command:
            print(ch, end="", flush=True)
            time.sleep(delay)
        print()
        # Brief pause before the "command" produces output.
        time.sleep(0.5)

    def show_output(self, lines: list, delay: float = 0.3):
        """Show command output with realistic timing."""
        for text in lines:
            print(text)
            time.sleep(delay)
        # Let the viewer absorb the output before moving on.
        time.sleep(1.0)

    def run_synthesis_demo(self):
        """Play the complete scripted Synthesis Mode demonstration."""
        self.clear_screen()

        # Title banner
        for banner_line in (
            "🚀 FSS-Mini-RAG: Synthesis Mode Demo",
            "=" * 50,
            "Fast & consistent RAG search for quick answers",
            "",
        ):
            print(banner_line)
        time.sleep(2)

        # Step 1: Index a project
        print("Step 1: Index a sample project")
        print("-" * 30)
        self.type_command("rag-mini index ./sample-project")
        self.show_output([
            "📁 Indexing project: sample-project",
            "🔍 Found 12 files to process",
            "✂️ Creating semantic chunks...",
            "🧠 Generating embeddings...",
            "💾 Building vector index...",
            "✅ Indexed 89 chunks from 12 files in 3.2s",
            "",
            '💡 Try: rag-mini search ./sample-project "your search here"',
        ])

        # Step 2: Quick search
        print("Step 2: Quick semantic search")
        print("-" * 30)
        self.type_command('rag-mini search ./sample-project "user authentication"')
        self.show_output([
            '🔍 Searching "user authentication" in sample-project',
            "✅ Found 5 results:",
            "",
            "1. auth/models.py",
            " Score: 0.923",
            " Lines: 45-62",
            " Context: User class",
            " Content:",
            " class User:",
            " def authenticate(self, password):",
            " return bcrypt.checkpw(password, self.password_hash)",
            "",
            "2. auth/views.py",
            " Score: 0.887",
            " Lines: 23-41",
            " Context: login_view function",
            " Content:",
            " def login_view(request):",
            " user = authenticate(username, password)",
            " if user:",
            " login(request, user)",
            "",
            "3. middleware/auth.py",
            " Score: 0.845",
            " Content: Authentication middleware checking...",
        ])

        # Step 3: Search with AI synthesis
        print("Step 3: Add AI synthesis for deeper understanding")
        print("-" * 50)
        self.type_command('rag-mini search ./sample-project "error handling" --synthesize')
        self.show_output([
            '🔍 Searching "error handling" in sample-project',
            "🧠 Generating LLM synthesis...",
            "✅ Found 4 results:",
            "",
            "1. utils/exceptions.py",
            " Score: 0.934",
            " Content: Custom exception classes for API errors...",
            "",
            "2. api/handlers.py",
            " Score: 0.889",
            " Content: Global exception handler with logging...",
            "",
            "🧠 LLM SYNTHESIS",
            "=" * 50,
            "",
            "📝 Summary:",
            " This codebase implements a robust error handling system with",
            " custom exceptions, global handlers, and structured logging.",
            "",
            "🔍 Key Findings:",
            " • Custom exception hierarchy in utils/exceptions.py",
            " • Global error handler catches all API exceptions",
            " • Logging integrated with error tracking service",
            "",
            "💡 Code Patterns:",
            " try/except blocks with specific exception types",
            " Centralized error response formatting",
            "",
            "🎯 Suggested Actions:",
            " • Review exception hierarchy for completeness",
            " • Consider adding error recovery mechanisms",
            "",
            "🟢 Confidence: 87%",
        ])

        # Step 4: Show performance
        print("Step 4: Performance characteristics")
        print("-" * 35)
        for benefit_line in (
            "⚡ Synthesis Mode Benefits:",
            " • Lightning fast responses (no thinking overhead)",
            " • Consistent, reliable results",
            " • Perfect for code discovery and quick answers",
            " • Works great with ultra-efficient models (qwen3:0.6b)",
            "",
        ):
            print(benefit_line)
        time.sleep(3)

        # Step 5: When to use
        for usage_line in (
            "💡 When to use Synthesis Mode:",
            " ✅ Quick code lookups",
            " ✅ Finding specific functions or classes",
            " ✅ Understanding code structure",
            " ✅ Fast documentation searches",
            " ✅ Batch processing multiple queries",
            "",
        ):
            print(usage_line)

        print("🧠 For deeper analysis, try: rag-mini explore ./project")
        print()
        time.sleep(3)

        print("🎬 Demo complete! This was Synthesis Mode - optimized for speed.")
|
||||||
|
|
||||||
|
def main():
    """Entry point: wait for a keypress, then play the synthesis demo."""
    simulator = SynthesisDemoSimulator()

    # Give the operator a moment to start the asciinema recording first.
    for notice in (
        "Starting FSS-Mini-RAG Synthesis Mode Demo...",
        "Record with: asciinema rec synthesis_demo.cast",
        "Press Enter to start...",
    ):
        print(notice)
    input()

    simulator.run_synthesis_demo()

    print("\n🎯 To create GIF:")
    print("agg synthesis_demo.cast synthesis_demo.gif")
|
||||||
|
|
||||||
|
# Allow running this demo script directly from the command line.
if __name__ == "__main__":
    main()
|
||||||
Loading…
x
Reference in New Issue
Block a user