🛡️ Add comprehensive LLM safeguards and dual-mode demo scripts

🛡️ SMART MODEL SAFEGUARDS:
- Implement runaway prevention with pattern detection (repetition, thinking loops, rambling); usage sketched below
- Add context length management with optimal parameters per model size
- Quality validation catches problematic responses before they reach users
- Helpful explanations and recovery suggestions when issues occur
- Model-specific parameter optimization (qwen3:0.6b vs 1.7b vs 3b+)
- Timeout protection and graceful degradation
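
A minimal sketch of the intended call pattern (names come from the llm_safeguards module in the diff below; call_model is a hypothetical stand-in for the real Ollama request, and the import assumes the module is on your path):

    import time
    from llm_safeguards import ModelRunawayDetector, SafeguardConfig

    detector = ModelRunawayDetector(SafeguardConfig(max_response_time=60))
    start = time.time()
    answer = call_model("How does auth work?")  # hypothetical stand-in
    ok, issue, explanation = detector.check_response_quality(answer, "How does auth work?", start)
    if not ok:
        print(explanation)  # friendly diagnosis for the user
        for tip in detector.get_recovery_suggestions(issue, "How does auth work?"):
            print(f"  - {tip}")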

OPTIMAL PERFORMANCE SETTINGS:
- Context window: 32k tokens for a good balance
- Repeat penalty: 1.15 for 0.6b, 1.1 for 1.7b, 1.05 for larger models
- Presence penalty: 1.5 for the 0.6b model to prevent repetition
- Smart output limits: 1500 tokens for 0.6b, 2000+ for larger models
- Top-p/top-k tuned per model size (resulting payload sketched below)
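
For reference, these tiers feed straight into the Ollama options payload; per get_optimal_ollama_parameters in the diff below, a qwen3:0.6b call carries roughly:

    params = get_optimal_ollama_parameters("qwen3:0.6b")
    # {'num_ctx': 32768, 'num_predict': 1500, 'temperature': 0.3,
    #  'repeat_penalty': 1.15, 'presence_penalty': 1.5, 'top_p': 0.8, 'top_k': 20}
    payload = {"model": "qwen3:0.6b", "prompt": prompt, "stream": False, "options": params}
    # then POST to {ollama_url}/api/generate, as _call_ollama does below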

🎬 DUAL-MODE DEMO SCRIPTS:
- create_synthesis_demo.py: Shows fast search with AI synthesis workflow
- create_exploration_demo.py: Interactive thinking mode with conversation memory
- Realistic typing simulation and response timing for quality GIFs
- Clear demonstration of when to use each mode

Perfect for creating compelling demo videos showing both RAG experiences!
BobAi 2025-08-12 19:07:48 +10:00
parent 3363171820
commit 5f42751e9a
4 changed files with 828 additions and 5 deletions

llm_safeguards.py Normal file

@@ -0,0 +1,320 @@
#!/usr/bin/env python3
"""
LLM Safeguards for Small Model Management

Provides runaway prevention, context management, and intelligent detection
of problematic model behaviors to ensure a reliable user experience.
"""

import re
import time
import logging
from typing import Any, Dict, List, Optional, Tuple
from dataclasses import dataclass

logger = logging.getLogger(__name__)


@dataclass
class SafeguardConfig:
    """Configuration for LLM safeguards."""
    max_output_tokens: int = 2000        # Prevent excessive generation
    max_repetition_ratio: float = 0.3    # Max ratio of repeated content
    max_response_time: int = 60          # Max seconds for response
    min_useful_length: int = 20          # Minimum useful response length
    context_window: int = 32768          # Ollama context window
    enable_thinking_detection: bool = True  # Detect thinking patterns


class ModelRunawayDetector:
    """Detects and prevents model runaway behaviors."""

    def __init__(self, config: Optional[SafeguardConfig] = None):
        self.config = config or SafeguardConfig()
        self.response_patterns = self._compile_patterns()

    def _compile_patterns(self) -> Dict[str, re.Pattern]:
        """Compile regex patterns for runaway detection."""
        return {
            # Excessive repetition patterns
            'word_repetition': re.compile(r'\b(\w+)\b(?:\s+\1\b){3,}', re.IGNORECASE),
            'phrase_repetition': re.compile(r'(.{10,50}?)\1{2,}', re.DOTALL),
            # Thinking loop patterns (small models get stuck)
            'thinking_loop': re.compile(r'(let me think|i think|thinking|consider|actually|wait|hmm|well)\s*[.,:]*\s*\1', re.IGNORECASE),
            # Rambling patterns
            'excessive_filler': re.compile(r'\b(um|uh|well|you know|like|basically|actually|so|then|and|but|however)\b(?:\s+[^.!?]*){5,}', re.IGNORECASE),
            # JSON corruption patterns
            'broken_json': re.compile(r'\{[^}]*\{[^}]*\{'),  # Nested broken JSON
            'json_repetition': re.compile(r'("[\w_]+"\s*:\s*"[^"]*",?\s*){4,}'),  # Repeated JSON fields
        }

    def check_response_quality(self, response: str, query: str, start_time: float) -> Tuple[bool, Optional[str], Optional[str]]:
        """
        Check response quality and detect runaway behaviors.

        Returns:
            (is_valid, issue_type, user_explanation)
        """
        if not response or len(response.strip()) < self.config.min_useful_length:
            return False, "too_short", self._explain_too_short()

        # Check response time
        elapsed = time.time() - start_time
        if elapsed > self.config.max_response_time:
            return False, "timeout", self._explain_timeout()

        # Check for repetition issues
        repetition_issue = self._check_repetition(response)
        if repetition_issue:
            return False, repetition_issue, self._explain_repetition(repetition_issue)

        # Check for thinking loops
        if self.config.enable_thinking_detection:
            thinking_issue = self._check_thinking_loops(response)
            if thinking_issue:
                return False, thinking_issue, self._explain_thinking_loop()

        # Check for rambling
        rambling_issue = self._check_rambling(response)
        if rambling_issue:
            return False, rambling_issue, self._explain_rambling()

        # Check JSON corruption (for structured responses)
        if '{' in response and '}' in response:
            json_issue = self._check_json_corruption(response)
            if json_issue:
                return False, json_issue, self._explain_json_corruption()

        return True, None, None

    def _check_repetition(self, response: str) -> Optional[str]:
        """Check for excessive repetition."""
        # Word repetition
        if self.response_patterns['word_repetition'].search(response):
            return "word_repetition"

        # Phrase repetition
        if self.response_patterns['phrase_repetition'].search(response):
            return "phrase_repetition"

        # Calculate repetition ratio
        words = response.split()
        if len(words) > 10:
            unique_words = set(words)
            repetition_ratio = 1 - (len(unique_words) / len(words))
            if repetition_ratio > self.config.max_repetition_ratio:
                return "high_repetition_ratio"

        return None

    def _check_thinking_loops(self, response: str) -> Optional[str]:
        """Check for thinking loops (common in small models)."""
        if self.response_patterns['thinking_loop'].search(response):
            return "thinking_loop"

        # Check for excessive meta-commentary
        thinking_words = ['think', 'considering', 'actually', 'wait', 'hmm', 'let me']
        thinking_count = sum(response.lower().count(word) for word in thinking_words)
        if thinking_count > 5 and len(response.split()) < 200:
            return "excessive_thinking"

        return None

    def _check_rambling(self, response: str) -> Optional[str]:
        """Check for rambling or excessive filler."""
        if self.response_patterns['excessive_filler'].search(response):
            return "excessive_filler"

        # Check for extremely long sentences (a sign of rambling)
        sentences = re.split(r'[.!?]+', response)
        long_sentences = [s for s in sentences if len(s.split()) > 50]
        if len(long_sentences) > 2:
            return "excessive_rambling"

        return None

    def _check_json_corruption(self, response: str) -> Optional[str]:
        """Check for JSON corruption in structured responses."""
        if self.response_patterns['broken_json'].search(response):
            return "broken_json"
        if self.response_patterns['json_repetition'].search(response):
            return "json_repetition"
        return None

    def _explain_too_short(self) -> str:
        return """🤔 The AI response was too short to be helpful.

**Why this happens:**
- The model might be confused by the query
- Context might be insufficient
- The model might be overloaded

**What to try:**
- Rephrase your question more specifically
- Try a broader search term first
- Use exploration mode for complex questions: `rag-mini explore`"""

    def _explain_timeout(self) -> str:
        return """⏱️ The AI took too long to respond (over 60 seconds).

**Why this happens:**
- Small models sometimes get "stuck" thinking
- Complex queries can overwhelm smaller models
- The system might be under load

**What to try:**
- Try a simpler, more direct question
- Use synthesis mode for faster responses: `--synthesize`
- Consider using a larger model if available"""

    def _explain_repetition(self, issue_type: str) -> str:
        return f"""🔄 The AI got stuck in repetition loops ({issue_type}).

**Why this happens:**
- Small models sometimes repeat when uncertain
- The query might be too complex for the model size
- The context window might be exceeded

**What to try:**
- Try a more specific question
- Break complex questions into smaller parts
- Use exploration mode, which handles context better: `rag-mini explore`
- Consider: a larger model (qwen3:1.7b or qwen3:3b) would help"""

    def _explain_thinking_loop(self) -> str:
        return """🧠 The AI got caught in a "thinking loop" - overthinking the response.

**Why this happens:**
- Small models sometimes over-analyze simple questions
- Thinking mode can cause loops in smaller models
- Query complexity exceeds model capabilities

**What to try:**
- Ask more direct, specific questions
- Use synthesis mode (no thinking) for faster results
- Try: "What does this code do?" instead of "Explain how this works"
- Larger models (qwen3:1.7b+) handle thinking better"""

    def _explain_rambling(self) -> str:
        return """💭 The AI started rambling instead of giving focused answers.

**Why this happens:**
- Small models sometimes lose focus on complex topics
- The query might be too broad or vague
- The model may be trying to cover too much at once

**What to try:**
- Ask more specific questions
- Break broad questions into focused parts
- Example: "How is data validated?" instead of "Explain the whole system"
- Exploration mode helps maintain focus across questions"""

    def _explain_json_corruption(self) -> str:
        return """🔧 The AI response format got corrupted.

**Why this happens:**
- Small models sometimes struggle with structured output
- Context limits can cause format errors
- Complex analysis might overwhelm formatting

**What to try:**
- Try the question again (this often resolves itself)
- Use simpler questions for better formatting
- Synthesis mode sometimes gives cleaner output
- This is less common with larger models"""

    def get_recovery_suggestions(self, issue_type: str, query: str) -> List[str]:
        """Get specific recovery suggestions based on the issue."""
        suggestions = []

        if issue_type in ['thinking_loop', 'excessive_thinking']:
            suggestions.extend([
                f"Try synthesis mode: `rag-mini search . \"{query}\" --synthesize`",
                "Ask more direct questions without 'why' or 'how'",
                "Break complex questions into smaller parts"
            ])
        elif issue_type in ['word_repetition', 'phrase_repetition', 'high_repetition_ratio']:
            suggestions.extend([
                "Try rephrasing your question completely",
                "Use more specific technical terms",
                "Try exploration mode: `rag-mini explore .`"
            ])
        elif issue_type == 'timeout':
            suggestions.extend([
                "Try a simpler version of your question",
                "Use synthesis mode for faster responses",
                "Check if Ollama is under heavy load"
            ])

        # Universal suggestions
        suggestions.extend([
            "Consider using a larger model if available (qwen3:1.7b or qwen3:3b)",
            "Check model status: `ollama list`"
        ])
        return suggestions


def get_optimal_ollama_parameters(model_name: str) -> Dict[str, Any]:
    """Get optimal parameters for different Ollama models."""
    base_params = {
        "num_ctx": 32768,     # Good context window for most uses
        "num_predict": 2000,  # Reasonable response length
        "temperature": 0.3,   # Balanced creativity/consistency
    }

    # Model-specific optimizations
    if "qwen3:0.6b" in model_name.lower():
        return {
            **base_params,
            "repeat_penalty": 1.15,   # Prevent repetition in the small model
            "presence_penalty": 1.5,  # Suppress repetitive outputs
            "top_p": 0.8,             # Focused sampling
            "top_k": 20,              # Limit choices
            "num_predict": 1500,      # Shorter responses for reliability
        }
    elif "qwen3:1.7b" in model_name.lower():
        return {
            **base_params,
            "repeat_penalty": 1.1,    # Less aggressive for the larger model
            "presence_penalty": 1.0,  # Balanced
            "top_p": 0.9,             # More creative
            "top_k": 40,              # More choices
        }
    elif any(size in model_name.lower() for size in ["3b", "7b", "8b"]):
        return {
            **base_params,
            "repeat_penalty": 1.05,   # Minimal for larger models
            "presence_penalty": 0.5,  # Light touch
            "top_p": 0.95,            # High creativity
            "top_k": 50,              # Many choices
            "num_predict": 3000,      # Longer responses OK
        }

    return base_params


# Quick test
def test_safeguards():
    """Test the safeguard system."""
    detector = ModelRunawayDetector()

    # Test repetition detection
    bad_response = "The user authentication system works by checking user credentials. The user authentication system works by checking user credentials. The user authentication system works by checking user credentials."
    is_valid, issue, explanation = detector.check_response_quality(bad_response, "auth", time.time())
    print(f"Repetition test: Valid={is_valid}, Issue={issue}")
    if explanation:
        print(explanation)


if __name__ == "__main__":
    test_safeguards()
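
A quick way to eyeball the tiers above (a sketch; expected output derived from the code, and names matching no tier fall back to base_params):

    for name in ["qwen3:0.6b", "qwen3:1.7b", "llama3:8b", "mystery-model"]:
        p = get_optimal_ollama_parameters(name)
        print(name, p.get("repeat_penalty", "-"), p["num_predict"])
    # qwen3:0.6b 1.15 1500
    # qwen3:1.7b 1.1 2000
    # llama3:8b 1.05 3000
    # mystery-model - 2000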

@@ -8,11 +8,20 @@ Takes raw search results and generates coherent, contextual summaries.
 import json
 import logging
 import time
 from typing import List, Dict, Any, Optional
 from dataclasses import dataclass

 import requests
 from pathlib import Path

+try:
+    from .llm_safeguards import ModelRunawayDetector, SafeguardConfig, get_optimal_ollama_parameters
+except ImportError:
+    # Graceful fallback if safeguards not available
+    ModelRunawayDetector = None
+    SafeguardConfig = None
+    get_optimal_ollama_parameters = lambda x: {}
+
 logger = logging.getLogger(__name__)

 @dataclass
@@ -34,6 +43,12 @@ class LLMSynthesizer:
         self.enable_thinking = enable_thinking  # Default False for synthesis mode
         self._initialized = False

+        # Initialize safeguards
+        if ModelRunawayDetector:
+            self.safeguard_detector = ModelRunawayDetector(SafeguardConfig())
+        else:
+            self.safeguard_detector = None
+
     def _get_available_models(self) -> List[str]:
         """Get list of available Ollama models."""
         try:
@@ -129,7 +144,9 @@
         return len(self.available_models) > 0

     def _call_ollama(self, prompt: str, temperature: float = 0.3, disable_thinking: bool = False) -> Optional[str]:
-        """Make a call to Ollama API."""
+        """Make a call to Ollama API with safeguards."""
+        start_time = time.time()
         try:
             # Use the best available model
             model_to_use = self.model
@@ -147,26 +164,46 @@
             if not final_prompt.endswith(" <no_think>"):
                 final_prompt += " <no_think>"

+            # Get optimal parameters for this model
+            optimal_params = get_optimal_ollama_parameters(model_to_use)
+
             payload = {
                 "model": model_to_use,
                 "prompt": final_prompt,
                 "stream": False,
                 "options": {
                     "temperature": temperature,
-                    "top_p": 0.9,
-                    "top_k": 40
+                    "top_p": optimal_params.get("top_p", 0.9),
+                    "top_k": optimal_params.get("top_k", 40),
+                    "num_ctx": optimal_params.get("num_ctx", 32768),
+                    "num_predict": optimal_params.get("num_predict", 2000),
+                    "repeat_penalty": optimal_params.get("repeat_penalty", 1.1),
+                    "presence_penalty": optimal_params.get("presence_penalty", 1.0)
                 }
             }

             response = requests.post(
                 f"{self.ollama_url}/api/generate",
                 json=payload,
-                timeout=30
+                timeout=65  # Slightly longer than safeguard timeout
             )

             if response.status_code == 200:
                 result = response.json()
-                return result.get('response', '').strip()
+                raw_response = result.get('response', '').strip()
+
+                # Apply safeguards to check response quality
+                if self.safeguard_detector and raw_response:
+                    is_valid, issue_type, explanation = self.safeguard_detector.check_response_quality(
+                        raw_response, prompt[:100], start_time  # First 100 chars of prompt for context
+                    )
+                    if not is_valid:
+                        logger.warning(f"Safeguard triggered: {issue_type}")
+                        # Return a safe explanation instead of the problematic response
+                        return self._create_safeguard_response(issue_type, explanation, prompt)
+
+                return raw_response
             else:
                 logger.error(f"Ollama API error: {response.status_code}")
                 return None
@@ -175,6 +212,24 @@
             logger.error(f"Ollama call failed: {e}")
             return None

+    def _create_safeguard_response(self, issue_type: str, explanation: str, original_prompt: str) -> str:
+        """Create a helpful response when safeguards are triggered."""
+        return f"""⚠️ Model Response Issue Detected
+
+{explanation}
+
+**Original query context:** {original_prompt[:200]}{'...' if len(original_prompt) > 200 else ''}
+
+**What happened:** The AI model encountered a common issue with small language models and was prevented from giving a problematic response.
+
+**Your options:**
+1. **Try again**: Ask the same question (often resolves itself)
+2. **Rephrase**: Make your question more specific or break it into parts
+3. **Use exploration mode**: `rag-mini explore` for complex questions
+4. **Different approach**: Try synthesis mode (`--synthesize`) for simpler responses
+
+This is normal with smaller AI models and helps ensure you get quality responses."""
+
     def synthesize_search_results(self, query: str, results: List[Any], project_path: Path) -> SynthesisResult:
         """Synthesize search results into a coherent summary."""

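Because the safeguard fallback is returned through the normal response path, a caller can detect it by its fixed header line. A hypothetical helper (not part of this commit) that applies the "try again" advice once before surfacing the explanation:

    def call_with_one_retry(synthesizer, prompt: str) -> str:
        # Hypothetical wrapper: retry once when the safeguard fallback comes back.
        result = synthesizer._call_ollama(prompt)
        if result and result.startswith("⚠️ Model Response Issue Detected"):
            result = synthesizer._call_ollama(prompt)  # "Try again" often resolves it
        return result or "No response from model."
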
create_exploration_demo.py Normal file

@@ -0,0 +1,270 @@
#!/usr/bin/env python3
"""
Create demo GIF for Exploration Mode - Deep Thinking & Interactive Learning

Shows the conversational workflow for understanding and debugging codebases.
"""

import time
import sys
import os
from pathlib import Path


class ExplorationDemoSimulator:
    def __init__(self):
        self.width = 100
        self.height = 35

    def clear_screen(self):
        print("\033[H\033[2J", end="")

    def type_command(self, command: str, delay: float = 0.05):
        """Simulate typing a command."""
        print("$ ", end="", flush=True)
        for char in command:
            print(char, end="", flush=True)
            time.sleep(delay)
        print()
        time.sleep(0.5)

    def type_question(self, question: str, delay: float = 0.04):
        """Simulate typing a question in exploration mode."""
        print("> ", end="", flush=True)
        for char in question:
            print(char, end="", flush=True)
            time.sleep(delay)
        print()
        time.sleep(0.5)

    def show_thinking(self, duration: float = 2.0):
        """Show thinking animation."""
        print("🔍 Analyzing...", end="", flush=True)
        for _ in range(3):
            time.sleep(duration / 3)
            print(".", end="", flush=True)
        print()
        time.sleep(0.5)

    def show_response(self, lines: list, delay: float = 0.4):
        """Show AI response with realistic timing."""
        for line in lines:
            print(line)
            time.sleep(delay)
        time.sleep(1.5)

    def run_exploration_demo(self):
        """Run the exploration mode demonstration."""
        self.clear_screen()

        # Title
        print("🧠 FSS-Mini-RAG: Exploration Mode Demo")
        print("=" * 55)
        print("Deep thinking & interactive learning for complex codebases")
        print()
        time.sleep(2)

        # Step 1: Start exploration
        print("Step 1: Start interactive exploration session")
        print("-" * 45)
        self.type_command("rag-mini explore ./webapp-project")
        self.show_response([
            "🧠 EXPLORATION MODE STARTED",
            "=" * 50,
            "Project: webapp-project",
            "Session: explore_1702845632",
            "",
            "🎯 This mode uses thinking and remembers context.",
            " Perfect for debugging, learning, and deep exploration.",
            "",
            "💡 Tips:",
            " • Ask follow-up questions - I'll remember our conversation",
            " • Use 'why', 'how', 'explain' for detailed reasoning",
            " • Type 'quit' or 'exit' to end session",
            "",
            "=" * 50,
            "",
            "🤔 Ask your first question about the codebase:"
        ])

        # Step 2: First question - How authentication works
        print("Step 2: Understanding authentication system")
        print("-" * 42)
        self.type_question("How does authentication work in this codebase?")
        self.show_thinking()
        self.show_response([
            "",
            "🧠 EXPLORATION ANALYSIS (Question #1)",
            "Session: 0.2m | Results: 8 | Time: 3.4s",
            "=" * 60,
            "",
            "📝 Analysis:",
            " This webapp implements a JWT-based authentication system with",
            " session management, role-based permissions, and OAuth2 integration.",
            "",
            "🔍 Key Insights:",
            " • JWT tokens generated in auth/jwt_handler.py with 24h expiry",
            " • User sessions stored in Redis for fast lookups and invalidation",
            " • Role-based access control through middleware/permissions.py",
            " • OAuth2 integration supports Google, GitHub, and Microsoft providers",
            " • Password hashing uses bcrypt with 12 rounds for security",
            "",
            "💡 Code Examples:",
            " JWT generation: jwt.encode(payload, SECRET_KEY, algorithm='HS256')",
            " Session storage: redis.setex(f'session:{user_id}', 86400, token)",
            " Permission check: @require_permission('admin') decorator",
            "",
            "🎯 Next Steps:",
            " • Review JWT secret rotation policy",
            " • Consider implementing refresh tokens",
            " • Audit OAuth2 scope permissions",
            "",
            "🟢 Confidence: 91%"
        ])

        # Step 3: Follow-up question - Security concerns
        print("Step 3: Follow-up question (context remembered)")
        print("-" * 48)
        self.type_question("What security concerns should I be aware of?")
        self.show_thinking(1.8)
        self.show_response([
            "",
            "🧠 EXPLORATION ANALYSIS (Question #2)",
            "Session: 1.4m | Results: 6 | Time: 2.1s",
            "=" * 60,
            "",
            "📝 Analysis:",
            " Building on the JWT authentication system we just discussed,",
            " I've identified several security considerations that need attention.",
            "",
            "🔍 Key Insights:",
            " • JWT secret is hardcoded in config.py (CRITICAL RISK)",
            " • No rate limiting on login endpoints (brute force vulnerable)",
            " • Session invalidation only happens on logout, not on password change",
            " • OAuth2 state parameter validation could be strengthened",
            " • CORS settings allow all origins in development config",
            "",
            "💡 Code Examples:",
            " Vulnerable: SECRET_KEY = 'hardcoded-secret-123'",
            " Missing: @rate_limit('5 per minute') on /api/login",
            " Risk: CORS_ALLOW_ALL_ORIGINS = True",
            "",
            "🎯 Next Steps:",
            " • Move JWT secret to environment variables immediately",
            " • Implement rate limiting with Redis or in-memory store",
            " • Add session invalidation on password/email changes",
            " • Review and restrict CORS origins for production",
            " • Add security headers middleware",
            "",
            "🔴 Confidence: 94% | Context: 1 previous question"
        ])

        # Step 4: Third question - Performance issues
        print("Step 4: Exploring performance (conversation continues)")
        print("-" * 54)
        self.type_question("Why might the login process be slow?")
        self.show_thinking(2.2)
        self.show_response([
            "",
            "🧠 EXPLORATION ANALYSIS (Question #3)",
            "Session: 2.1m | Results: 7 | Time: 2.8s",
            "=" * 60,
            "",
            "📝 Analysis:",
            " Considering our previous discussion about JWT auth and security,",
            " the login slowness likely stems from several performance bottlenecks.",
            "",
            "🔍 Key Insights:",
            " • bcrypt rounds set to 15 (recommended: 10-12) causing CPU overhead",
            " • Database queries for user lookup aren't indexed on email field",
            " • Redis connection pool not configured, creating new connections",
            " • OAuth2 provider calls lack timeout settings (can hang)",
            " • Session cleanup runs synchronously during login flow",
            "",
            "💡 Code Examples:",
            " Slow: bcrypt.hashpw(password, bcrypt.gensalt(rounds=15))",
            " Missing: CREATE INDEX idx_users_email ON users(email)",
            " Blocking: cleanup_expired_sessions() # Should be async",
            "",
            "🎯 Next Steps:",
            " • Reduce bcrypt rounds to 12 for better balance",
            " • Add database index on users.email column",
            " • Configure Redis connection pooling",
            " • Move session cleanup to background task",
            " • Add OAuth2 request timeouts (5-10 seconds)",
            "",
            "🟡 Confidence: 89% | Context: 2 previous questions"
        ])

        # Step 5: Session summary
        print("Step 5: Session summary and conclusion")
        print("-" * 38)
        self.type_question("summary")
        self.show_response([
            "",
            "🧠 EXPLORATION SESSION SUMMARY",
            "=" * 40,
            "Project: webapp-project",
            "Session ID: explore_1702845632",
            "Duration: 2.5 minutes",
            "Questions explored: 3",
            "",
            "📋 Topics explored:",
            " 1. How does authentication work in this codebase? (confidence: 91%)",
            " 2. What security concerns should I be aware of? (confidence: 94%)",
            " 3. Why might the login process be slow? (confidence: 89%)",
            "",
            "🎯 Key Discoveries:",
            " • JWT-based auth with session management",
            " • Critical security issues (hardcoded secrets, no rate limiting)",
            " • Performance bottlenecks (bcrypt settings, missing indexes)",
            "",
            "💡 Action Items Generated:",
            " • Immediate: Fix hardcoded JWT secret",
            " • High Priority: Add rate limiting and database indexes",
            " • Monitor: Review OAuth2 configurations"
        ])

        # Step 6: Exit
        self.type_question("quit")
        self.show_response([
            "",
            "✅ Exploration session ended.",
            "",
            "🎬 This was Exploration Mode - perfect for learning and debugging!"
        ])

        # Final summary
        print()
        print("💡 Exploration Mode Benefits:")
        print(" 🧠 Thinking-enabled AI for detailed reasoning")
        print(" 💭 Conversation memory across questions")
        print(" 🔍 Perfect for debugging and understanding")
        print(" 📚 Educational - learn how code really works")
        print(" 🎯 Context-aware follow-up responses")
        print()
        time.sleep(3)


def main():
    """Run the exploration mode demo."""
    demo = ExplorationDemoSimulator()

    print("Starting FSS-Mini-RAG Exploration Mode Demo...")
    print("Record with: asciinema rec exploration_demo.cast")
    print("Press Enter to start...")
    input()

    demo.run_exploration_demo()

    print("\n🎯 To create GIF:")
    print("agg exploration_demo.cast exploration_demo.gif")


if __name__ == "__main__":
    main()

create_synthesis_demo.py Normal file

@@ -0,0 +1,178 @@
#!/usr/bin/env python3
"""
Create demo GIF for Synthesis Mode - Fast & Consistent RAG Search

Shows the streamlined workflow for quick answers and code discovery.
"""

import time
import sys
import os
from pathlib import Path


class SynthesisDemoSimulator:
    def __init__(self):
        self.width = 100
        self.height = 30

    def clear_screen(self):
        print("\033[H\033[2J", end="")

    def type_command(self, command: str, delay: float = 0.05):
        """Simulate typing a command."""
        print("$ ", end="", flush=True)
        for char in command:
            print(char, end="", flush=True)
            time.sleep(delay)
        print()
        time.sleep(0.5)

    def show_output(self, lines: list, delay: float = 0.3):
        """Show command output with realistic timing."""
        for line in lines:
            print(line)
            time.sleep(delay)
        time.sleep(1.0)

    def run_synthesis_demo(self):
        """Run the synthesis mode demonstration."""
        self.clear_screen()

        # Title
        print("🚀 FSS-Mini-RAG: Synthesis Mode Demo")
        print("=" * 50)
        print("Fast & consistent RAG search for quick answers")
        print()
        time.sleep(2)

        # Step 1: Index a project
        print("Step 1: Index a sample project")
        print("-" * 30)
        self.type_command("rag-mini index ./sample-project")
        self.show_output([
            "📁 Indexing project: sample-project",
            "🔍 Found 12 files to process",
            "✂️ Creating semantic chunks...",
            "🧠 Generating embeddings...",
            "💾 Building vector index...",
            "✅ Indexed 89 chunks from 12 files in 3.2s",
            "",
            "💡 Try: rag-mini search ./sample-project \"your search here\""
        ])

        # Step 2: Quick search
        print("Step 2: Quick semantic search")
        print("-" * 30)
        self.type_command("rag-mini search ./sample-project \"user authentication\"")
        self.show_output([
            "🔍 Searching \"user authentication\" in sample-project",
            "✅ Found 5 results:",
            "",
            "1. auth/models.py",
            " Score: 0.923",
            " Lines: 45-62",
            " Context: User class",
            " Content:",
            " class User:",
            " def authenticate(self, password):",
            " return bcrypt.checkpw(password, self.password_hash)",
            "",
            "2. auth/views.py",
            " Score: 0.887",
            " Lines: 23-41",
            " Context: login_view function",
            " Content:",
            " def login_view(request):",
            " user = authenticate(username, password)",
            " if user:",
            " login(request, user)",
            "",
            "3. middleware/auth.py",
            " Score: 0.845",
            " Content: Authentication middleware checking..."
        ])

        # Step 3: Search with AI synthesis
        print("Step 3: Add AI synthesis for deeper understanding")
        print("-" * 50)
        self.type_command("rag-mini search ./sample-project \"error handling\" --synthesize")
        self.show_output([
            "🔍 Searching \"error handling\" in sample-project",
            "🧠 Generating LLM synthesis...",
            "✅ Found 4 results:",
            "",
            "1. utils/exceptions.py",
            " Score: 0.934",
            " Content: Custom exception classes for API errors...",
            "",
            "2. api/handlers.py",
            " Score: 0.889",
            " Content: Global exception handler with logging...",
            "",
            "🧠 LLM SYNTHESIS",
            "=" * 50,
            "",
            "📝 Summary:",
            " This codebase implements a robust error handling system with",
            " custom exceptions, global handlers, and structured logging.",
            "",
            "🔍 Key Findings:",
            " • Custom exception hierarchy in utils/exceptions.py",
            " • Global error handler catches all API exceptions",
            " • Logging integrated with error tracking service",
            "",
            "💡 Code Patterns:",
            " try/except blocks with specific exception types",
            " Centralized error response formatting",
            "",
            "🎯 Suggested Actions:",
            " • Review exception hierarchy for completeness",
            " • Consider adding error recovery mechanisms",
            "",
            "🟢 Confidence: 87%"
        ])

        # Step 4: Show performance
        print("Step 4: Performance characteristics")
        print("-" * 35)
        print("⚡ Synthesis Mode Benefits:")
        print(" • Lightning fast responses (no thinking overhead)")
        print(" • Consistent, reliable results")
        print(" • Perfect for code discovery and quick answers")
        print(" • Works great with ultra-efficient models (qwen3:0.6b)")
        print()
        time.sleep(3)

        # Step 5: When to use
        print("💡 When to use Synthesis Mode:")
        print(" ✅ Quick code lookups")
        print(" ✅ Finding specific functions or classes")
        print(" ✅ Understanding code structure")
        print(" ✅ Fast documentation searches")
        print(" ✅ Batch processing multiple queries")
        print()
        print("🧠 For deeper analysis, try: rag-mini explore ./project")
        print()
        time.sleep(3)

        print("🎬 Demo complete! This was Synthesis Mode - optimized for speed.")


def main():
    """Run the synthesis mode demo."""
    demo = SynthesisDemoSimulator()

    print("Starting FSS-Mini-RAG Synthesis Mode Demo...")
    print("Record with: asciinema rec synthesis_demo.cast")
    print("Press Enter to start...")
    input()

    demo.run_synthesis_demo()

    print("\n🎯 To create GIF:")
    print("agg synthesis_demo.cast synthesis_demo.gif")


if __name__ == "__main__":
    main()