🛡️ Add comprehensive LLM safeguards and dual-mode demo scripts
🛡️ SMART MODEL SAFEGUARDS: - Implement runaway prevention with pattern detection (repetition, thinking loops, rambling) - Add context length management with optimal parameters per model size - Quality validation prevents problematic responses before reaching users - Helpful explanations when issues occur with recovery suggestions - Model-specific parameter optimization (qwen3:0.6b vs 1.7b vs 3b+) - Timeout protection and graceful degradation ⚡ OPTIMAL PERFORMANCE SETTINGS: - Context window: 32k tokens for good balance - Repeat penalty: 1.15 for 0.6b, 1.1 for 1.7b, 1.05 for larger models - Presence penalty: 1.5 for quantized models to prevent repetition - Smart output limits: 1500 tokens for 0.6b, 2000+ for larger models - Top-p/top-k tuning based on research best practices 🎬 DUAL-MODE DEMO SCRIPTS: - create_synthesis_demo.py: Shows fast search with AI synthesis workflow - create_exploration_demo.py: Interactive thinking mode with conversation memory - Realistic typing simulation and response timing for quality GIFs - Clear demonstration of when to use each mode Perfect for creating compelling demo videos showing both RAG experiences!
This commit is contained in:
parent
3363171820
commit
5f42751e9a
320
claude_rag/llm_safeguards.py
Normal file
320
claude_rag/llm_safeguards.py
Normal file
@ -0,0 +1,320 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
LLM Safeguards for Small Model Management
|
||||
|
||||
Provides runaway prevention, context management, and intelligent detection
|
||||
of problematic model behaviors to ensure reliable user experience.
|
||||
"""
|
||||
|
||||
import logging
import re
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@dataclass
class SafeguardConfig:
    """Configuration for LLM safeguards."""

    max_output_tokens: int = 2000  # Prevent excessive generation
    max_repetition_ratio: float = 0.3  # Max ratio of repeated content
    max_response_time: int = 60  # Max seconds for response
    min_useful_length: int = 20  # Minimum useful response length
    context_window: int = 32768  # Ollama context window
    enable_thinking_detection: bool = True  # Detect thinking patterns


class ModelRunawayDetector:
    """Detects and prevents model runaway behaviors.

    Small local models can get stuck repeating text, "thinking" in circles,
    or rambling.  This class inspects a finished response and classifies the
    common failure modes so callers can show a helpful explanation instead
    of a broken answer.
    """

    def __init__(self, config: Optional[SafeguardConfig] = None):
        """Create a detector.

        Args:
            config: Safeguard thresholds; defaults to ``SafeguardConfig()``.
        """
        self.config = config or SafeguardConfig()
        self.response_patterns = self._compile_patterns()

    def _compile_patterns(self) -> Dict[str, re.Pattern]:
        """Compile regex patterns for runaway detection."""
        return {
            # Excessive repetition patterns
            'word_repetition': re.compile(r'\b(\w+)\b(?:\s+\1\b){3,}', re.IGNORECASE),
            'phrase_repetition': re.compile(r'(.{10,50}?)\1{2,}', re.DOTALL),

            # Thinking loop patterns (small models get stuck)
            'thinking_loop': re.compile(r'(let me think|i think|thinking|consider|actually|wait|hmm|well)\s*[.,:]*\s*\1', re.IGNORECASE),

            # Whole-word meta-commentary markers.  Word boundaries avoid the
            # substring over-counting bug where "think" was double-counted
            # inside "thinking" and "wait" matched inside "awaiting".
            'thinking_words': re.compile(
                r'\b(?:let me|thinking|think|considering|consider|actually|wait|hmm)\b',
                re.IGNORECASE),

            # Rambling / filler markers.  Counted individually (see
            # _check_rambling); the previous single pattern fired on any
            # sentence containing "and"/"so"/"but" followed by five more
            # words, which flagged perfectly normal prose.
            'excessive_filler': re.compile(
                r'\b(?:um|uh|you know|basically|like|well|actually)\b',
                re.IGNORECASE),

            # JSON corruption patterns
            'broken_json': re.compile(r'\{[^}]*\{[^}]*\{'),  # Nested broken JSON
            'json_repetition': re.compile(r'("[\w_]+"\s*:\s*"[^"]*",?\s*){4,}'),  # Repeated JSON fields
        }

    def check_response_quality(self, response: str, query: str, start_time: float) -> Tuple[bool, Optional[str], Optional[str]]:
        """
        Check response quality and detect runaway behaviors.

        Args:
            response: Raw model output to validate.
            query: Original user query (currently unused; kept for API
                stability and future context-aware checks).
            start_time: ``time.time()`` captured when generation started.

        Returns:
            (is_valid, issue_type, user_explanation)
        """
        if not response or len(response.strip()) < self.config.min_useful_length:
            return False, "too_short", self._explain_too_short()

        # Check response time
        elapsed = time.time() - start_time
        if elapsed > self.config.max_response_time:
            return False, "timeout", self._explain_timeout()

        # Check for repetition issues
        repetition_issue = self._check_repetition(response)
        if repetition_issue:
            return False, repetition_issue, self._explain_repetition(repetition_issue)

        # Check for thinking loops
        if self.config.enable_thinking_detection:
            thinking_issue = self._check_thinking_loops(response)
            if thinking_issue:
                return False, thinking_issue, self._explain_thinking_loop()

        # Check for rambling
        rambling_issue = self._check_rambling(response)
        if rambling_issue:
            return False, rambling_issue, self._explain_rambling()

        # Check JSON corruption (for structured responses)
        if '{' in response and '}' in response:
            json_issue = self._check_json_corruption(response)
            if json_issue:
                return False, json_issue, self._explain_json_corruption()

        return True, None, None

    def _check_repetition(self, response: str) -> Optional[str]:
        """Check for excessive repetition."""
        # Word repetition (same word 4+ times in a row)
        if self.response_patterns['word_repetition'].search(response):
            return "word_repetition"

        # Phrase repetition (10-50 char phrase repeated 3+ times adjacently)
        if self.response_patterns['phrase_repetition'].search(response):
            return "phrase_repetition"

        # Calculate overall repetition ratio (share of non-unique words)
        words = response.split()
        if len(words) > 10:
            unique_words = set(words)
            repetition_ratio = 1 - (len(unique_words) / len(words))
            if repetition_ratio > self.config.max_repetition_ratio:
                return "high_repetition_ratio"

        return None

    def _check_thinking_loops(self, response: str) -> Optional[str]:
        """Check for thinking loops (common in small models)."""
        if self.response_patterns['thinking_loop'].search(response):
            return "thinking_loop"

        # Count whole-word meta-commentary markers (see _compile_patterns
        # for why this is a regex rather than substring counting).
        thinking_count = len(self.response_patterns['thinking_words'].findall(response))

        # Many thinking markers in a short answer = over-analysis
        if thinking_count > 5 and len(response.split()) < 200:
            return "excessive_thinking"

        return None

    def _check_rambling(self, response: str) -> Optional[str]:
        """Check for rambling or excessive filler."""
        # More than five genuine filler words in one response
        filler_count = len(self.response_patterns['excessive_filler'].findall(response))
        if filler_count > 5:
            return "excessive_filler"

        # Check for extremely long sentences (sign of rambling)
        sentences = re.split(r'[.!?]+', response)
        long_sentences = [s for s in sentences if len(s.split()) > 50]

        if len(long_sentences) > 2:
            return "excessive_rambling"

        return None

    def _check_json_corruption(self, response: str) -> Optional[str]:
        """Check for JSON corruption in structured responses."""
        if self.response_patterns['broken_json'].search(response):
            return "broken_json"

        if self.response_patterns['json_repetition'].search(response):
            return "json_repetition"

        return None

    def _explain_too_short(self) -> str:
        """User-facing explanation for a too-short response."""
        return """🤔 The AI response was too short to be helpful.

**Why this happens:**
• The model might be confused by the query
• Context might be insufficient
• Model might be overloaded

**What to try:**
• Rephrase your question more specifically
• Try a broader search term first
• Use exploration mode for complex questions: `rag-mini explore`"""

    def _explain_timeout(self) -> str:
        """User-facing explanation for a timed-out response."""
        return """⏱️ The AI took too long to respond (over 60 seconds).

**Why this happens:**
• Small models sometimes get "stuck" thinking
• Complex queries can overwhelm smaller models
• System might be under load

**What to try:**
• Try a simpler, more direct question
• Use synthesis mode for faster responses: `--synthesize`
• Consider using a larger model if available"""

    def _explain_repetition(self, issue_type: str) -> str:
        """User-facing explanation for repetition issues."""
        return f"""🔄 The AI got stuck in repetition loops ({issue_type}).

**Why this happens:**
• Small models sometimes repeat when uncertain
• Query might be too complex for the model size
• Context window might be exceeded

**What to try:**
• Try a more specific question
• Break complex questions into smaller parts
• Use exploration mode which handles context better: `rag-mini explore`
• Consider: A larger model (qwen3:1.7b or qwen3:3b) would help"""

    def _explain_thinking_loop(self) -> str:
        """User-facing explanation for thinking loops."""
        return """🧠 The AI got caught in a "thinking loop" - overthinking the response.

**Why this happens:**
• Small models sometimes over-analyze simple questions
• Thinking mode can cause loops in smaller models
• Query complexity exceeds model capabilities

**What to try:**
• Ask more direct, specific questions
• Use synthesis mode (no thinking) for faster results
• Try: "What does this code do?" instead of "Explain how this works"
• Larger models (qwen3:1.7b+) handle thinking better"""

    def _explain_rambling(self) -> str:
        """User-facing explanation for rambling responses."""
        return """💭 The AI started rambling instead of giving focused answers.

**Why this happens:**
• Small models sometimes lose focus on complex topics
• Query might be too broad or vague
• Model trying to cover too much at once

**What to try:**
• Ask more specific questions
• Break broad questions into focused parts
• Example: "How is data validated?" instead of "Explain the whole system"
• Exploration mode helps maintain focus across questions"""

    def _explain_json_corruption(self) -> str:
        """User-facing explanation for corrupted structured output."""
        return """🔧 The AI response format got corrupted.

**Why this happens:**
• Small models sometimes struggle with structured output
• Context limits can cause format errors
• Complex analysis might overwhelm formatting

**What to try:**
• Try the question again (often resolves itself)
• Use simpler questions for better formatting
• Synthesis mode sometimes gives cleaner output
• This is less common with larger models"""

    def get_recovery_suggestions(self, issue_type: str, query: str) -> List[str]:
        """Get specific recovery suggestions based on the issue."""
        suggestions = []

        if issue_type in ['thinking_loop', 'excessive_thinking']:
            suggestions.extend([
                f"Try synthesis mode: `rag-mini search . \"{query}\" --synthesize`",
                "Ask more direct questions without 'why' or 'how'",
                "Break complex questions into smaller parts"
            ])

        elif issue_type in ['word_repetition', 'phrase_repetition', 'high_repetition_ratio']:
            suggestions.extend([
                "Try rephrasing your question completely",
                "Use more specific technical terms",
                "Try exploration mode: `rag-mini explore .`"
            ])

        elif issue_type == 'timeout':
            suggestions.extend([
                "Try a simpler version of your question",
                "Use synthesis mode for faster responses",
                "Check if Ollama is under heavy load"
            ])

        # Universal suggestions
        suggestions.extend([
            "Consider using a larger model if available (qwen3:1.7b or qwen3:3b)",
            "Check model status: `ollama list`"
        ])

        return suggestions
||||
def get_optimal_ollama_parameters(model_name: str) -> Dict[str, Any]:
    """Get optimal generation parameters for different Ollama models.

    Smaller models need aggressive repetition penalties and shorter output
    limits to stay coherent; larger models can be sampled more freely.

    Args:
        model_name: Ollama model tag, e.g. ``"qwen3:0.6b"`` (matched
            case-insensitively).

    Returns:
        Options dict suitable for the Ollama API ``options`` field.

    Note: the annotation previously used the builtin ``any`` instead of
    ``typing.Any``, which is a callable, not a type.
    """
    base_params = {
        "num_ctx": 32768,  # Good context window for most uses
        "num_predict": 2000,  # Reasonable response length
        "temperature": 0.3,  # Balanced creativity/consistency
    }

    name = model_name.lower()  # normalize once instead of per comparison

    # Model-specific optimizations
    if "qwen3:0.6b" in name:
        return {
            **base_params,
            "repeat_penalty": 1.15,  # Prevent repetition in small model
            "presence_penalty": 1.5,  # Suppress repetitive outputs
            "top_p": 0.8,  # Focused sampling
            "top_k": 20,  # Limit choices
            "num_predict": 1500,  # Shorter responses for reliability
        }

    if "qwen3:1.7b" in name:
        return {
            **base_params,
            "repeat_penalty": 1.1,  # Less aggressive for larger model
            "presence_penalty": 1.0,  # Balanced
            "top_p": 0.9,  # More creative
            "top_k": 40,  # More choices
        }

    if any(size in name for size in ("3b", "7b", "8b")):
        return {
            **base_params,
            "repeat_penalty": 1.05,  # Minimal for larger models
            "presence_penalty": 0.5,  # Light touch
            "top_p": 0.95,  # High creativity
            "top_k": 50,  # Many choices
            "num_predict": 3000,  # Longer responses OK
        }

    # Unknown model: fall back to the conservative defaults.
    return base_params
|
||||
# Quick test
|
||||
def test_safeguards():
    """Test the safeguard system."""
    detector = ModelRunawayDetector()

    # A deliberately repetitive answer that should trip the repetition checks.
    bad_response = (
        "The user authentication system works by checking user credentials. "
        "The user authentication system works by checking user credentials. "
        "The user authentication system works by checking user credentials."
    )

    is_valid, issue, explanation = detector.check_response_quality(
        bad_response, "auth", time.time()
    )

    print(f"Repetition test: Valid={is_valid}, Issue={issue}")
    if explanation is not None:
        print(explanation)


if __name__ == "__main__":
    test_safeguards()
|
||||
@ -8,11 +8,20 @@ Takes raw search results and generates coherent, contextual summaries.
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from typing import List, Dict, Any, Optional
|
||||
from dataclasses import dataclass
|
||||
import requests
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
from .llm_safeguards import ModelRunawayDetector, SafeguardConfig, get_optimal_ollama_parameters
|
||||
except ImportError:
|
||||
# Graceful fallback if safeguards not available
|
||||
ModelRunawayDetector = None
|
||||
SafeguardConfig = None
|
||||
get_optimal_ollama_parameters = lambda x: {}
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@dataclass
|
||||
@ -34,6 +43,12 @@ class LLMSynthesizer:
|
||||
self.enable_thinking = enable_thinking # Default False for synthesis mode
|
||||
self._initialized = False
|
||||
|
||||
# Initialize safeguards
|
||||
if ModelRunawayDetector:
|
||||
self.safeguard_detector = ModelRunawayDetector(SafeguardConfig())
|
||||
else:
|
||||
self.safeguard_detector = None
|
||||
|
||||
def _get_available_models(self) -> List[str]:
|
||||
"""Get list of available Ollama models."""
|
||||
try:
|
||||
@ -129,7 +144,9 @@ class LLMSynthesizer:
|
||||
return len(self.available_models) > 0
|
||||
|
||||
def _call_ollama(self, prompt: str, temperature: float = 0.3, disable_thinking: bool = False) -> Optional[str]:
|
||||
"""Make a call to Ollama API."""
|
||||
"""Make a call to Ollama API with safeguards."""
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
# Use the best available model
|
||||
model_to_use = self.model
|
||||
@ -147,26 +164,46 @@ class LLMSynthesizer:
|
||||
if not final_prompt.endswith(" <no_think>"):
|
||||
final_prompt += " <no_think>"
|
||||
|
||||
# Get optimal parameters for this model
|
||||
optimal_params = get_optimal_ollama_parameters(model_to_use)
|
||||
|
||||
payload = {
|
||||
"model": model_to_use,
|
||||
"prompt": final_prompt,
|
||||
"stream": False,
|
||||
"options": {
|
||||
"temperature": temperature,
|
||||
"top_p": 0.9,
|
||||
"top_k": 40
|
||||
"top_p": optimal_params.get("top_p", 0.9),
|
||||
"top_k": optimal_params.get("top_k", 40),
|
||||
"num_ctx": optimal_params.get("num_ctx", 32768),
|
||||
"num_predict": optimal_params.get("num_predict", 2000),
|
||||
"repeat_penalty": optimal_params.get("repeat_penalty", 1.1),
|
||||
"presence_penalty": optimal_params.get("presence_penalty", 1.0)
|
||||
}
|
||||
}
|
||||
|
||||
response = requests.post(
|
||||
f"{self.ollama_url}/api/generate",
|
||||
json=payload,
|
||||
timeout=30
|
||||
timeout=65 # Slightly longer than safeguard timeout
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
result = response.json()
|
||||
return result.get('response', '').strip()
|
||||
raw_response = result.get('response', '').strip()
|
||||
|
||||
# Apply safeguards to check response quality
|
||||
if self.safeguard_detector and raw_response:
|
||||
is_valid, issue_type, explanation = self.safeguard_detector.check_response_quality(
|
||||
raw_response, prompt[:100], start_time # First 100 chars of prompt for context
|
||||
)
|
||||
|
||||
if not is_valid:
|
||||
logger.warning(f"Safeguard triggered: {issue_type}")
|
||||
# Return a safe explanation instead of the problematic response
|
||||
return self._create_safeguard_response(issue_type, explanation, prompt)
|
||||
|
||||
return raw_response
|
||||
else:
|
||||
logger.error(f"Ollama API error: {response.status_code}")
|
||||
return None
|
||||
@ -175,6 +212,24 @@ class LLMSynthesizer:
|
||||
logger.error(f"Ollama call failed: {e}")
|
||||
return None
|
||||
|
||||
def _create_safeguard_response(self, issue_type: str, explanation: str, original_prompt: str) -> str:
|
||||
"""Create a helpful response when safeguards are triggered."""
|
||||
return f"""⚠️ Model Response Issue Detected
|
||||
|
||||
{explanation}
|
||||
|
||||
**Original query context:** {original_prompt[:200]}{'...' if len(original_prompt) > 200 else ''}
|
||||
|
||||
**What happened:** The AI model encountered a common issue with small language models and was prevented from giving a problematic response.
|
||||
|
||||
**Your options:**
|
||||
1. **Try again**: Ask the same question (often resolves itself)
|
||||
2. **Rephrase**: Make your question more specific or break it into parts
|
||||
3. **Use exploration mode**: `rag-mini explore` for complex questions
|
||||
4. **Different approach**: Try synthesis mode: `--synthesize` for simpler responses
|
||||
|
||||
This is normal with smaller AI models and helps ensure you get quality responses."""
|
||||
|
||||
def synthesize_search_results(self, query: str, results: List[Any], project_path: Path) -> SynthesisResult:
|
||||
"""Synthesize search results into a coherent summary."""
|
||||
|
||||
|
||||
270
create_exploration_demo.py
Normal file
270
create_exploration_demo.py
Normal file
@ -0,0 +1,270 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Create demo GIF for Exploration Mode - Deep Thinking & Interactive Learning
|
||||
Shows the conversational workflow for understanding and debugging codebases.
|
||||
"""
|
||||
|
||||
import time
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
class ExplorationDemoSimulator:
    """Scripted terminal playback of an interactive exploration session.

    Nothing here talks to a real model or index: every command, question and
    "AI response" below is hard-coded so the demo records identically every
    time.  Intended to be captured with asciinema and converted to a GIF.
    """

    def __init__(self):
        # Intended terminal size for the recording (informational only;
        # nothing below enforces it).
        self.width = 100
        self.height = 35

    def clear_screen(self) -> None:
        # ANSI escapes: cursor home, then erase the whole screen.
        print("\033[H\033[2J", end="")

    def type_command(self, command: str, delay: float = 0.05) -> None:
        """Simulate typing a command."""
        print("$ ", end="", flush=True)
        for char in command:
            # Per-character delay gives a human typing feel in the recording.
            print(char, end="", flush=True)
            time.sleep(delay)
        print()
        time.sleep(0.5)

    def type_question(self, question: str, delay: float = 0.04) -> None:
        """Simulate typing a question in exploration mode."""
        print("> ", end="", flush=True)
        for char in question:
            print(char, end="", flush=True)
            time.sleep(delay)
        print()
        time.sleep(0.5)

    def show_thinking(self, duration: float = 2.0) -> None:
        """Show thinking animation."""
        print("🔍 Analyzing...", end="", flush=True)
        # Three extra dots spread evenly across the requested duration.
        for _ in range(3):
            time.sleep(duration / 3)
            print(".", end="", flush=True)
        print()
        time.sleep(0.5)

    def show_response(self, lines: list, delay: float = 0.4) -> None:
        """Show AI response with realistic timing."""
        # Reveal line by line, like streaming model output.
        for line in lines:
            print(line)
            time.sleep(delay)
        time.sleep(1.5)

    def run_exploration_demo(self) -> None:
        """Run the exploration mode demonstration."""
        self.clear_screen()

        # Title
        print("🧠 FSS-Mini-RAG: Exploration Mode Demo")
        print("=" * 55)
        print("Deep thinking & interactive learning for complex codebases")
        print()
        time.sleep(2)

        # Step 1: Start exploration
        print("Step 1: Start interactive exploration session")
        print("-" * 45)
        self.type_command("rag-mini explore ./webapp-project")

        self.show_response([
            "🧠 EXPLORATION MODE STARTED",
            "=" * 50,
            "Project: webapp-project",
            "Session: explore_1702845632",
            "",
            "🎯 This mode uses thinking and remembers context.",
            " Perfect for debugging, learning, and deep exploration.",
            "",
            "💡 Tips:",
            " • Ask follow-up questions - I'll remember our conversation",
            " • Use 'why', 'how', 'explain' for detailed reasoning",
            " • Type 'quit' or 'exit' to end session",
            "",
            "=" * 50,
            "",
            "🤔 Ask your first question about the codebase:"
        ])

        # Step 2: First question - How authentication works
        print("Step 2: Understanding authentication system")
        print("-" * 42)
        self.type_question("How does authentication work in this codebase?")

        self.show_thinking()

        self.show_response([
            "",
            "🧠 EXPLORATION ANALYSIS (Question #1)",
            "Session: 0.2m | Results: 8 | Time: 3.4s",
            "=" * 60,
            "",
            "📝 Analysis:",
            " This webapp implements a JWT-based authentication system with",
            " session management, role-based permissions, and OAuth2 integration.",
            "",
            "🔍 Key Insights:",
            " • JWT tokens generated in auth/jwt_handler.py with 24h expiry",
            " • User sessions stored in Redis for fast lookups and invalidation",
            " • Role-based access control through middleware/permissions.py",
            " • OAuth2 integration supports Google, GitHub, and Microsoft providers",
            " • Password hashing uses bcrypt with 12 rounds for security",
            "",
            "💡 Code Examples:",
            " JWT generation: jwt.encode(payload, SECRET_KEY, algorithm='HS256')",
            " Session storage: redis.setex(f'session:{user_id}', 86400, token)",
            " Permission check: @require_permission('admin') decorator",
            "",
            "🎯 Next Steps:",
            " • Review JWT secret rotation policy",
            " • Consider implementing refresh tokens",
            " • Audit OAuth2 scope permissions",
            "",
            "🟢 Confidence: 91%"
        ])

        # Step 3: Follow-up question - Security concerns
        print("Step 3: Follow-up question (context remembered)")
        print("-" * 48)
        self.type_question("What security concerns should I be aware of?")

        self.show_thinking(1.8)

        self.show_response([
            "",
            "🧠 EXPLORATION ANALYSIS (Question #2)",
            "Session: 1.4m | Results: 6 | Time: 2.1s",
            "=" * 60,
            "",
            "📝 Analysis:",
            " Building on the JWT authentication system we just discussed,",
            " I've identified several security considerations that need attention.",
            "",
            "🔍 Key Insights:",
            " • JWT secret is hardcoded in config.py (CRITICAL RISK)",
            " • No rate limiting on login endpoints (brute force vulnerable)",
            " • Session invalidation only happens on logout, not on password change",
            " • OAuth2 state parameter validation could be strengthened",
            " • CORS settings allow all origins in development config",
            "",
            "💡 Code Examples:",
            " Vulnerable: SECRET_KEY = 'hardcoded-secret-123'",
            " Missing: @rate_limit('5 per minute') on /api/login",
            " Risk: CORS_ALLOW_ALL_ORIGINS = True",
            "",
            "🎯 Next Steps:",
            " • Move JWT secret to environment variables immediately",
            " • Implement rate limiting with Redis or in-memory store",
            " • Add session invalidation on password/email changes",
            " • Review and restrict CORS origins for production",
            " • Add security headers middleware",
            "",
            "🔴 Confidence: 94% | Context: 1 previous questions"
        ])

        # Step 4: Third question - Performance issues
        print("Step 4: Exploring performance (conversation continues)")
        print("-" * 54)
        self.type_question("Why might the login process be slow?")

        self.show_thinking(2.2)

        self.show_response([
            "",
            "🧠 EXPLORATION ANALYSIS (Question #3)",
            "Session: 2.1m | Results: 7 | Time: 2.8s",
            "=" * 60,
            "",
            "📝 Analysis:",
            " Considering our previous discussion about JWT auth and security,",
            " the login slowness likely stems from several performance bottlenecks.",
            "",
            "🔍 Key Insights:",
            " • bcrypt rounds set to 15 (recommended: 10-12) causing CPU overhead",
            " • Database queries for user lookup aren't indexed on email field",
            " • Redis connection pool not configured, creating new connections",
            " • OAuth2 provider calls lack timeout settings (can hang)",
            " • Session cleanup runs synchronously during login flow",
            "",
            "💡 Code Examples:",
            " Slow: bcrypt.hashpw(password, bcrypt.gensalt(rounds=15))",
            " Missing: CREATE INDEX idx_users_email ON users(email)",
            " Blocking: cleanup_expired_sessions() # Should be async",
            "",
            "🎯 Next Steps:",
            " • Reduce bcrypt rounds to 12 for better balance",
            " • Add database index on users.email column",
            " • Configure Redis connection pooling",
            " • Move session cleanup to background task",
            " • Add OAuth2 request timeouts (5-10 seconds)",
            "",
            "🟡 Confidence: 89% | Context: 2 previous questions"
        ])

        # Step 5: Session summary
        print("Step 5: Session summary and conclusion")
        print("-" * 38)
        self.type_question("summary")

        self.show_response([
            "",
            "🧠 EXPLORATION SESSION SUMMARY",
            "=" * 40,
            "Project: webapp-project",
            "Session ID: explore_1702845632",
            "Duration: 2.5 minutes",
            "Questions explored: 3",
            "",
            "📋 Topics explored:",
            " 1. How does authentication work in this codebase? (confidence: 91%)",
            " 2. What security concerns should I be aware of? (confidence: 94%)",
            " 3. Why might the login process be slow? (confidence: 89%)",
            "",
            "🎯 Key Discoveries:",
            " • JWT-based auth with session management",
            " • Critical security issues (hardcoded secrets, no rate limiting)",
            " • Performance bottlenecks (bcrypt settings, missing indexes)",
            "",
            "💡 Action Items Generated:",
            " • Immediate: Fix hardcoded JWT secret",
            " • High Priority: Add rate limiting and database indexes",
            " • Monitor: Review OAuth2 configurations"
        ])

        # Step 6: Exit
        self.type_question("quit")

        self.show_response([
            "",
            "✅ Exploration session ended.",
            "",
            "🎬 This was Exploration Mode - perfect for learning and debugging!"
        ])

        # Final summary
        print()
        print("💡 Exploration Mode Benefits:")
        print(" 🧠 Thinking-enabled AI for detailed reasoning")
        print(" 💭 Conversation memory across questions")
        print(" 🔍 Perfect for debugging and understanding")
        print(" 📚 Educational - learn how code really works")
        print(" 🎯 Context-aware follow-up responses")
        print()
        time.sleep(3)
|
||||
|
||||
def main():
    """Run the exploration mode demo."""
    # Give the person recording a chance to start asciinema before playback.
    print("Starting FSS-Mini-RAG Exploration Mode Demo...")
    print("Record with: asciinema rec exploration_demo.cast")
    print("Press Enter to start...")
    input()

    ExplorationDemoSimulator().run_exploration_demo()

    print("\n🎯 To create GIF:")
    print("agg exploration_demo.cast exploration_demo.gif")


if __name__ == "__main__":
    main()
|
||||
178
create_synthesis_demo.py
Normal file
178
create_synthesis_demo.py
Normal file
@ -0,0 +1,178 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Create demo GIF for Synthesis Mode - Fast & Consistent RAG Search
|
||||
Shows the streamlined workflow for quick answers and code discovery.
|
||||
"""
|
||||
|
||||
import time
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
class SynthesisDemoSimulator:
    """Scripted terminal playback of a synthesis-mode search session.

    All commands and "search results" below are hard-coded so the demo
    records identically every time.  Intended to be captured with asciinema
    and converted to a GIF.
    """

    def __init__(self):
        # Intended terminal size for the recording (informational only;
        # nothing below enforces it).
        self.width = 100
        self.height = 30

    def clear_screen(self) -> None:
        # ANSI escapes: cursor home, then erase the whole screen.
        print("\033[H\033[2J", end="")

    def type_command(self, command: str, delay: float = 0.05) -> None:
        """Simulate typing a command."""
        print("$ ", end="", flush=True)
        for char in command:
            # Per-character delay gives a human typing feel in the recording.
            print(char, end="", flush=True)
            time.sleep(delay)
        print()
        time.sleep(0.5)

    def show_output(self, lines: list, delay: float = 0.3) -> None:
        """Show command output with realistic timing."""
        # Reveal line by line, like real command output.
        for line in lines:
            print(line)
            time.sleep(delay)
        time.sleep(1.0)

    def run_synthesis_demo(self) -> None:
        """Run the synthesis mode demonstration."""
        self.clear_screen()

        # Title
        print("🚀 FSS-Mini-RAG: Synthesis Mode Demo")
        print("=" * 50)
        print("Fast & consistent RAG search for quick answers")
        print()
        time.sleep(2)

        # Step 1: Index a project
        print("Step 1: Index a sample project")
        print("-" * 30)
        self.type_command("rag-mini index ./sample-project")

        self.show_output([
            "📁 Indexing project: sample-project",
            "🔍 Found 12 files to process",
            "✂️ Creating semantic chunks...",
            "🧠 Generating embeddings...",
            "💾 Building vector index...",
            "✅ Indexed 89 chunks from 12 files in 3.2s",
            "",
            "💡 Try: rag-mini search ./sample-project \"your search here\""
        ])

        # Step 2: Quick search
        print("Step 2: Quick semantic search")
        print("-" * 30)
        self.type_command("rag-mini search ./sample-project \"user authentication\"")

        self.show_output([
            "🔍 Searching \"user authentication\" in sample-project",
            "✅ Found 5 results:",
            "",
            "1. auth/models.py",
            " Score: 0.923",
            " Lines: 45-62",
            " Context: User class",
            " Content:",
            " class User:",
            " def authenticate(self, password):",
            " return bcrypt.checkpw(password, self.password_hash)",
            "",
            "2. auth/views.py",
            " Score: 0.887",
            " Lines: 23-41",
            " Context: login_view function",
            " Content:",
            " def login_view(request):",
            " user = authenticate(username, password)",
            " if user:",
            " login(request, user)",
            "",
            "3. middleware/auth.py",
            " Score: 0.845",
            " Content: Authentication middleware checking..."
        ])

        # Step 3: Search with AI synthesis
        print("Step 3: Add AI synthesis for deeper understanding")
        print("-" * 50)
        self.type_command("rag-mini search ./sample-project \"error handling\" --synthesize")

        self.show_output([
            "🔍 Searching \"error handling\" in sample-project",
            "🧠 Generating LLM synthesis...",
            "✅ Found 4 results:",
            "",
            "1. utils/exceptions.py",
            " Score: 0.934",
            " Content: Custom exception classes for API errors...",
            "",
            "2. api/handlers.py",
            " Score: 0.889",
            " Content: Global exception handler with logging...",
            "",
            "🧠 LLM SYNTHESIS",
            "=" * 50,
            "",
            "📝 Summary:",
            " This codebase implements a robust error handling system with",
            " custom exceptions, global handlers, and structured logging.",
            "",
            "🔍 Key Findings:",
            " • Custom exception hierarchy in utils/exceptions.py",
            " • Global error handler catches all API exceptions",
            " • Logging integrated with error tracking service",
            "",
            "💡 Code Patterns:",
            " try/except blocks with specific exception types",
            " Centralized error response formatting",
            "",
            "🎯 Suggested Actions:",
            " • Review exception hierarchy for completeness",
            " • Consider adding error recovery mechanisms",
            "",
            "🟢 Confidence: 87%"
        ])

        # Step 4: Show performance
        print("Step 4: Performance characteristics")
        print("-" * 35)
        print("⚡ Synthesis Mode Benefits:")
        print(" • Lightning fast responses (no thinking overhead)")
        print(" • Consistent, reliable results")
        print(" • Perfect for code discovery and quick answers")
        print(" • Works great with ultra-efficient models (qwen3:0.6b)")
        print()
        time.sleep(3)

        # Step 5: When to use
        print("💡 When to use Synthesis Mode:")
        print(" ✅ Quick code lookups")
        print(" ✅ Finding specific functions or classes")
        print(" ✅ Understanding code structure")
        print(" ✅ Fast documentation searches")
        print(" ✅ Batch processing multiple queries")
        print()

        print("🧠 For deeper analysis, try: rag-mini explore ./project")
        print()
        time.sleep(3)

        print("🎬 Demo complete! This was Synthesis Mode - optimized for speed.")
||||
def main():
    """Run the synthesis mode demo."""
    # Give the person recording a chance to start asciinema before playback.
    print("Starting FSS-Mini-RAG Synthesis Mode Demo...")
    print("Record with: asciinema rec synthesis_demo.cast")
    print("Press Enter to start...")
    input()

    SynthesisDemoSimulator().run_synthesis_demo()

    print("\n🎯 To create GIF:")
    print("agg synthesis_demo.cast synthesis_demo.gif")


if __name__ == "__main__":
    main()
|
||||
Loading…
x
Reference in New Issue
Block a user