diff --git a/PR_DRAFT.md b/PR_DRAFT.md
new file mode 100644
index 0000000..3cf60ea
--- /dev/null
+++ b/PR_DRAFT.md
@@ -0,0 +1,108 @@
+# Add Context Window Configuration for Optimal RAG Performance
+
+## Problem Statement
+
+Currently, FSS-Mini-RAG uses Ollama's default context window settings, which severely limit performance:
+
+- **The default of 2048 tokens** is inadequate for RAG applications
+- Users can't configure the context window for their hardware or use case
+- No guidance on optimal context sizes for different models
+- Inconsistent context handling across the codebase
+- New users don't understand why the context window matters
+
+## Impact on User Experience
+
+**With the default 2048-token context window:**
+- Only 1-2 responses possible before context truncation
+- Thinking tokens consume significant context space
+- Poor performance with larger document chunks
+- Frustrated users who don't understand why responses degrade
+
+**With proper context configuration:**
+- 5-15+ responses in exploration mode
+- Support for advanced use cases (15+ results, 4000+ character chunks)
+- Better coding assistance and analysis
+- Professional-grade RAG experience
+
+## Proposed Solution
+
+### 1. Enhanced Model Configuration Menu
+Add context window selection alongside model selection with:
+- **Development**: 8K tokens (fast, good for most cases)
+- **Production**: 16K tokens (balanced performance)
+- **Advanced**: 32K+ tokens (heavy development work)
+
+### 2. Educational Content
+Help users understand:
+- Why context window size matters for RAG
+- Hardware implications of larger contexts
+- Optimal settings for their use case
+- Model-specific context capabilities
+
+### 3. Consistent Implementation
+- Update all Ollama API calls to use consistent context settings
+- Ensure the configuration applies across synthesis, expansion, and exploration
+- Validate context sizes against model capabilities
+- Provide clear error messages for invalid configurations
+
+## Technical Implementation
+
+Based on research findings:
+
+### Model Context Capabilities
+- **qwen3:0.6b/1.7b**: 32K token maximum
+- **qwen3:4b**: 131K token maximum (YaRN extended)
+
+### Recommended Context Sizes
+```yaml
+# Conservative (fast, low memory)
+num_ctx: 8192   # ~6MB memory, excellent for exploration
+
+# Balanced (recommended for most users)
+num_ctx: 16384  # ~12MB memory, handles complex analysis
+
+# Advanced (heavy development work)
+num_ctx: 32768  # ~24MB memory, supports large codebases
+```
+
+### Configuration Integration
+- Add context window selection to the TUI configuration menu
+- Update the config.yaml schema with context parameters
+- Implement validation for model-specific limits
+- Provide migration for existing configurations
+
+## Benefits
+
+1. **Improved User Experience**
+   - Longer conversation sessions
+   - Better analysis quality
+   - Clear performance expectations
+
+2. **Professional RAG Capability**
+   - Support for enterprise-scale projects
+   - Handles large codebases effectively
+   - Enables advanced use cases
+
+3. **Educational Value**
+   - Users learn about context windows
+   - Better understanding of RAG performance
+   - Informed decision making
+
+## Implementation Plan
+
+1. **Phase 1**: Research Ollama context handling (āœ… Complete)
+2. **Phase 2**: Update the configuration system
+3. **Phase 3**: Enhance the TUI with context selection
+4. **Phase 4**: Update all API calls consistently
+5. **Phase 5**: Add documentation and validation
+
+## Questions for Review
+
+1. Should we auto-detect optimal context based on available memory?
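+
+   One possible shape for this auto-detection (purely an illustrative sketch, not part of this PR: the helper name, the RAM thresholds, and the use of `psutil` are all assumptions):
+
+   ```python
+   import psutil  # assumed dependency for reading available RAM
+
+   def suggest_context_window(model_limit: int = 32768) -> int:
+       """Pick a num_ctx preset from available memory, capped at the model limit."""
+       available_gb = psutil.virtual_memory().available / (1024 ** 3)
+       if available_gb >= 16:
+           suggested = 32768   # "advanced" preset
+       elif available_gb >= 8:
+           suggested = 16384   # "production" preset
+       else:
+           suggested = 8192    # "development" preset
+       return min(suggested, model_limit)
+   ```
+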
+2. How should we handle model changes that affect context capabilities? +3. Should context be per-model or global configuration? +4. What validation should we provide for context/model combinations? + +--- + +**This PR will significantly improve FSS-Mini-RAG's performance and user experience by properly configuring one of the most critical parameters for RAG systems.** \ No newline at end of file diff --git a/mini_rag/config.py b/mini_rag/config.py index 1eab053..5f31228 100644 --- a/mini_rag/config.py +++ b/mini_rag/config.py @@ -81,6 +81,10 @@ class LLMConfig: enable_thinking: bool = True # Enable thinking mode for Qwen3 models cpu_optimized: bool = True # Prefer lightweight models + # Context window configuration (critical for RAG performance) + context_window: int = 16384 # Context window size in tokens (16K recommended) + auto_context: bool = True # Auto-adjust context based on model capabilities + # Model preference rankings (configurable) model_rankings: list = None # Will be set in __post_init__ @@ -255,6 +259,11 @@ class ConfigManager: f" max_expansion_terms: {config_dict['llm']['max_expansion_terms']} # Maximum terms to add to queries", f" enable_synthesis: {str(config_dict['llm']['enable_synthesis']).lower()} # Enable synthesis by default", f" synthesis_temperature: {config_dict['llm']['synthesis_temperature']} # LLM temperature for analysis", + "", + " # Context window configuration (critical for RAG performance)", + f" context_window: {config_dict['llm']['context_window']} # Context size in tokens (8K=fast, 16K=balanced, 32K=advanced)", + f" auto_context: {str(config_dict['llm']['auto_context']).lower()} # Auto-adjust context based on model capabilities", + "", " model_rankings: # Preferred model order (edit to change priority)", ]) diff --git a/mini_rag/explorer.py b/mini_rag/explorer.py index c95a7ec..b50d040 100644 --- a/mini_rag/explorer.py +++ b/mini_rag/explorer.py @@ -408,7 +408,7 @@ Guidelines: "temperature": temperature, "top_p": optimal_params.get("top_p", 0.9), "top_k": optimal_params.get("top_k", 40), - "num_ctx": optimal_params.get("num_ctx", 32768), + "num_ctx": self.synthesizer._get_optimal_context_size(model_to_use), "num_predict": optimal_params.get("num_predict", 2000), "repeat_penalty": optimal_params.get("repeat_penalty", 1.1), "presence_penalty": optimal_params.get("presence_penalty", 1.0) diff --git a/mini_rag/llm_synthesizer.py b/mini_rag/llm_synthesizer.py index b85056e..c4ed906 100644 --- a/mini_rag/llm_synthesizer.py +++ b/mini_rag/llm_synthesizer.py @@ -114,6 +114,51 @@ class LLMSynthesizer: self._initialized = True + def _get_optimal_context_size(self, model_name: str) -> int: + """Get optimal context size based on model capabilities and configuration.""" + # Get configured context window + if self.config and hasattr(self.config, 'llm'): + configured_context = self.config.llm.context_window + auto_context = getattr(self.config.llm, 'auto_context', True) + else: + configured_context = 16384 # Default to 16K + auto_context = True + + # Model-specific maximum context windows (based on research) + model_limits = { + # Qwen3 models with native context support + 'qwen3:0.6b': 32768, # 32K native + 'qwen3:1.7b': 32768, # 32K native + 'qwen3:4b': 131072, # 131K with YaRN extension + + # Qwen2.5 models + 'qwen2.5:1.5b': 32768, # 32K native + 'qwen2.5:3b': 32768, # 32K native + 'qwen2.5-coder:1.5b': 32768, # 32K native + + # Fallback for unknown models + 'default': 8192 + } + + # Find model limit (check for partial matches) + model_limit = 
model_limits.get('default', 8192) + for model_pattern, limit in model_limits.items(): + if model_pattern != 'default' and model_pattern.lower() in model_name.lower(): + model_limit = limit + break + + # If auto_context is enabled, respect model limits + if auto_context: + optimal_context = min(configured_context, model_limit) + else: + optimal_context = configured_context + + # Ensure minimum usable context for RAG + optimal_context = max(optimal_context, 4096) # Minimum 4K for basic RAG + + logger.debug(f"Context for {model_name}: {optimal_context} tokens (configured: {configured_context}, limit: {model_limit})") + return optimal_context + def is_available(self) -> bool: """Check if Ollama is available and has models.""" self._ensure_initialized() @@ -174,7 +219,7 @@ class LLMSynthesizer: "temperature": qwen3_temp, "top_p": qwen3_top_p, "top_k": qwen3_top_k, - "num_ctx": 32000, # Critical: Qwen3 context length (32K token limit) + "num_ctx": self._get_optimal_context_size(model_to_use), # Dynamic context based on model and config "num_predict": optimal_params.get("num_predict", 2000), "repeat_penalty": optimal_params.get("repeat_penalty", 1.1), "presence_penalty": qwen3_presence diff --git a/rag-tui.py b/rag-tui.py index 5895310..41fcdb3 100755 --- a/rag-tui.py +++ b/rag-tui.py @@ -1353,8 +1353,10 @@ Your suggested question (under 10 words):""" config_path = self.project_path / '.mini-rag' / 'config.yaml' print("šŸ“‹ Current Settings:") + print(f" šŸ¤– AI model: {config.llm.synthesis_model}") + print(f" 🧠 Context window: {config.llm.context_window} tokens") print(f" šŸ“ Chunk size: {config.chunking.max_size} characters") - print(f" 🧠 Chunking strategy: {config.chunking.strategy}") + print(f" šŸ”„ Chunking strategy: {config.chunking.strategy}") print(f" šŸ” Search results: {config.search.default_top_k} results") print(f" šŸ“Š Embedding method: {config.embedding.preferred_method}") print(f" šŸš€ Query expansion: {'enabled' if config.search.expand_queries else 'disabled'}") @@ -1362,12 +1364,14 @@ Your suggested question (under 10 words):""" print() print("šŸ› ļø Quick Configuration Options:") - print(" 1. Adjust chunk size (performance vs accuracy)") - print(" 2. Toggle query expansion (smarter searches)") - print(" 3. Configure search behavior") - print(" 4. View/edit full configuration file") - print(" 5. Reset to defaults") - print(" 6. Advanced settings") + print(" 1. Select AI model (Fast/Recommended/Quality)") + print(" 2. Configure context window (Development/Production/Advanced)") + print(" 3. Adjust chunk size (performance vs accuracy)") + print(" 4. Toggle query expansion (smarter searches)") + print(" 5. Configure search behavior") + print(" 6. View/edit full configuration file") + print(" 7. Reset to defaults") + print(" 8. Advanced settings") print() print(" V. View current config file") print(" B. 
Back to main menu") @@ -1386,16 +1390,20 @@ Your suggested question (under 10 words):""" elif choice == 'v': self._show_config_file(config_path) elif choice == '1': - self._configure_chunk_size(config_manager, config) + self._configure_llm_model(config_manager, config) elif choice == '2': - self._toggle_query_expansion(config_manager, config) + self._configure_context_window(config_manager, config) elif choice == '3': - self._configure_search_behavior(config_manager, config) + self._configure_chunk_size(config_manager, config) elif choice == '4': - self._edit_config_file(config_path) + self._toggle_query_expansion(config_manager, config) elif choice == '5': - self._reset_config(config_manager) + self._configure_search_behavior(config_manager, config) elif choice == '6': + self._edit_config_file(config_path) + elif choice == '7': + self._reset_config(config_manager) + elif choice == '8': self._advanced_settings(config_manager, config) else: print("Invalid option. Press Enter to continue...") @@ -1422,6 +1430,340 @@ Your suggested question (under 10 words):""" print("\n" + "=" * 50) input("Press Enter to continue...") + def _configure_llm_model(self, config_manager, config): + """Interactive LLM model selection with download capability.""" + self.clear_screen() + print("šŸ¤– AI Model Configuration") + print("=========================") + print() + + # Check if Ollama is available + import subprocess + import requests + + ollama_available = False + try: + subprocess.run(['ollama', '--version'], capture_output=True, check=True) + response = requests.get("http://localhost:11434/api/version", timeout=3) + ollama_available = response.status_code == 200 + except: + pass + + if not ollama_available: + print("āŒ Ollama not available") + print() + print("To use AI features, please:") + print(" 1. Install Ollama: https://ollama.com/download") + print(" 2. Start the service: ollama serve") + print(" 3. Return to this menu") + print() + input("Press Enter to continue...") + return + + # Get available models + try: + available_models = subprocess.run(['ollama', 'list'], capture_output=True, text=True, check=True) + model_lines = available_models.stdout.strip().split('\n')[1:] # Skip header + installed_models = [line.split()[0] for line in model_lines if line.strip()] + except: + installed_models = [] + + print("🧠 Why Small Models Work Great for RAG") + print("=====================================") + print() + print("RAG systems like FSS-Mini-RAG don't need massive models because:") + print("• The relevant code/docs are provided as context") + print("• Models focus on analysis, not memorizing facts") + print("• Even 0.6B models give excellent results with good context") + print("• Smaller models = faster responses = better user experience") + print() + print("šŸ’” Advanced Use: For heavy development work with 15+ results") + print(" and 4000+ character chunks, even these models excel!") + print(" The 4B Qwen3 model will help you code remarkably well.") + print() + + # Model options + model_options = { + 'fast': { + 'model': 'qwen3:0.6b', + 'description': 'Ultra-fast responses (~500MB)', + 'details': 'Perfect for quick searches and exploration. Surprisingly capable!' + }, + 'recommended': { + 'model': 'qwen3:1.7b', + 'description': 'Best balance of speed and quality (~1.4GB)', + 'details': 'Ideal for most users. Great analysis with good speed.' 
+ }, + 'quality': { + 'model': 'qwen3:4b', + 'description': 'Highest quality responses (~2.5GB)', + 'details': 'Excellent for coding assistance and detailed analysis.' + } + } + + print("šŸŽÆ Recommended Models:") + print() + for key, info in model_options.items(): + is_installed = any(info['model'] in model for model in installed_models) + status = "āœ… Installed" if is_installed else "šŸ“„ Available for download" + + print(f" {key.upper()}: {info['model']}") + print(f" {info['description']} - {status}") + print(f" {info['details']}") + print() + + current_model = config.llm.synthesis_model + print(f"Current model: {current_model}") + print() + + print("Options:") + print(" F. Select Fast model (qwen3:0.6b)") + print(" R. Select Recommended model (qwen3:1.7b)") + print(" Q. Select Quality model (qwen3:4b)") + print(" C. Keep current model") + print(" B. Back to configuration menu") + print() + + choice = input("Choose option: ").strip().lower() + + selected_model = None + if choice == 'f': + selected_model = model_options['fast']['model'] + elif choice == 'r': + selected_model = model_options['recommended']['model'] + elif choice == 'q': + selected_model = model_options['quality']['model'] + elif choice == 'c': + print("Keeping current model.") + input("Press Enter to continue...") + return + elif choice == 'b': + return + else: + print("Invalid option.") + input("Press Enter to continue...") + return + + # Check if model is installed + model_installed = any(selected_model in model for model in installed_models) + + if not model_installed: + print(f"\nšŸ“„ Model {selected_model} not installed.") + print("Would you like to download it now?") + print("This may take 2-5 minutes depending on your internet speed.") + print() + + download = input("Download now? [Y/n]: ").strip().lower() + if download != 'n' and download != 'no': + print(f"\nšŸ”„ Downloading {selected_model}...") + print("This may take a few minutes...") + + try: + result = subprocess.run(['ollama', 'pull', selected_model], + capture_output=True, text=True, check=True) + print(f"āœ… Successfully downloaded {selected_model}") + model_installed = True + except subprocess.CalledProcessError as e: + print(f"āŒ Download failed: {e}") + print("You can try downloading manually later with:") + print(f" ollama pull {selected_model}") + input("Press Enter to continue...") + return + else: + print("Model not downloaded. 
Configuration not changed.") + input("Press Enter to continue...") + return + + if model_installed: + # Update configuration + config.llm.synthesis_model = selected_model + config.llm.expansion_model = selected_model # Keep them in sync + + try: + config_manager.save_config(config) + print(f"\nāœ… Model updated to {selected_model}") + print("Configuration saved successfully!") + except Exception as e: + print(f"āŒ Error saving configuration: {e}") + + print() + input("Press Enter to continue...") + + def _configure_context_window(self, config_manager, config): + """Interactive context window configuration.""" + self.clear_screen() + print("🧠 Context Window Configuration") + print("===============================") + print() + + print("šŸ’” Why Context Window Size Matters for RAG") + print("==========================================") + print() + print("Context window determines how much text the AI can 'remember' during conversation:") + print() + print("āŒ Default 2048 tokens = Only 1-2 responses before forgetting") + print("āœ… Proper context = 5-15+ responses with maintained conversation") + print() + print("For RAG systems like FSS-Mini-RAG:") + print("• Larger context = better analysis of multiple code files") + print("• Thinking tokens consume ~200-500 tokens per response") + print("• Search results can be 1000-3000 tokens depending on chunk size") + print("• Conversation history builds up over time") + print() + print("šŸ’» Memory Usage Impact:") + print("• 8K context ā‰ˆ 6MB memory per conversation") + print("• 16K context ā‰ˆ 12MB memory per conversation") + print("• 32K context ā‰ˆ 24MB memory per conversation") + print() + + current_context = config.llm.context_window + current_model = config.llm.synthesis_model + + # Get model capabilities + model_limits = { + 'qwen3:0.6b': 32768, + 'qwen3:1.7b': 32768, + 'qwen3:4b': 131072, + 'qwen2.5:1.5b': 32768, + 'qwen2.5:3b': 32768, + 'default': 8192 + } + + model_limit = model_limits.get('default', 8192) + for model_pattern, limit in model_limits.items(): + if model_pattern != 'default' and model_pattern.lower() in current_model.lower(): + model_limit = limit + break + + print(f"Current model: {current_model}") + print(f"Model maximum: {model_limit:,} tokens") + print(f"Current setting: {current_context:,} tokens") + print() + + # Context options + context_options = { + 'development': { + 'size': 8192, + 'description': 'Fast and efficient for most development work', + 'details': 'Perfect for code exploration and basic analysis. Quick responses.', + 'memory': '~6MB' + }, + 'production': { + 'size': 16384, + 'description': 'Balanced performance for professional use', + 'details': 'Ideal for most users. Handles complex analysis well.', + 'memory': '~12MB' + }, + 'advanced': { + 'size': 32768, + 'description': 'Maximum performance for heavy development', + 'details': 'For large codebases, 15+ search results, complex analysis.', + 'memory': '~24MB' + } + } + + print("šŸŽÆ Recommended Context Sizes:") + print() + for key, info in context_options.items(): + # Check if this size is supported by current model + if info['size'] <= model_limit: + status = "āœ… Supported" + else: + status = f"āŒ Exceeds model limit ({model_limit:,})" + + print(f" {key.upper()}: {info['size']:,} tokens ({info['memory']})") + print(f" {info['description']} - {status}") + print(f" {info['details']}") + print() + + print("Options:") + print(" D. Development (8K tokens - fast)") + print(" P. Production (16K tokens - balanced)") + print(" A. 
Advanced (32K tokens - maximum)") + print(" C. Custom size (manual entry)") + print(" K. Keep current setting") + print(" B. Back to configuration menu") + print() + + choice = input("Choose option: ").strip().lower() + + new_context = None + if choice == 'd': + new_context = context_options['development']['size'] + elif choice == 'p': + new_context = context_options['production']['size'] + elif choice == 'a': + new_context = context_options['advanced']['size'] + elif choice == 'c': + print() + print("Enter custom context size in tokens:") + print(f" Minimum: 4096 (4K)") + print(f" Maximum for {current_model}: {model_limit:,}") + print() + try: + custom_size = int(input("Context size: ").strip()) + if custom_size < 4096: + print("āŒ Context too small. Minimum is 4096 tokens for RAG.") + input("Press Enter to continue...") + return + elif custom_size > model_limit: + print(f"āŒ Context too large. Maximum for {current_model} is {model_limit:,} tokens.") + input("Press Enter to continue...") + return + else: + new_context = custom_size + except ValueError: + print("āŒ Invalid number.") + input("Press Enter to continue...") + return + elif choice == 'k': + print("Keeping current context setting.") + input("Press Enter to continue...") + return + elif choice == 'b': + return + else: + print("Invalid option.") + input("Press Enter to continue...") + return + + if new_context: + # Validate against model capabilities + if new_context > model_limit: + print(f"āš ļø Warning: {new_context:,} tokens exceeds {current_model} limit of {model_limit:,}") + print("The system will automatically cap at the model limit.") + print() + + # Update configuration + config.llm.context_window = new_context + + try: + config_manager.save_config(config) + print(f"āœ… Context window updated to {new_context:,} tokens") + print() + + # Provide usage guidance + if new_context >= 32768: + print("šŸš€ Advanced context enabled!") + print("• Perfect for large codebases and complex analysis") + print("• Try cranking up search results to 15+ for deep exploration") + print("• Increase chunk size to 4000+ characters for comprehensive context") + elif new_context >= 16384: + print("āš–ļø Balanced context configured!") + print("• Great for professional development work") + print("• Supports extended conversations and analysis") + elif new_context >= 8192: + print("⚔ Development context set!") + print("• Fast responses with good conversation length") + print("• Perfect for code exploration and basic analysis") + + print("Configuration saved successfully!") + except Exception as e: + print(f"āŒ Error saving configuration: {e}") + + print() + input("Press Enter to continue...") + def _configure_chunk_size(self, config_manager, config): """Interactive chunk size configuration.""" self.clear_screen()