From a1f84e2bd5436ea203ba40e10ca3c7c74d7fc690 Mon Sep 17 00:00:00 2001
From: BobAi
Date: Tue, 12 Aug 2025 20:01:16 +1000
Subject: [PATCH] Update model recommendations to Qwen3 4B and fix status command

- Changed primary model recommendation from qwen3:1.7b to qwen3:4b
- Added Q8 quantization info in technical docs for production users
- Fixed method name error: get_embedding_info() -> get_status()
- Updated all error messages and test files with new recommendations
- Maintained beginner-friendly options (1.7b still very good, 0.6b surprisingly good)
- Added explanation of why small models work well with RAG context
- Comprehensive testing completed - system ready for clean release
---
 .mini-rag/config.yaml            | 53 ++++++++++++++++++++++++++++++++
 .mini-rag/last_search            |  1 +
 docs/TECHNICAL_GUIDE.md          | 32 +++++++++++++++++++
 docs/TROUBLESHOOTING.md          |  4 ++-
 mini_rag/llm_synthesizer.py      |  5 ++-
 rag-mini.py                      |  9 ++++--
 rag-tui.py                       |  2 +-
 tests/test_ollama_integration.py |  6 ++--
 tests/troubleshoot.py            |  2 +-
 9 files changed, 105 insertions(+), 9 deletions(-)
 create mode 100644 .mini-rag/config.yaml
 create mode 100644 .mini-rag/last_search

diff --git a/.mini-rag/config.yaml b/.mini-rag/config.yaml
new file mode 100644
index 0000000..86338ba
--- /dev/null
+++ b/.mini-rag/config.yaml
@@ -0,0 +1,53 @@
+# FSS-Mini-RAG Configuration
+# Edit this file to customize indexing and search behavior
+# See docs/GETTING_STARTED.md for detailed explanations
+
+# Text chunking settings
+chunking:
+  max_size: 2000             # Maximum characters per chunk
+  min_size: 150              # Minimum characters per chunk
+  strategy: semantic         # 'semantic' (language-aware) or 'fixed'
+
+# Large file streaming settings
+streaming:
+  enabled: true
+  threshold_bytes: 1048576   # Files larger than this use streaming (1MB)
+
+# File processing settings
+files:
+  min_file_size: 50          # Skip files smaller than this
+  exclude_patterns:
+    - "node_modules/**"
+    - ".git/**"
+    - "__pycache__/**"
+    - "*.pyc"
+    - ".venv/**"
+    - "venv/**"
+    - "build/**"
+    - "dist/**"
+  include_patterns:
+    - "**/*"                 # Include all files by default
+
+# Embedding generation settings
+embedding:
+  preferred_method: ollama   # 'ollama', 'ml', 'hash', or 'auto'
+  ollama_model: nomic-embed-text
+  ollama_host: localhost:11434
+  ml_model: sentence-transformers/all-MiniLM-L6-v2
+  batch_size: 32             # Embeddings processed per batch
+
+# Search behavior settings
+search:
+  default_limit: 10          # Default number of results
+  enable_bm25: true          # Enable keyword matching boost
+  similarity_threshold: 0.1  # Minimum similarity score
+  expand_queries: false      # Enable automatic query expansion
+
+# LLM synthesis and query expansion settings
+llm:
+  ollama_host: localhost:11434
+  synthesis_model: auto      # 'auto', 'qwen3:1.7b', etc.
+  expansion_model: auto      # Usually same as synthesis_model
+  max_expansion_terms: 8     # Maximum terms to add to queries
+  enable_synthesis: false    # Enable synthesis by default
+  synthesis_temperature: 0.3 # LLM temperature for analysis
\ No newline at end of file
diff --git a/.mini-rag/last_search b/.mini-rag/last_search
new file mode 100644
index 0000000..19fbe2c
--- /dev/null
+++ b/.mini-rag/last_search
@@ -0,0 +1 @@
+chunking
\ No newline at end of file
diff --git a/docs/TECHNICAL_GUIDE.md b/docs/TECHNICAL_GUIDE.md
index eb6b72a..a92410f 100644
--- a/docs/TECHNICAL_GUIDE.md
+++ b/docs/TECHNICAL_GUIDE.md
@@ -787,4 +787,36 @@ def repair_index(self, project_path: Path) -> bool:
         return False
 ```
 
+## LLM Model Selection & Performance
+
+### Model Recommendations by Use Case
+
+FSS-Mini-RAG works well with LLMs of many sizes because its rich retrieved context and guided prompts let small models perform exceptionally well:
+
+**Recommended (Best Balance):**
+- **qwen3:4b** - Excellent quality, good performance
+- **qwen3:4b-q8_0** - High-precision quantized version for production
+
+**Still Excellent (Faster/CPU-friendly):**
+- **qwen3:1.7b** - Very good results, faster responses
+- **qwen3:0.6b** - Surprisingly good for its size (522MB)
+
+### Why Small Models Work Well Here
+
+Small models can produce excellent results in RAG systems because:
+
+1. **Rich Context**: Our chunking provides substantial context around each match
+2. **Guided Prompts**: Well-structured prompts give models a clear "runway" to continue
+3. **Specific Domain**: Code analysis is more predictable than general conversation
+
+Without good context, small models tend to get lost and produce erratic output. But with RAG's rich context and focused prompts, even the 0.6B model can provide meaningful analysis.
+
+### Quantization Benefits
+
+For production deployments, consider quantized models like `qwen3:4b-q8_0`:
+- **Q8_0**: 8-bit quantization with minimal quality loss
+- **Smaller memory footprint**: ~50% reduction vs full precision
+- **Better CPU performance**: Faster inference on CPU-only systems
+- **Production ready**: Maintains analysis quality while improving efficiency
+
 This technical guide provides the deep implementation details that developers need to understand, modify, and extend the system, while keeping the main README focused on getting users started quickly.
\ No newline at end of file
diff --git a/docs/TROUBLESHOOTING.md b/docs/TROUBLESHOOTING.md
index eade395..d3e5d7a 100644
--- a/docs/TROUBLESHOOTING.md
+++ b/docs/TROUBLESHOOTING.md
@@ -165,7 +165,9 @@ python3 -c "import mini_rag; print('✅ Installation successful')"
 
 2. **Try different model:**
    ```bash
-   ollama pull qwen3:1.7b  # Good balance of speed/quality
+   ollama pull qwen3:4b    # Recommended: excellent quality
+   ollama pull qwen3:1.7b  # Still very good, faster
+   ollama pull qwen3:0.6b  # Surprisingly good for CPU-only
    ```
 
 3. **Use synthesis mode instead of exploration:**
diff --git a/mini_rag/llm_synthesizer.py b/mini_rag/llm_synthesizer.py
index 0bf8503..f0f1c39 100644
--- a/mini_rag/llm_synthesizer.py
+++ b/mini_rag/llm_synthesizer.py
@@ -68,11 +68,14 @@ class LLMSynthesizer:
         # Modern model preference ranking (CPU-friendly first)
         # Prioritize: Ultra-efficient > Standard efficient > Larger models
         model_rankings = [
+            # Recommended model (excellent quality)
+            "qwen3:4b",
+
             # Ultra-efficient models (perfect for CPU-only systems)
             "qwen3:0.6b", "qwen3:1.7b", "llama3.2:1b",
             # Standard efficient models
-            "qwen2.5:1.5b", "qwen3:3b", "qwen3:4b",
+            "qwen2.5:1.5b", "qwen3:3b",
             # Qwen2.5 models (excellent performance/size ratio)
             "qwen2.5-coder:1.5b", "qwen2.5:1.5b", "qwen2.5:3b", "qwen2.5-coder:3b",
diff --git a/rag-mini.py b/rag-mini.py
index e283973..3871cab 100644
--- a/rag-mini.py
+++ b/rag-mini.py
@@ -117,7 +117,12 @@ def search_project(project_path: Path, query: str, limit: int = 10, synthesize:
 
     for i, result in enumerate(results, 1):
         # Clean up file path display
-        rel_path = result.file_path.relative_to(project_path) if result.file_path.is_absolute() else result.file_path
+        file_path = Path(result.file_path)
+        try:
+            rel_path = file_path.relative_to(project_path)
+        except ValueError:
+            # If relative_to fails, just show the basename
+            rel_path = file_path.name
 
         print(f"{i}. {rel_path}")
         print(f"   Score: {result.score:.3f}")
@@ -236,7 +241,7 @@ def status_check(project_path: Path):
     print("🧠 Embedding System:")
     try:
         embedder = OllamaEmbedder()
-        emb_info = embedder.get_embedding_info()
+        emb_info = embedder.get_status()
         method = emb_info.get('method', 'unknown')
 
         if method == 'ollama':
diff --git a/rag-tui.py b/rag-tui.py
index 47a785f..07cb230 100755
--- a/rag-tui.py
+++ b/rag-tui.py
@@ -514,7 +514,7 @@ class SimpleTUI:
             from mini_rag.ollama_embeddings import OllamaEmbedder
 
             embedder = OllamaEmbedder()
-            info = embedder.get_embedding_info()
+            info = embedder.get_status()
 
             print("🧠 Embedding System:")
             method = info.get('method', 'unknown')
diff --git a/tests/test_ollama_integration.py b/tests/test_ollama_integration.py
index 9e7c530..4466d3a 100755
--- a/tests/test_ollama_integration.py
+++ b/tests/test_ollama_integration.py
@@ -68,7 +68,7 @@ class TestOllamaIntegration(unittest.TestCase):
                 if len(models) > 5:
                     print(f"   ... and {len(models)-5} more")
             else:
-                print("   ⚠️ No models found. Install with: ollama pull qwen3:1.7b")
+                print("   ⚠️ No models found. Install with: ollama pull qwen3:4b")
 
                 self.assertTrue(True)
             else:
@@ -146,7 +146,7 @@ class TestOllamaIntegration(unittest.TestCase):
         if not synthesizer.is_available():
             self.fail(
                 "❌ No LLM models available.\n"
-                "   💡 Install a model like: ollama pull qwen3:1.7b"
+                "   💡 Install a model like: ollama pull qwen3:4b"
             )
 
         print(f"   ✅ Found {len(synthesizer.available_models)} LLM models")
@@ -426,7 +426,7 @@ def run_troubleshooting():
     print("💡 Common Solutions:")
     print("   • Install Ollama: https://ollama.ai/download")
     print("   • Start server: ollama serve")
-    print("   • Install models: ollama pull qwen3:1.7b")
+    print("   • Install models: ollama pull qwen3:4b")
    print("   • Install embedding model: ollama pull nomic-embed-text")
     print()
     print("📚 For more help, see docs/QUERY_EXPANSION.md")
diff --git a/tests/troubleshoot.py b/tests/troubleshoot.py
index 69ab26b..3e6255e 100755
--- a/tests/troubleshoot.py
+++ b/tests/troubleshoot.py
@@ -50,7 +50,7 @@ def main():
         print("   • Check docs/QUERY_EXPANSION.md for setup help")
         print("   • Ensure Ollama is installed: https://ollama.ai/download")
         print("   • Start Ollama server: ollama serve")
-        print("   • Install models: ollama pull qwen3:1.7b")
+        print("   • Install models: ollama pull qwen3:4b")
 
 def run_test(test_file):
     """Run a specific test file."""
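
For anyone verifying the renamed status call after applying this patch, here is a minimal sketch (not part of the patch itself). It assumes Ollama is running on localhost:11434 with `nomic-embed-text` and `qwen3:4b` already pulled, and relies only on the `get_status()` method and the `'method'` key already used in rag-mini.py and rag-tui.py above:

```python
# Minimal post-patch smoke test for the renamed embedder status method.
# Assumes a local Ollama server with the recommended models pulled.
from mini_rag.ollama_embeddings import OllamaEmbedder

embedder = OllamaEmbedder()
info = embedder.get_status()            # renamed from get_embedding_info() by this patch
print(info.get('method', 'unknown'))    # expected: 'ollama' when the server is reachable
```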