From c201b3badd941411311ae790a66a93586f64f72d Mon Sep 17 00:00:00 2001 From: BobAi Date: Fri, 15 Aug 2025 09:47:15 +1000 Subject: [PATCH] Fix critical deployment issues and improve system reliability Major fixes: - Fix model selection to prioritize qwen3:1.7b instead of qwen3:4b for testing - Correct context length from 80,000 to 32,000 tokens (proper Qwen3 limit) - Implement content-preserving safeguards instead of dropping responses - Fix all test imports from claude_rag to mini_rag module naming - Add virtual environment warnings to all test entry points - Fix TUI EOF crash handling with proper error handling - Remove warmup delays that were causing startup lag and unwanted model calls - Fix command mappings between bash wrapper and Python script - Update documentation to reflect qwen3:1.7b as primary recommendation - Improve TUI box alignment and formatting - Make language generic for any documents, not just codebases - Add proper folder names in user feedback instead of generic terms Technical improvements: - Unified model rankings across all components - Better error handling for missing dependencies - Comprehensive testing and validation of all fixes - All tests now pass and system is deployment-ready All major crashes and deployment issues resolved. --- .mini-rag/config.yaml | 53 ++ .mini-rag/last_search | 1 + docs/CPU_DEPLOYMENT.md | 2 +- docs/GETTING_STARTED.md | 2 +- docs/TECHNICAL_GUIDE.md | 14 +- docs/TROUBLESHOOTING.md | 8 +- docs/TUI_GUIDE.md | 2 +- examples/basic_usage.py | 2 +- examples/config-beginner.yaml | 2 +- examples/config-fast.yaml | 2 +- examples/config-llm-providers.yaml | 2 +- examples/config-quality.yaml | 2 +- examples/config.yaml | 2 +- install_mini_rag.sh | 109 ++-- mini_rag/cli.py | 11 +- mini_rag/config.py | 35 +- mini_rag/explorer.py | 289 ++++++++-- mini_rag/fast_server.py | 5 + mini_rag/indexer.py | 19 +- mini_rag/llm_safeguards.py | 27 +- mini_rag/llm_synthesizer.py | 254 ++++++--- mini_rag/query_expander.py | 25 +- mini_rag/search.py | 17 +- mini_rag/venv_checker.py | 142 +++++ rag-mini | 5 +- rag-mini.py | 26 +- rag-tui.py | 868 ++++++++++++++++++++++++----- test_fixes.py | 230 ++++++++ tests/01_basic_integration_test.py | 25 +- tests/02_search_examples.py | 10 +- tests/03_system_validation.py | 4 +- tests/test_context_retrieval.py | 24 +- tests/test_hybrid_search.py | 23 +- tests/test_ollama_integration.py | 4 +- tests/test_rag_integration.py | 56 +- 35 files changed, 1857 insertions(+), 445 deletions(-) create mode 100644 .mini-rag/config.yaml create mode 100644 .mini-rag/last_search create mode 100644 mini_rag/venv_checker.py create mode 100644 test_fixes.py diff --git a/.mini-rag/config.yaml b/.mini-rag/config.yaml new file mode 100644 index 0000000..4f552fe --- /dev/null +++ b/.mini-rag/config.yaml @@ -0,0 +1,53 @@ +# FSS-Mini-RAG Configuration +# Edit this file to customize indexing and search behavior +# See docs/GETTING_STARTED.md for detailed explanations + +# Text chunking settings +chunking: + max_size: 2000 # Maximum characters per chunk + min_size: 150 # Minimum characters per chunk + strategy: semantic # 'semantic' (language-aware) or 'fixed' + +# Large file streaming settings +streaming: + enabled: true + threshold_bytes: 1048576 # Files larger than this use streaming (1MB) + +# File processing settings +files: + min_file_size: 50 # Skip files smaller than this + exclude_patterns: + - "node_modules/**" + - ".git/**" + - "__pycache__/**" + - "*.pyc" + - ".venv/**" + - "venv/**" + - "build/**" + - "dist/**" + include_patterns: + - "**/*" 
# Include all files by default + +# Embedding generation settings +embedding: + preferred_method: ollama # 'ollama', 'ml', 'hash', or 'auto' + ollama_model: nomic-embed-text + ollama_host: localhost:11434 + ml_model: sentence-transformers/all-MiniLM-L6-v2 + batch_size: 32 # Embeddings processed per batch + +# Search behavior settings +search: + default_top_k: 10 # Default number of top results + enable_bm25: true # Enable keyword matching boost + similarity_threshold: 0.1 # Minimum similarity score + expand_queries: false # Enable automatic query expansion + +# LLM synthesis and query expansion settings +llm: + ollama_host: localhost:11434 + synthesis_model: auto # 'auto', 'qwen3:1.7b', etc. + expansion_model: auto # Usually same as synthesis_model + max_expansion_terms: 8 # Maximum terms to add to queries + enable_synthesis: false # Enable synthesis by default + synthesis_temperature: 0.3 # LLM temperature for analysis \ No newline at end of file diff --git a/.mini-rag/last_search b/.mini-rag/last_search new file mode 100644 index 0000000..30d74d2 --- /dev/null +++ b/.mini-rag/last_search @@ -0,0 +1 @@ +test \ No newline at end of file diff --git a/docs/CPU_DEPLOYMENT.md b/docs/CPU_DEPLOYMENT.md index cd3da53..48458be 100644 --- a/docs/CPU_DEPLOYMENT.md +++ b/docs/CPU_DEPLOYMENT.md @@ -67,7 +67,7 @@ llm: # Aggressive caching for CPU systems search: expand_queries: false # Enable only in TUI - default_limit: 8 # Slightly fewer results for speed + default_top_k: 8 # Slightly fewer results for speed ``` ## System Requirements diff --git a/docs/GETTING_STARTED.md b/docs/GETTING_STARTED.md index 38b93be..63af487 100644 --- a/docs/GETTING_STARTED.md +++ b/docs/GETTING_STARTED.md @@ -125,7 +125,7 @@ print(f"Indexed {result['files_processed']} files, {result['chunks_created']} ch # Search print("\nSearching for authentication code...") -results = searcher.search("user authentication logic", limit=5) +results = searcher.search("user authentication logic", top_k=5) for i, result in enumerate(results, 1): print(f"\n{i}. 
{result.file_path}") diff --git a/docs/TECHNICAL_GUIDE.md b/docs/TECHNICAL_GUIDE.md index a92410f..16d73c3 100644 --- a/docs/TECHNICAL_GUIDE.md +++ b/docs/TECHNICAL_GUIDE.md @@ -421,7 +421,7 @@ def _create_vector_table(self, chunks: List[CodeChunk], embeddings: np.ndarray): return table -def vector_search(self, query_embedding: np.ndarray, limit: int) -> List[SearchResult]: +def vector_search(self, query_embedding: np.ndarray, top_k: int) -> List[SearchResult]: """Fast vector similarity search.""" table = self.db.open_table("chunks") @@ -794,12 +794,12 @@ def repair_index(self, project_path: Path) -> bool: FSS-Mini-RAG works well with various LLM sizes because our rich context and guided prompts help small models perform excellently: **Recommended (Best Balance):** -- **qwen3:4b** - Excellent quality, good performance -- **qwen3:4b:q8_0** - High-precision quantized version for production +- **qwen3:1.7b** - Excellent quality with fast performance (default priority) +- **qwen3:0.6b** - Surprisingly good for CPU-only systems (522MB) -**Still Excellent (Faster/CPU-friendly):** -- **qwen3:1.7b** - Very good results, faster responses -- **qwen3:0.6b** - Surprisingly good considering size (522MB) +**Still Excellent (Slower but highest quality):** +- **qwen3:4b** - Highest quality, slower responses +- **qwen3:4b:q8_0** - High-precision quantized version for production ### Why Small Models Work Well Here @@ -813,7 +813,7 @@ Without good context, small models tend to get lost and produce erratic output. ### Quantization Benefits -For production deployments, consider quantized models like `qwen3:4b:q8_0`: +For production deployments, consider quantized models like `qwen3:1.7b:q8_0` or `qwen3:4b:q8_0`: - **Q8_0**: 8-bit quantization with minimal quality loss - **Smaller memory footprint**: ~50% reduction vs full precision - **Better CPU performance**: Faster inference on CPU-only systems diff --git a/docs/TROUBLESHOOTING.md b/docs/TROUBLESHOOTING.md index d3e5d7a..6ab3416 100644 --- a/docs/TROUBLESHOOTING.md +++ b/docs/TROUBLESHOOTING.md @@ -110,7 +110,7 @@ python3 -c "import mini_rag; print('āœ… Installation successful')" 2. **Reduce result limit:** ```yaml search: - default_limit: 5 # Instead of 10 + default_top_k: 5 # Instead of 10 ``` 3. **Use faster embedding method:** @@ -165,9 +165,9 @@ python3 -c "import mini_rag; print('āœ… Installation successful')" 2. **Try different model:** ```bash - ollama pull qwen3:4b # Recommended: excellent quality - ollama pull qwen3:1.7b # Still very good, faster - ollama pull qwen3:0.6b # Surprisingly good for CPU-only + ollama pull qwen3:1.7b # Recommended: excellent quality (default priority) + ollama pull qwen3:0.6b # Surprisingly good for CPU-only + ollama pull qwen3:4b # Highest quality, slower ``` 3. **Use synthesis mode instead of exploration:** diff --git a/docs/TUI_GUIDE.md b/docs/TUI_GUIDE.md index 4c46131..96cf2a4 100644 --- a/docs/TUI_GUIDE.md +++ b/docs/TUI_GUIDE.md @@ -154,7 +154,7 @@ That's it! The TUI will guide you through everything. 
- **chunking.strategy** - Smart (semantic) vs simple (fixed size) - **files.exclude_patterns** - Skip certain files/directories - **embedding.preferred_method** - AI model preference -- **search.default_limit** - How many results to show +- **search.default_top_k** - How many results to show **Interactive Options**: - **[V]iew config** - See full configuration file diff --git a/examples/basic_usage.py b/examples/basic_usage.py index ecac475..1d9d05d 100644 --- a/examples/basic_usage.py +++ b/examples/basic_usage.py @@ -50,7 +50,7 @@ def main(): print("\n4. Example searches:") for query in queries: print(f"\n Query: '{query}'") - results = searcher.search(query, limit=3) + results = searcher.search(query, top_k=3) if results: for i, result in enumerate(results, 1): diff --git a/examples/config-beginner.yaml b/examples/config-beginner.yaml index 538713a..6a0d174 100644 --- a/examples/config-beginner.yaml +++ b/examples/config-beginner.yaml @@ -41,7 +41,7 @@ embedding: # šŸ” Search behavior search: - default_limit: 10 # Show 10 results (good starting point) + default_top_k: 10 # Show 10 results (good starting point) enable_bm25: true # Find exact word matches too similarity_threshold: 0.1 # Pretty permissive (shows more results) expand_queries: false # Keep it simple for now diff --git a/examples/config-fast.yaml b/examples/config-fast.yaml index eec71ef..a7e00a9 100644 --- a/examples/config-fast.yaml +++ b/examples/config-fast.yaml @@ -62,7 +62,7 @@ embedding: # šŸ” Search optimized for speed search: - default_limit: 5 # Fewer results = faster display + default_top_k: 5 # Fewer results = faster display enable_bm25: false # Skip keyword matching for speed similarity_threshold: 0.2 # Higher threshold = fewer results to process expand_queries: false # No query expansion (much faster) diff --git a/examples/config-llm-providers.yaml b/examples/config-llm-providers.yaml index 5f3b6b4..7d8dbfc 100644 --- a/examples/config-llm-providers.yaml +++ b/examples/config-llm-providers.yaml @@ -53,7 +53,7 @@ embedding: batch_size: 32 search: - default_limit: 10 + default_top_k: 10 enable_bm25: true similarity_threshold: 0.1 expand_queries: false diff --git a/examples/config-quality.yaml b/examples/config-quality.yaml index 99b6979..844f121 100644 --- a/examples/config-quality.yaml +++ b/examples/config-quality.yaml @@ -44,7 +44,7 @@ embedding: # šŸ” Search optimized for comprehensive results search: - default_limit: 15 # More results to choose from + default_top_k: 15 # More results to choose from enable_bm25: true # Use both semantic and keyword matching similarity_threshold: 0.05 # Very permissive (show more possibilities) expand_queries: true # Automatic query expansion for better recall diff --git a/examples/config.yaml b/examples/config.yaml index 50ddca8..1c80d79 100644 --- a/examples/config.yaml +++ b/examples/config.yaml @@ -86,7 +86,7 @@ embedding: #═════════════════════════════════════════════════════════════════════════════════ search: - default_limit: 10 # How many search results to show by default + default_top_k: 10 # How many search results to show by default # šŸ’” MORE RESULTS: 15-20 | FASTER SEARCH: 5-8 enable_bm25: true # Also use keyword matching (like Google search) diff --git a/install_mini_rag.sh b/install_mini_rag.sh index 7dbf2cd..b6a3ad5 100755 --- a/install_mini_rag.sh +++ b/install_mini_rag.sh @@ -653,66 +653,28 @@ show_completion() { fi } -# Create sample project for testing -create_sample_project() { - local sample_dir="$SCRIPT_DIR/.sample_test" - rm -rf "$sample_dir" - mkdir 
-p "$sample_dir" - - # Create a few small sample files - cat > "$sample_dir/README.md" << 'EOF' -# Sample Project - -This is a sample project for testing FSS-Mini-RAG search capabilities. - -## Features - -- User authentication system -- Document processing -- Search functionality -- Email integration -EOF - - cat > "$sample_dir/auth.py" << 'EOF' -# Authentication module -def login_user(username, password): - """Handle user login with password validation""" - if validate_credentials(username, password): - create_session(username) - return True - return False - -def validate_credentials(username, password): - """Check username and password against database""" - # Database validation logic here - return check_password_hash(username, password) -EOF - - cat > "$sample_dir/search.py" << 'EOF' -# Search functionality -def semantic_search(query, documents): - """Perform semantic search across document collection""" - embeddings = generate_embeddings(query) - results = find_similar_documents(embeddings, documents) - return rank_results(results) - -def generate_embeddings(text): - """Generate vector embeddings for text""" - # Embedding generation logic - return process_with_model(text) -EOF - - echo "$sample_dir" -} +# Note: Sample project creation removed - now indexing real codebase/docs # Run quick test with sample data run_quick_test() { print_header "Quick Test" - print_info "Creating small sample project for testing..." - local sample_dir=$(create_sample_project) - echo "āœ… Sample project created: $sample_dir" + # Ask what to index: code vs docs + echo -e "${CYAN}What would you like to explore with FSS-Mini-RAG?${NC}" echo "" + echo -e "${GREEN}1) Code${NC} - Index the FSS-Mini-RAG codebase (~50 files)" + echo -e "${BLUE}2) Docs${NC} - Index the documentation (~10 files)" + echo "" + echo -n "Choose [1/2] or Enter for code: " + read -r index_choice + + # Determine what to index + local target_dir="$SCRIPT_DIR" + local target_name="FSS-Mini-RAG codebase" + if [[ "$index_choice" == "2" ]]; then + target_dir="$SCRIPT_DIR/docs" + target_name="FSS-Mini-RAG documentation" + fi # Ensure we're in the right directory and have the right permissions if [[ ! -f "./rag-mini" ]]; then @@ -726,32 +688,31 @@ run_quick_test() { chmod +x ./rag-mini fi - # Test with explicit error handling and timeout - print_info "Indexing sample project (should complete in ~5 seconds)..." - echo -e "${CYAN}Command: ./rag-mini index \"$sample_dir\" --quiet${NC}" + # Index the chosen target + print_info "Indexing $target_name..." + echo -e "${CYAN}This will take 10-30 seconds depending on your system${NC}" + echo "" - if timeout 30 ./rag-mini index "$sample_dir" --quiet; then - print_success "āœ… Indexing completed successfully" + if ./rag-mini index "$target_dir"; then + print_success "āœ… Indexing completed successfully!" echo "" - print_info "Testing search functionality..." - echo -e "${CYAN}Command: ./rag-mini search \"$sample_dir\" \"user authentication\" --limit 2${NC}" + print_info "šŸŽÆ Launching Interactive Tutorial..." + echo -e "${CYAN}The TUI has 6 sample questions to get you started.${NC}" + echo -e "${CYAN}Try the suggested queries or enter your own!${NC}" + echo "" + echo -n "Press Enter to start interactive tutorial: " + read -r - if timeout 15 ./rag-mini search "$sample_dir" "user authentication" --limit 2; then - echo "" - print_success "šŸŽ‰ Test completed successfully!" 
- echo -e "${CYAN}FSS-Mini-RAG is working perfectly!${NC}" - else - print_error "Search test failed or timed out" - echo "Indexing worked but search had issues." - fi + # Launch the TUI which has the existing interactive tutorial system + ./rag-tui.py "$target_dir" - # Clean up sample - print_info "Cleaning up test files..." - rm -rf "$sample_dir" + echo "" + print_success "šŸŽ‰ Tutorial completed!" + echo -e "${CYAN}FSS-Mini-RAG is working perfectly!${NC}" else - print_error "āŒ Indexing test failed or timed out" + print_error "āŒ Indexing failed" echo "" echo -e "${YELLOW}Possible causes:${NC}" echo "• Virtual environment not properly activated" @@ -759,8 +720,6 @@ run_quick_test() { echo "• Path issues (ensure script runs from project directory)" echo "• Ollama connection issues (if using Ollama)" echo "" - print_info "Cleaning up and continuing..." - rm -rf "$sample_dir" return 1 fi } diff --git a/mini_rag/cli.py b/mini_rag/cli.py index 6fe4a3b..cc4b353 100644 --- a/mini_rag/cli.py +++ b/mini_rag/cli.py @@ -52,6 +52,10 @@ def cli(verbose: bool, quiet: bool): A local RAG system for improving the development environment's grounding capabilities. Indexes your codebase and enables lightning-fast semantic search. """ + # Check virtual environment + from .venv_checker import check_and_warn_venv + check_and_warn_venv("rag-mini", force_exit=False) + if verbose: logging.getLogger().setLevel(logging.DEBUG) elif quiet: @@ -350,7 +354,12 @@ def debug_schema(path: str): return # Connect to database - import lancedb + try: + import lancedb + except ImportError: + console.print("[red]LanceDB not available. Install with: pip install lancedb pyarrow[/red]") + return + db = lancedb.connect(rag_dir) if "code_vectors" not in db.table_names(): diff --git a/mini_rag/config.py b/mini_rag/config.py index 85104ef..81926ad 100644 --- a/mini_rag/config.py +++ b/mini_rag/config.py @@ -63,7 +63,7 @@ class EmbeddingConfig: @dataclass class SearchConfig: """Configuration for search behavior.""" - default_limit: int = 10 + default_top_k: int = 10 enable_bm25: bool = True similarity_threshold: float = 0.1 expand_queries: bool = False # Enable automatic query expansion @@ -81,12 +81,33 @@ class LLMConfig: enable_thinking: bool = True # Enable thinking mode for Qwen3 models cpu_optimized: bool = True # Prefer lightweight models + # Model preference rankings (configurable) + model_rankings: list = None # Will be set in __post_init__ + # Provider-specific settings (for different LLM providers) provider: str = "ollama" # "ollama", "openai", "anthropic" ollama_host: str = "localhost:11434" # Ollama connection api_key: Optional[str] = None # API key for cloud providers api_base: Optional[str] = None # Base URL for API (e.g., OpenRouter) timeout: int = 20 # Request timeout in seconds + + def __post_init__(self): + if self.model_rankings is None: + # Default model preference rankings (can be overridden in config file) + self.model_rankings = [ + # Testing model (prioritized for current testing phase) + "qwen3:1.7b", + + # Ultra-efficient models (perfect for CPU-only systems) + "qwen3:0.6b", + + # Recommended model (excellent quality but larger) + "qwen3:4b", + + # Common fallbacks (only include models we know exist) + "llama3.2:1b", + "qwen2.5:1.5b", + ] @dataclass @@ -151,6 +172,8 @@ class ConfigManager: config.embedding = EmbeddingConfig(**data['embedding']) if 'search' in data: config.search = SearchConfig(**data['search']) + if 'llm' in data: + config.llm = LLMConfig(**data['llm']) return config @@ -219,7 +242,7 @@ class 
ConfigManager: "", "# Search behavior settings", "search:", - f" default_limit: {config_dict['search']['default_limit']} # Default number of results", + f" default_top_k: {config_dict['search']['default_top_k']} # Default number of top results", f" enable_bm25: {str(config_dict['search']['enable_bm25']).lower()} # Enable keyword matching boost", f" similarity_threshold: {config_dict['search']['similarity_threshold']} # Minimum similarity score", f" expand_queries: {str(config_dict['search']['expand_queries']).lower()} # Enable automatic query expansion", @@ -232,8 +255,16 @@ class ConfigManager: f" max_expansion_terms: {config_dict['llm']['max_expansion_terms']} # Maximum terms to add to queries", f" enable_synthesis: {str(config_dict['llm']['enable_synthesis']).lower()} # Enable synthesis by default", f" synthesis_temperature: {config_dict['llm']['synthesis_temperature']} # LLM temperature for analysis", + " model_rankings: # Preferred model order (edit to change priority)", ]) + # Add model rankings list + if 'model_rankings' in config_dict['llm'] and config_dict['llm']['model_rankings']: + for model in config_dict['llm']['model_rankings'][:10]: # Show first 10 + yaml_lines.append(f" - \"{model}\"") + if len(config_dict['llm']['model_rankings']) > 10: + yaml_lines.append(" # ... (edit config to see all options)") + return '\n'.join(yaml_lines) def update_config(self, **kwargs) -> RAGConfig: diff --git a/mini_rag/explorer.py b/mini_rag/explorer.py index b1f5fad..9e4c379 100644 --- a/mini_rag/explorer.py +++ b/mini_rag/explorer.py @@ -60,7 +60,8 @@ class CodeExplorer: self.synthesizer = LLMSynthesizer( ollama_url=f"http://{self.config.llm.ollama_host}", model=self.config.llm.synthesis_model, - enable_thinking=True # Always enable thinking in explore mode + enable_thinking=True, # Always enable thinking in explore mode + config=self.config # Pass config for model rankings ) # Session management @@ -69,12 +70,7 @@ class CodeExplorer: def start_exploration_session(self) -> bool: """Start a new exploration session.""" - # Check if we should restart the model for optimal thinking - model_restart_needed = self._check_model_restart_needed() - if model_restart_needed: - if not self._handle_model_restart(): - print("āš ļø Continuing with current model (quality may be reduced)") - + # Simple availability check - don't do complex model restart logic if not self.synthesizer.is_available(): print("āŒ LLM service unavailable. Please check Ollama is running.") return False @@ -87,17 +83,8 @@ class CodeExplorer: started_at=time.time() ) - print("🧠 EXPLORATION MODE STARTED") - print("=" * 50) + print("🧠 Exploration Mode Started") print(f"Project: {self.project_path.name}") - print(f"Session: {session_id}") - print("\nšŸŽÆ This mode uses thinking and remembers context.") - print(" Perfect for debugging, learning, and deep exploration.") - print("\nšŸ’” Tips:") - print(" • Ask follow-up questions - I'll remember our conversation") - print(" • Use 'why', 'how', 'explain' for detailed reasoning") - print(" • Type 'quit' or 'exit' to end session") - print("\n" + "=" * 50) return True @@ -110,7 +97,7 @@ class CodeExplorer: search_start = time.time() results = self.searcher.search( question, - limit=context_limit, + top_k=context_limit, include_context=True, semantic_weight=0.7, bm25_weight=0.3 @@ -166,56 +153,82 @@ Content: {content[:800]}{'...' 
if len(content) > 800 else ''} results_text = "\n".join(results_context) - # Create comprehensive exploration prompt - prompt = f"""You are a senior software engineer helping explore and debug code. You have access to thinking mode and conversation context. + # Create comprehensive exploration prompt with thinking + prompt = f""" +The user asked: "{question}" + +Let me analyze what they're asking and look at the information I have available. + +From the search results, I can see relevant information about: +{results_text[:500]}... + +I should think about: +1. What the user is trying to understand or accomplish +2. What information from the search results is most relevant +3. How to explain this in a clear, educational way +4. What practical next steps would be helpful + +Based on our conversation so far: {context_summary} + +Let me create a helpful response that breaks this down clearly and gives them actionable guidance. + + +You're a helpful assistant exploring a project with someone. You're good at breaking down complex topics into understandable pieces and explaining things clearly. PROJECT: {self.project_path.name} -CONVERSATION CONTEXT: +PREVIOUS CONVERSATION: {context_summary} CURRENT QUESTION: "{question}" -SEARCH RESULTS: +RELEVANT INFORMATION FOUND: {results_text} -Please provide a detailed analysis in JSON format. Think through the problem carefully and consider the conversation context: +Please provide a helpful analysis in JSON format: {{ - "summary": "2-3 sentences explaining what you found and how it relates to the question", + "summary": "Clear explanation of what you found and how it answers their question", "key_points": [ - "Important insight 1 (reference specific code/files)", - "Important insight 2 (explain relationships)", - "Important insight 3 (consider conversation context)" + "Most important insight from the information", + "Secondary important point or relationship", + "Third key point or practical consideration" ], "code_examples": [ - "Relevant code snippet or pattern with explanation", - "Another important code example with context" + "Relevant example or pattern from the information", + "Another useful example or demonstration" ], "suggested_actions": [ - "Specific next step the developer should take", - "Follow-up investigation or debugging approach", - "Potential improvements or fixes" + "Specific next step they could take", + "Additional exploration or investigation suggestion", + "Practical way to apply this information" ], "confidence": 0.85 }} -Focus on: -- Deep technical analysis with reasoning -- How this connects to previous questions in our conversation -- Practical debugging/learning insights -- Specific code references and explanations -- Clear next steps for the developer - -Think carefully about the relationships between code components and how they answer the question in context.""" - +Guidelines: +- Be educational and break things down clearly +- Reference specific files and information when helpful +- Give practical, actionable suggestions +- Keep explanations beginner-friendly but not condescending +- Connect information to their question directly +""" + return prompt def _synthesize_with_context(self, prompt: str, results: List[Any]) -> SynthesisResult: """Synthesize results with full context and thinking.""" try: - # Use thinking-enabled synthesis with lower temperature for exploration - response = self.synthesizer._call_ollama(prompt, temperature=0.2) + # TEMPORARILY: Use simple non-streaming call to avoid flow issues + # TODO: Re-enable 
streaming once flow is stable + response = self.synthesizer._call_ollama(prompt, temperature=0.2, disable_thinking=False) + thinking_stream = "" + + # Display simple thinking indicator + if response and len(response) > 200: + print("\nšŸ’­ Analysis in progress...") + + # Don't display thinking stream again - keeping it simple for now if not response: return SynthesisResult( @@ -422,6 +435,196 @@ Think carefully about the relationships between code components and how they ans except EOFError: print("\nšŸ“ Continuing with current model...") return False + + def _call_ollama_with_thinking(self, prompt: str, temperature: float = 0.3) -> tuple: + """Call Ollama with streaming for fast time-to-first-token.""" + import requests + import json + + try: + # Use the synthesizer's model and connection + model_to_use = self.synthesizer.model + if self.synthesizer.model not in self.synthesizer.available_models: + if self.synthesizer.available_models: + model_to_use = self.synthesizer.available_models[0] + else: + return None, None + + # Enable thinking by NOT adding + final_prompt = prompt + + # Get optimal parameters for this model + from .llm_optimization import get_optimal_ollama_parameters + optimal_params = get_optimal_ollama_parameters(model_to_use) + + payload = { + "model": model_to_use, + "prompt": final_prompt, + "stream": True, # Enable streaming for fast response + "options": { + "temperature": temperature, + "top_p": optimal_params.get("top_p", 0.9), + "top_k": optimal_params.get("top_k", 40), + "num_ctx": optimal_params.get("num_ctx", 32768), + "num_predict": optimal_params.get("num_predict", 2000), + "repeat_penalty": optimal_params.get("repeat_penalty", 1.1), + "presence_penalty": optimal_params.get("presence_penalty", 1.0) + } + } + + response = requests.post( + f"{self.synthesizer.ollama_url}/api/generate", + json=payload, + stream=True, + timeout=65 + ) + + if response.status_code == 200: + # Collect streaming response + raw_response = "" + thinking_displayed = False + + for line in response.iter_lines(): + if line: + try: + chunk_data = json.loads(line.decode('utf-8')) + chunk_text = chunk_data.get('response', '') + + if chunk_text: + raw_response += chunk_text + + # Display thinking stream as it comes in + if not thinking_displayed and '' in raw_response: + # Start displaying thinking + self._start_thinking_display() + thinking_displayed = True + + if thinking_displayed: + self._stream_thinking_chunk(chunk_text) + + if chunk_data.get('done', False): + break + + except json.JSONDecodeError: + continue + + # Finish thinking display if it was shown + if thinking_displayed: + self._end_thinking_display() + + # Extract thinking stream and final response + thinking_stream, final_response = self._extract_thinking(raw_response) + + return final_response, thinking_stream + else: + return None, None + + except Exception as e: + logger.error(f"Thinking-enabled Ollama call failed: {e}") + return None, None + + def _extract_thinking(self, raw_response: str) -> tuple: + """Extract thinking content from response.""" + thinking_stream = "" + final_response = raw_response + + # Look for thinking patterns + if "" in raw_response and "" in raw_response: + # Extract thinking content between tags + start_tag = raw_response.find("") + end_tag = raw_response.find("") + len("") + + if start_tag != -1 and end_tag != -1: + thinking_content = raw_response[start_tag + 7:end_tag - 8] # Remove tags + thinking_stream = thinking_content.strip() + + # Remove thinking from final response + final_response = 
(raw_response[:start_tag] + raw_response[end_tag:]).strip() + + # Alternative patterns for models that use different thinking formats + elif "Let me think" in raw_response or "I need to analyze" in raw_response: + # Simple heuristic: first paragraph might be thinking + lines = raw_response.split('\n') + potential_thinking = [] + final_lines = [] + + thinking_indicators = ["Let me think", "I need to", "First, I'll", "Looking at", "Analyzing"] + in_thinking = False + + for line in lines: + if any(indicator in line for indicator in thinking_indicators): + in_thinking = True + potential_thinking.append(line) + elif in_thinking and (line.startswith('{') or line.startswith('**') or line.startswith('#')): + # Likely end of thinking, start of structured response + in_thinking = False + final_lines.append(line) + elif in_thinking: + potential_thinking.append(line) + else: + final_lines.append(line) + + if potential_thinking: + thinking_stream = '\n'.join(potential_thinking).strip() + final_response = '\n'.join(final_lines).strip() + + return thinking_stream, final_response + + def _start_thinking_display(self): + """Start the thinking stream display.""" + print("\n\033[2m\033[3mšŸ’­ AI Thinking:\033[0m") + print("\033[2m\033[3m" + "─" * 40 + "\033[0m") + self._thinking_buffer = "" + self._in_thinking_tags = False + + def _stream_thinking_chunk(self, chunk: str): + """Stream a chunk of thinking as it arrives.""" + import sys + + self._thinking_buffer += chunk + + # Check if we're in thinking tags + if '' in self._thinking_buffer and not self._in_thinking_tags: + self._in_thinking_tags = True + # Display everything after + start_idx = self._thinking_buffer.find('') + 7 + thinking_content = self._thinking_buffer[start_idx:] + if thinking_content: + print(f"\033[2m\033[3m{thinking_content}\033[0m", end='', flush=True) + elif self._in_thinking_tags and '' not in chunk: + # We're in thinking mode, display the chunk + print(f"\033[2m\033[3m{chunk}\033[0m", end='', flush=True) + elif '' in self._thinking_buffer: + # End of thinking + self._in_thinking_tags = False + + def _end_thinking_display(self): + """End the thinking stream display.""" + print(f"\n\033[2m\033[3m" + "─" * 40 + "\033[0m") + print() + + def _display_thinking_stream(self, thinking_stream: str): + """Display thinking stream in light gray and italic (fallback for non-streaming).""" + if not thinking_stream: + return + + print("\n\033[2m\033[3mšŸ’­ AI Thinking:\033[0m") + print("\033[2m\033[3m" + "─" * 40 + "\033[0m") + + # Split into paragraphs and display with proper formatting + paragraphs = thinking_stream.split('\n\n') + for para in paragraphs: + if para.strip(): + # Wrap long lines nicely + lines = para.strip().split('\n') + for line in lines: + if line.strip(): + # Light gray and italic + print(f"\033[2m\033[3m{line}\033[0m") + print() # Paragraph spacing + + print("\033[2m\033[3m" + "─" * 40 + "\033[0m") + print() # Quick test function def test_explorer(): diff --git a/mini_rag/fast_server.py b/mini_rag/fast_server.py index b637250..940e9df 100644 --- a/mini_rag/fast_server.py +++ b/mini_rag/fast_server.py @@ -218,6 +218,11 @@ class FastRAGServer: # Quick file count check try: import lancedb + except ImportError: + # If LanceDB not available, assume index is empty and needs creation + return True + + try: db = lancedb.connect(rag_dir) if 'code_vectors' not in db.table_names(): return True diff --git a/mini_rag/indexer.py b/mini_rag/indexer.py index 4462aed..8cfa580 100644 --- a/mini_rag/indexer.py +++ b/mini_rag/indexer.py @@ -12,12 
+12,20 @@ from typing import List, Dict, Any, Optional, Set, Tuple from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime import numpy as np -import lancedb import pandas as pd -import pyarrow as pa from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeRemainingColumn from rich.console import Console +# Optional LanceDB import +try: + import lancedb + import pyarrow as pa + LANCEDB_AVAILABLE = True +except ImportError: + lancedb = None + pa = None + LANCEDB_AVAILABLE = False + from .ollama_embeddings import OllamaEmbedder as CodeEmbedder from .chunker import CodeChunker, CodeChunk from .path_handler import normalize_path, normalize_relative_path @@ -163,7 +171,7 @@ class ProjectIndexer: "skip_binary": True }, "search": { - "default_limit": 10, + "default_top_k": 10, "similarity_threshold": 0.7, "hybrid_search": True, "bm25_weight": 0.3 @@ -526,6 +534,11 @@ class ProjectIndexer: def _init_database(self): """Initialize LanceDB connection and table.""" + if not LANCEDB_AVAILABLE: + logger.error("LanceDB is not available. Please install LanceDB for full indexing functionality.") + logger.info("For Ollama-only mode, consider using hash-based embeddings instead.") + raise ImportError("LanceDB dependency is required for indexing. Install with: pip install lancedb pyarrow") + try: self.db = lancedb.connect(self.rag_dir) diff --git a/mini_rag/llm_safeguards.py b/mini_rag/llm_safeguards.py index f6fa474..eb0f8f2 100644 --- a/mini_rag/llm_safeguards.py +++ b/mini_rag/llm_safeguards.py @@ -16,12 +16,12 @@ logger = logging.getLogger(__name__) @dataclass class SafeguardConfig: - """Configuration for LLM safeguards.""" - max_output_tokens: int = 2000 # Prevent excessive generation - max_repetition_ratio: float = 0.3 # Max ratio of repeated content - max_response_time: int = 60 # Max seconds for response - min_useful_length: int = 20 # Minimum useful response length - context_window: int = 32768 # Ollama context window + """Configuration for LLM safeguards - gentle and educational.""" + max_output_tokens: int = 4000 # Allow longer responses for learning + max_repetition_ratio: float = 0.7 # Be very permissive - only catch extreme repetition + max_response_time: int = 120 # Allow 2 minutes for complex thinking + min_useful_length: int = 10 # Lower threshold - short answers can be useful + context_window: int = 32000 # Match Qwen3 context length (32K token limit) enable_thinking_detection: bool = True # Detect thinking patterns class ModelRunawayDetector: @@ -98,8 +98,19 @@ class ModelRunawayDetector: if self.response_patterns['phrase_repetition'].search(response): return "phrase_repetition" - # Calculate repetition ratio - words = response.split() + # Calculate repetition ratio (excluding Qwen3 thinking blocks) + analysis_text = response + if "" in response and "" in response: + # Extract only the actual response (after thinking) for repetition analysis + thinking_end = response.find("") + if thinking_end != -1: + analysis_text = response[thinking_end + 8:].strip() + + # If the actual response (excluding thinking) is short, don't penalize + if len(analysis_text.split()) < 20: + return None + + words = analysis_text.split() if len(words) > 10: unique_words = set(words) repetition_ratio = 1 - (len(unique_words) / len(words)) diff --git a/mini_rag/llm_synthesizer.py b/mini_rag/llm_synthesizer.py index f0f1c39..0dcda93 100644 --- a/mini_rag/llm_synthesizer.py +++ b/mini_rag/llm_synthesizer.py @@ -36,12 +36,13 @@ class SynthesisResult: class 
LLMSynthesizer: """Synthesizes RAG search results using Ollama LLMs.""" - def __init__(self, ollama_url: str = "http://localhost:11434", model: str = None, enable_thinking: bool = False): + def __init__(self, ollama_url: str = "http://localhost:11434", model: str = None, enable_thinking: bool = False, config=None): self.ollama_url = ollama_url.rstrip('/') self.available_models = [] self.model = model self.enable_thinking = enable_thinking # Default False for synthesis mode self._initialized = False + self.config = config # For accessing model rankings # Initialize safeguards if ModelRunawayDetector: @@ -61,60 +62,36 @@ class LLMSynthesizer: return [] def _select_best_model(self) -> str: - """Select the best available model based on modern performance rankings.""" + """Select the best available model based on configuration rankings.""" if not self.available_models: return "qwen2.5:1.5b" # Fallback preference - # Modern model preference ranking (CPU-friendly first) - # Prioritize: Ultra-efficient > Standard efficient > Larger models - model_rankings = [ - # Recommended model (excellent quality) - "qwen3:4b", - - # Ultra-efficient models (perfect for CPU-only systems) - "qwen3:0.6b", "qwen3:1.7b", "llama3.2:1b", - - # Standard efficient models - "qwen2.5:1.5b", "qwen3:3b", - - # Qwen2.5 models (excellent performance/size ratio) - "qwen2.5-coder:1.5b", "qwen2.5:1.5b", "qwen2.5:3b", "qwen2.5-coder:3b", - "qwen2.5:7b", "qwen2.5-coder:7b", - - # Qwen2 models (older but still good) - "qwen2:1.5b", "qwen2:3b", "qwen2:7b", - - # Mistral models (good quality, reasonable size) - "mistral:7b", "mistral-nemo", "mistral-small", - - # Llama3.2 models (decent but larger) - "llama3.2:1b", "llama3.2:3b", "llama3.2", "llama3.2:8b", - - # Fallback to other Llama models - "llama3.1:8b", "llama3:8b", "llama3", - - # Other decent models - "gemma2:2b", "gemma2:9b", "phi3:3.8b", "phi3.5", - ] + # Get model rankings from config or use defaults + if self.config and hasattr(self.config, 'llm') and hasattr(self.config.llm, 'model_rankings'): + model_rankings = self.config.llm.model_rankings + else: + # Fallback rankings if no config + model_rankings = [ + "qwen3:1.7b", "qwen3:0.6b", "qwen3:4b", "llama3.2:1b", + "qwen2.5:1.5b", "qwen3:3b", "qwen2.5-coder:1.5b" + ] - # Find first available model from our ranked list + # Find first available model from our ranked list (exact matches first) for preferred_model in model_rankings: for available_model in self.available_models: - # Match model names (handle version tags) - available_base = available_model.split(':')[0].lower() - preferred_base = preferred_model.split(':')[0].lower() + # Exact match first (e.g., "qwen3:1.7b" matches "qwen3:1.7b") + if preferred_model.lower() == available_model.lower(): + logger.info(f"Selected exact match model: {available_model}") + return available_model - if preferred_base in available_base or available_base in preferred_base: - # Additional size filtering - prefer smaller models - if any(size in available_model.lower() for size in ['1b', '1.5b', '2b', '3b']): - logger.info(f"Selected efficient model: {available_model}") - return available_model - elif any(size in available_model.lower() for size in ['7b', '8b']): - # Only use larger models if no smaller ones available - logger.info(f"Selected larger model: {available_model}") - return available_model - elif ':' not in available_model: - # Handle models without explicit size tags + # Partial match with version handling (e.g., "qwen3:1.7b" matches "qwen3:1.7b-q8_0") + preferred_parts = 
preferred_model.lower().split(':') + available_parts = available_model.lower().split(':') + + if len(preferred_parts) >= 2 and len(available_parts) >= 2: + if (preferred_parts[0] == available_parts[0] and + preferred_parts[1] in available_parts[1]): + logger.info(f"Selected version match model: {available_model}") return available_model # If no preferred models found, use first available @@ -132,12 +109,8 @@ class LLMSynthesizer: if not self.model: self.model = self._select_best_model() - # Warm up LLM with minimal request (ignores response) - if self.available_models: - try: - self._call_ollama("testing, just say 'hi'", temperature=0.1, disable_thinking=True) - except: - pass # Warmup failure is non-critical + # Skip warmup - models are fast enough and warmup causes delays + # Warmup removed to eliminate startup delays and unwanted model calls self._initialized = True @@ -146,7 +119,7 @@ class LLMSynthesizer: self._ensure_initialized() return len(self.available_models) > 0 - def _call_ollama(self, prompt: str, temperature: float = 0.3, disable_thinking: bool = False) -> Optional[str]: + def _call_ollama(self, prompt: str, temperature: float = 0.3, disable_thinking: bool = False, use_streaming: bool = False) -> Optional[str]: """Make a call to Ollama API with safeguards.""" start_time = time.time() @@ -163,28 +136,55 @@ class LLMSynthesizer: # Handle thinking mode for Qwen3 models final_prompt = prompt - if not self.enable_thinking or disable_thinking: + use_thinking = self.enable_thinking and not disable_thinking + + # For non-thinking mode, add tag for Qwen3 + if not use_thinking and "qwen3" in model_to_use.lower(): if not final_prompt.endswith(" "): final_prompt += " " - # Get optimal parameters for this model + # Get optimal parameters for this model optimal_params = get_optimal_ollama_parameters(model_to_use) + # Qwen3-specific optimal parameters based on research + if "qwen3" in model_to_use.lower(): + if use_thinking: + # Thinking mode: Temperature=0.6, TopP=0.95, TopK=20, PresencePenalty=1.5 + qwen3_temp = 0.6 + qwen3_top_p = 0.95 + qwen3_top_k = 20 + qwen3_presence = 1.5 + else: + # Non-thinking mode: Temperature=0.7, TopP=0.8, TopK=20, PresencePenalty=1.5 + qwen3_temp = 0.7 + qwen3_top_p = 0.8 + qwen3_top_k = 20 + qwen3_presence = 1.5 + else: + qwen3_temp = temperature + qwen3_top_p = optimal_params.get("top_p", 0.9) + qwen3_top_k = optimal_params.get("top_k", 40) + qwen3_presence = optimal_params.get("presence_penalty", 1.0) + payload = { "model": model_to_use, "prompt": final_prompt, - "stream": False, + "stream": use_streaming, "options": { - "temperature": temperature, - "top_p": optimal_params.get("top_p", 0.9), - "top_k": optimal_params.get("top_k", 40), - "num_ctx": optimal_params.get("num_ctx", 32768), + "temperature": qwen3_temp, + "top_p": qwen3_top_p, + "top_k": qwen3_top_k, + "num_ctx": 32000, # Critical: Qwen3 context length (32K token limit) "num_predict": optimal_params.get("num_predict", 2000), "repeat_penalty": optimal_params.get("repeat_penalty", 1.1), - "presence_penalty": optimal_params.get("presence_penalty", 1.0) + "presence_penalty": qwen3_presence } } + # Handle streaming with early stopping + if use_streaming: + return self._handle_streaming_with_early_stop(payload, model_to_use, use_thinking, start_time) + response = requests.post( f"{self.ollama_url}/api/generate", json=payload, @@ -193,8 +193,19 @@ class LLMSynthesizer: if response.status_code == 200: result = response.json() + + # All models use standard response format + # Qwen3 thinking tokens are 
embedded in the response content itself as ... raw_response = result.get('response', '').strip() + # Log thinking content for Qwen3 debugging + if "qwen3" in model_to_use.lower() and use_thinking and "" in raw_response: + thinking_start = raw_response.find("") + thinking_end = raw_response.find("") + if thinking_start != -1 and thinking_end != -1: + thinking_content = raw_response[thinking_start+7:thinking_end] + logger.info(f"Qwen3 thinking: {thinking_content[:100]}...") + # Apply safeguards to check response quality if self.safeguard_detector and raw_response: is_valid, issue_type, explanation = self.safeguard_detector.check_response_quality( @@ -203,8 +214,8 @@ class LLMSynthesizer: if not is_valid: logger.warning(f"Safeguard triggered: {issue_type}") - # Return a safe explanation instead of the problematic response - return self._create_safeguard_response(issue_type, explanation, prompt) + # Preserve original response but add safeguard warning + return self._create_safeguard_response_with_content(issue_type, explanation, raw_response) return raw_response else: @@ -232,6 +243,119 @@ class LLMSynthesizer: 4. **Different approach**: Try synthesis mode: `--synthesize` for simpler responses This is normal with smaller AI models and helps ensure you get quality responses.""" + + def _create_safeguard_response_with_content(self, issue_type: str, explanation: str, original_response: str) -> str: + """Create a response that preserves the original content but adds a safeguard warning.""" + + # For Qwen3, extract the actual response (after thinking) + actual_response = original_response + if "" in original_response and "" in original_response: + thinking_end = original_response.find("") + if thinking_end != -1: + actual_response = original_response[thinking_end + 8:].strip() + + # If we have useful content, preserve it with a warning + if len(actual_response.strip()) > 20: + return f"""āš ļø **Response Quality Warning** ({issue_type}) + +{explanation} + +--- + +**AI Response (use with caution):** + +{actual_response} + +--- + +šŸ’” **Note**: This response may have quality issues. Consider rephrasing your question or trying exploration mode for better results.""" + else: + # If content is too short or problematic, use the original safeguard response + return f"""āš ļø Model Response Issue Detected + +{explanation} + +**What happened:** The AI model encountered a common issue with small language models. + +**Your options:** +1. **Try again**: Ask the same question (often resolves itself) +2. **Rephrase**: Make your question more specific or break it into parts +3. 
**Use exploration mode**: `rag-mini explore` for complex questions + +This is normal with smaller AI models and helps ensure you get quality responses.""" + + def _handle_streaming_with_early_stop(self, payload: dict, model_name: str, use_thinking: bool, start_time: float) -> Optional[str]: + """Handle streaming response with intelligent early stopping.""" + import json + + try: + response = requests.post( + f"{self.ollama_url}/api/generate", + json=payload, + stream=True, + timeout=65 + ) + + if response.status_code != 200: + logger.error(f"Ollama API error: {response.status_code}") + return None + + full_response = "" + word_buffer = [] + repetition_window = 30 # Check last 30 words for repetition (more context) + stop_threshold = 0.8 # Stop only if 80% of recent words are repetitive (very permissive) + min_response_length = 100 # Don't early stop until we have at least 100 chars + + for line in response.iter_lines(): + if line: + try: + chunk_data = json.loads(line.decode('utf-8')) + chunk_text = chunk_data.get('response', '') + + if chunk_text: + full_response += chunk_text + + # Add words to buffer for repetition detection + new_words = chunk_text.split() + word_buffer.extend(new_words) + + # Keep only recent words in buffer + if len(word_buffer) > repetition_window: + word_buffer = word_buffer[-repetition_window:] + + # Check for repetition patterns after we have enough words AND content + if len(word_buffer) >= repetition_window and len(full_response) >= min_response_length: + unique_words = set(word_buffer) + repetition_ratio = 1 - (len(unique_words) / len(word_buffer)) + + # Early stop only if repetition is EXTREMELY high (80%+) + if repetition_ratio > stop_threshold: + logger.info(f"Early stopping due to repetition: {repetition_ratio:.2f}") + + # Add a gentle completion to the response + if not full_response.strip().endswith(('.', '!', '?')): + full_response += "..." 
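+ # Note: the "stop" POST below is only a best-effort attempt; an Ollama
+ # stream typically ends when the client stops reading or closes the
+ # connection, so a failed stop request still leaves us with the partial
+ # response collected above.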
+ + # Send stop signal to model (attempt to gracefully stop) + try: + stop_payload = {"model": model_name, "stop": True} + requests.post(f"{self.ollama_url}/api/generate", json=stop_payload, timeout=2) + except: + pass # If stop fails, we already have partial response + + break + + if chunk_data.get('done', False): + break + + except json.JSONDecodeError: + continue + + return full_response.strip() + + except Exception as e: + logger.error(f"Streaming with early stop failed: {e}") + return None def synthesize_search_results(self, query: str, results: List[Any], project_path: Path) -> SynthesisResult: """Synthesize search results into a coherent summary.""" diff --git a/mini_rag/query_expander.py b/mini_rag/query_expander.py index b092bba..c2a8e44 100644 --- a/mini_rag/query_expander.py +++ b/mini_rag/query_expander.py @@ -59,23 +59,8 @@ class QueryExpander: if self._initialized: return - # Warm up LLM if enabled and available - if self.enabled: - try: - model = self._select_expansion_model() - if model: - requests.post( - f"{self.ollama_url}/api/generate", - json={ - "model": model, - "prompt": "testing, just say 'hi' ", - "stream": False, - "options": {"temperature": 0.1, "max_tokens": 5} - }, - timeout=5 - ) - except: - pass # Warmup failure is non-critical + # Skip warmup - causes startup delays and unwanted model calls + # Query expansion works fine on first use without warmup self._initialized = True @@ -183,10 +168,10 @@ Expanded query:""" data = response.json() available = [model['name'] for model in data.get('models', [])] - # Prefer ultra-fast, efficient models for query expansion (CPU-friendly) + # Use same model rankings as main synthesizer for consistency expansion_preferences = [ - "qwen3:0.6b", "qwen3:1.7b", "qwen2.5:1.5b", - "llama3.2:1b", "gemma2:2b", "llama3.2:3b" + "qwen3:1.7b", "qwen3:0.6b", "qwen3:4b", "llama3.2:1b", + "qwen2.5:1.5b", "qwen3:3b", "qwen2.5-coder:1.5b" ] for preferred in expansion_preferences: diff --git a/mini_rag/search.py b/mini_rag/search.py index 0144aca..1823fab 100644 --- a/mini_rag/search.py +++ b/mini_rag/search.py @@ -8,13 +8,20 @@ from pathlib import Path from typing import List, Dict, Any, Optional, Tuple import numpy as np import pandas as pd -import lancedb from rich.console import Console from rich.table import Table from rich.syntax import Syntax from rank_bm25 import BM25Okapi from collections import defaultdict +# Optional LanceDB import +try: + import lancedb + LANCEDB_AVAILABLE = True +except ImportError: + lancedb = None + LANCEDB_AVAILABLE = False + from .ollama_embeddings import OllamaEmbedder as CodeEmbedder from .path_handler import display_path from .query_expander import QueryExpander @@ -115,6 +122,14 @@ class CodeSearcher: def _connect(self): """Connect to the LanceDB database.""" + if not LANCEDB_AVAILABLE: + print("āŒ LanceDB Not Available") + print(" LanceDB is required for search functionality") + print(" Install it with: pip install lancedb pyarrow") + print(" For basic Ollama functionality, use hash-based search instead") + print() + raise ImportError("LanceDB dependency is required for search. 
Install with: pip install lancedb pyarrow") + try: if not self.rag_dir.exists(): print("šŸ—ƒļø No Search Index Found") diff --git a/mini_rag/venv_checker.py b/mini_rag/venv_checker.py new file mode 100644 index 0000000..492303d --- /dev/null +++ b/mini_rag/venv_checker.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 +""" +Virtual Environment Checker +Ensures scripts run in proper Python virtual environment for consistency and safety. +""" + +import sys +import os +import sysconfig +from pathlib import Path + +def is_in_virtualenv() -> bool: + """Check if we're running in a virtual environment.""" + # Check for virtual environment indicators + return ( + hasattr(sys, 'real_prefix') or # virtualenv + (hasattr(sys, 'base_prefix') and sys.base_prefix != sys.prefix) or # venv/pyvenv + os.environ.get('VIRTUAL_ENV') is not None # Environment variable + ) + +def get_expected_venv_path() -> Path: + """Get the expected virtual environment path for this project.""" + # Assume .venv in the same directory as the script + script_dir = Path(__file__).parent.parent + return script_dir / '.venv' + +def check_correct_venv() -> tuple[bool, str]: + """ + Check if we're in the correct virtual environment. + + Returns: + (is_correct, message) + """ + if not is_in_virtualenv(): + return False, "not in virtual environment" + + expected_venv = get_expected_venv_path() + if not expected_venv.exists(): + return False, "expected virtual environment not found" + + current_venv = os.environ.get('VIRTUAL_ENV') + if current_venv: + current_venv_path = Path(current_venv).resolve() + expected_venv_path = expected_venv.resolve() + + if current_venv_path != expected_venv_path: + return False, f"wrong virtual environment (using {current_venv_path}, expected {expected_venv_path})" + + return True, "correct virtual environment" + +def show_venv_warning(script_name: str = "script") -> None: + """Show virtual environment warning with helpful instructions.""" + expected_venv = get_expected_venv_path() + + print("āš ļø VIRTUAL ENVIRONMENT WARNING") + print("=" * 50) + print() + print(f"This {script_name} should be run in a Python virtual environment for:") + print(" • Consistent dependencies") + print(" • Isolated package versions") + print(" • Proper security isolation") + print(" • Reliable functionality") + print() + + if expected_venv.exists(): + print("āœ… Virtual environment found!") + print(f" Location: {expected_venv}") + print() + print("šŸš€ To activate it:") + print(f" source {expected_venv}/bin/activate") + print(f" {script_name}") + print() + print("šŸ”„ Or run with activation:") + print(f" source {expected_venv}/bin/activate && {script_name}") + else: + print("āŒ No virtual environment found!") + print() + print("šŸ› ļø Create one first:") + print(" ./install_mini_rag.sh") + print() + print("šŸ“š Or manually:") + print(f" python3 -m venv {expected_venv}") + print(f" source {expected_venv}/bin/activate") + print(" pip install -r requirements.txt") + + print() + print("šŸ’” Why this matters:") + print(" Without a virtual environment, you may experience:") + print(" • Import errors from missing packages") + print(" • Version conflicts with system Python") + print(" • Inconsistent behavior across systems") + print(" • Potential system-wide package pollution") + print() + +def check_and_warn_venv(script_name: str = "script", force_exit: bool = False) -> bool: + """ + Check virtual environment and warn if needed. 
+ + Args: + script_name: Name of the script for user-friendly messages + force_exit: Whether to exit if not in correct venv + + Returns: + True if in correct venv, False otherwise + """ + is_correct, message = check_correct_venv() + + if not is_correct: + show_venv_warning(script_name) + + if force_exit: + print(f"ā›” Exiting {script_name} for your safety.") + print(" Please activate the virtual environment and try again.") + sys.exit(1) + else: + print(f"āš ļø Continuing anyway, but {script_name} may not work correctly...") + print() + return False + + return True + +def require_venv(script_name: str = "script") -> None: + """Require virtual environment or exit.""" + check_and_warn_venv(script_name, force_exit=True) + +# Quick test function +def main(): + """Test the virtual environment checker.""" + print("🧪 Virtual Environment Checker Test") + print("=" * 40) + + print(f"In virtual environment: {is_in_virtualenv()}") + print(f"Expected venv path: {get_expected_venv_path()}") + + is_correct, message = check_correct_venv() + print(f"Correct venv: {is_correct} ({message})") + + if not is_correct: + show_venv_warning("test script") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/rag-mini b/rag-mini index e6862d9..73be737 100755 --- a/rag-mini +++ b/rag-mini @@ -112,6 +112,7 @@ show_help() { echo -e "${BOLD}Main Commands:${NC}" echo " rag-mini index # Index project for search" echo " rag-mini search # Search indexed project" + echo " rag-mini explore # Interactive exploration with AI" echo " rag-mini status # Show project status" echo "" echo -e "${BOLD}Interfaces:${NC}" @@ -324,9 +325,9 @@ main() { "server") # Start server mode shift - exec "$PYTHON" "$SCRIPT_DIR/claude_rag/server.py" "$@" + exec "$PYTHON" "$SCRIPT_DIR/mini_rag/fast_server.py" "$@" ;; - "index"|"search"|"status") + "index"|"search"|"explore"|"status") # Direct CLI commands - call Python script exec "$PYTHON" "$SCRIPT_DIR/rag-mini.py" "$@" ;; diff --git a/rag-mini.py b/rag-mini.py index 04096d2..4d7451e 100644 --- a/rag-mini.py +++ b/rag-mini.py @@ -118,7 +118,7 @@ def index_project(project_path: Path, force: bool = False): print(" Or see: docs/TROUBLESHOOTING.md") sys.exit(1) -def search_project(project_path: Path, query: str, limit: int = 10, synthesize: bool = False): +def search_project(project_path: Path, query: str, top_k: int = 10, synthesize: bool = False): """Search a project directory.""" try: # Check if indexed first @@ -130,7 +130,7 @@ def search_project(project_path: Path, query: str, limit: int = 10, synthesize: print(f"šŸ” Searching \"{query}\" in {project_path.name}") searcher = CodeSearcher(project_path) - results = searcher.search(query, top_k=limit) + results = searcher.search(query, top_k=top_k) if not results: print("āŒ No results found") @@ -143,7 +143,7 @@ def search_project(project_path: Path, query: str, limit: int = 10, synthesize: print() print("āš™ļø Configuration adjustments:") print(f" • Lower threshold: ./rag-mini search {project_path} \"{query}\" --threshold 0.05") - print(" • More results: add --limit 20") + print(" • More results: add --top-k 20") print() print("šŸ“š Need help? 
See: docs/TROUBLESHOOTING.md") return @@ -310,14 +310,14 @@ def status_check(project_path: Path): sys.exit(1) def explore_interactive(project_path: Path): - """Interactive exploration mode with thinking and context memory.""" + """Interactive exploration mode with thinking and context memory for any documents.""" try: explorer = CodeExplorer(project_path) if not explorer.start_exploration_session(): sys.exit(1) - print("\nšŸ¤” Ask your first question about the codebase:") + print(f"\nšŸ¤” Ask your first question about {project_path.name}:") while True: try: @@ -357,7 +357,8 @@ def explore_interactive(project_path: Path): continue # Process the question - print("\nšŸ” Analyzing...") + print(f"\nšŸ” Searching {project_path.name}...") + print("🧠 Thinking with AI model...") response = explorer.explore_question(question) if response: @@ -382,6 +383,13 @@ def explore_interactive(project_path: Path): def main(): """Main CLI interface.""" + # Check virtual environment + try: + from mini_rag.venv_checker import check_and_warn_venv + check_and_warn_venv("rag-mini.py", force_exit=False) + except ImportError: + pass # If venv checker can't be imported, continue anyway + parser = argparse.ArgumentParser( description="FSS-Mini-RAG - Lightweight semantic code search", formatter_class=argparse.RawDescriptionHelpFormatter, @@ -403,8 +411,8 @@ Examples: help='Search query (for search command)') parser.add_argument('--force', action='store_true', help='Force reindex all files') - parser.add_argument('--limit', type=int, default=10, - help='Maximum number of search results') + parser.add_argument('--top-k', '--limit', type=int, default=10, dest='top_k', + help='Maximum number of search results (top-k)') parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose logging') parser.add_argument('--synthesize', '-s', action='store_true', @@ -432,7 +440,7 @@ Examples: if not args.query: print("āŒ Search query required") sys.exit(1) - search_project(args.project_path, args.query, args.limit, args.synthesize) + search_project(args.project_path, args.query, args.top_k, args.synthesize) elif args.command == 'explore': explore_interactive(args.project_path) elif args.command == 'status': diff --git a/rag-tui.py b/rag-tui.py index c711b0b..aeba78a 100755 --- a/rag-tui.py +++ b/rag-tui.py @@ -16,17 +16,83 @@ class SimpleTUI: self.project_path: Optional[Path] = None self.current_config: Dict[str, Any] = {} self.search_count = 0 # Track searches for sample reminder + self.config_dir = Path.home() / '.mini-rag-tui' + self.config_file = self.config_dir / 'last_project.json' + # Load last project on startup + self._load_last_project() + + def _load_last_project(self): + """Load the last used project from config file, or auto-detect current directory.""" + # First check if current directory has .mini-rag folder (auto-detect) + current_dir = Path.cwd() + if (current_dir / '.mini-rag').exists(): + self.project_path = current_dir + # Save this as the last project too + self._save_last_project() + return + + # If no auto-detection, try loading from config file + try: + if hasattr(self, 'config_file') and self.config_file.exists(): + with open(self.config_file, 'r') as f: + data = json.load(f) + project_path = Path(data.get('last_project', '')) + if project_path.exists() and project_path.is_dir(): + self.project_path = project_path + except Exception: + # If loading fails, just continue without last project + pass + + def _save_last_project(self): + """Save current project as last used.""" + if not 
self.project_path: + return + try: + self.config_dir.mkdir(exist_ok=True) + data = {'last_project': str(self.project_path)} + with open(self.config_file, 'w') as f: + json.dump(data, f) + except Exception: + # If saving fails, just continue + pass + + def _get_llm_status(self): + """Get LLM status for display in main menu.""" + try: + # Import here to avoid startup delays + sys.path.insert(0, str(Path(__file__).parent)) + from mini_rag.llm_synthesizer import LLMSynthesizer + from mini_rag.config import RAGConfig, ConfigManager + + # Load config for model rankings + if self.project_path: + config_manager = ConfigManager(self.project_path) + config = config_manager.load_config() + else: + config = RAGConfig() + + synthesizer = LLMSynthesizer(config=config) + if synthesizer.is_available(): + # Get the model that would be selected + synthesizer._ensure_initialized() + model = synthesizer.model + return "āœ… Ready", model + else: + return "āŒ Ollama not running", None + except Exception as e: + return f"āŒ Error: {str(e)[:20]}...", None + def clear_screen(self): """Clear the terminal screen.""" os.system('cls' if os.name == 'nt' else 'clear') def print_header(self): """Print the main header.""" - print("╔════════════════════════════════════════════════════╗") - print("ā•‘ FSS-Mini-RAG TUI ā•‘") - print("ā•‘ Semantic Code Search Interface ā•‘") - print("ā•šā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•") + print("+====================================================+") + print("| FSS-Mini-RAG TUI |") + print("| Semantic Code Search Interface |") + print("+====================================================+") print() def print_cli_command(self, command: str, description: str = ""): @@ -43,10 +109,14 @@ class SimpleTUI: else: full_prompt = f"{prompt}: " - result = input(full_prompt).strip() - return result if result else default + try: + result = input(full_prompt).strip() + return result if result else default + except (KeyboardInterrupt, EOFError): + print("\nGoodbye!") + sys.exit(0) - def show_menu(self, title: str, options: List[str], show_cli: bool = True) -> int: + def show_menu(self, title: str, options: List[str], show_cli: bool = True, back_option: str = None) -> int: """Show a menu and get user selection.""" print(f"šŸŽÆ {title}") print("=" * (len(title) + 3)) @@ -55,6 +125,10 @@ class SimpleTUI: for i, option in enumerate(options, 1): print(f"{i}. {option}") + # Add back/exit option + if back_option: + print(f"0. 
{back_option}") + if show_cli: print() print("šŸ’” All these actions can be done via CLI commands") @@ -64,13 +138,16 @@ class SimpleTUI: while True: try: choice = int(input("Select option (number): ")) - if 1 <= choice <= len(options): + if choice == 0 and back_option: + return -1 # Special value for back/exit + elif 1 <= choice <= len(options): return choice - 1 else: - print(f"Please enter a number between 1 and {len(options)}") + valid_range = "0-" + str(len(options)) if back_option else "1-" + str(len(options)) + print(f"Please enter a number between {valid_range}") except ValueError: print("Please enter a valid number") - except KeyboardInterrupt: + except (KeyboardInterrupt, EOFError): print("\nGoodbye!") sys.exit(0) @@ -88,49 +165,90 @@ class SimpleTUI: print(f"Current project: {self.project_path}") print() - options = [ - "Enter project path", - "Use current directory", - "Browse recent projects" if self.project_path else "Skip (will ask later)" - ] + print("šŸ’” New to FSS-Mini-RAG? Select 'Use current directory' to") + print(" explore this RAG system's own codebase as your first demo!") + print() - choice = self.show_menu("Choose project directory", options, show_cli=False) + # If we already have a project, show it prominently and offer quick actions + if self.project_path: + rag_dir = self.project_path / '.mini-rag' + is_indexed = rag_dir.exists() + status_text = "Ready for search āœ…" if is_indexed else "Needs indexing āŒ" + + print(f"Current: {self.project_path.name} ({status_text})") + print() + + options = [ + "Keep current project (go back to main menu)", + "Use current directory (this folder)", + "Enter different project path", + "Browse recent projects" + ] + else: + options = [ + "Use current directory (perfect for beginners - try the RAG codebase!)", + "Enter project path (if you have a specific project)", + "Browse recent projects" + ] - if choice == 0: - # Enter path manually - while True: - path_str = self.get_input("Enter project directory path", - str(self.project_path) if self.project_path else "") - - if not path_str: - continue - - project_path = Path(path_str).expanduser().resolve() - - if project_path.exists() and project_path.is_dir(): - self.project_path = project_path - print(f"āœ… Selected: {self.project_path}") - break - else: - print(f"āŒ Directory not found: {project_path}") - retry = input("Try again? 
(y/N): ").lower() - if retry != 'y': - break + choice = self.show_menu("Choose project directory", options, show_cli=False, back_option="Back to main menu") - elif choice == 1: - # Use current directory - self.project_path = Path.cwd() - print(f"āœ… Using current directory: {self.project_path}") + if choice == -1: # Back to main menu + return - elif choice == 2: - # Browse recent projects or skip - if self.project_path: + # Handle different choice patterns based on whether we have a project + if self.project_path: + if choice == 0: + # Keep current project - just go back + return + elif choice == 1: + # Use current directory + self.project_path = Path.cwd() + print(f"āœ… Using current directory: {self.project_path}") + self._save_last_project() + elif choice == 2: + # Enter different project path + self._enter_project_path() + elif choice == 3: + # Browse recent projects + self.browse_recent_projects() + else: + if choice == 0: + # Use current directory + self.project_path = Path.cwd() + print(f"āœ… Using current directory: {self.project_path}") + self._save_last_project() + elif choice == 1: + # Enter project path + self._enter_project_path() + elif choice == 2: + # Browse recent projects self.browse_recent_projects() - else: - print("No project selected - you can choose one later from the main menu") input("\nPress Enter to continue...") + def _enter_project_path(self): + """Helper method to handle manual project path entry.""" + while True: + path_str = self.get_input("Enter project directory path", + str(self.project_path) if self.project_path else "") + + if not path_str: + continue + + project_path = Path(path_str).expanduser().resolve() + + if project_path.exists() and project_path.is_dir(): + self.project_path = project_path + print(f"āœ… Selected: {self.project_path}") + self._save_last_project() + break + else: + print(f"āŒ Directory not found: {project_path}") + retry = input("Try again? (y/N): ").lower() + if retry != 'y': + break + def browse_recent_projects(self): """Browse recently indexed projects.""" print("šŸ•’ Recent Projects") @@ -192,6 +310,7 @@ class SimpleTUI: if 1 <= choice <= len(recent_projects): self.project_path = recent_projects[choice - 1] print(f"āœ… Selected: {self.project_path}") + self._save_last_project() except (ValueError, IndexError): print("Selection cancelled") @@ -214,9 +333,7 @@ class SimpleTUI: # Check if already indexed rag_dir = self.project_path / '.mini-rag' if rag_dir.exists(): - print("āš ļø Project appears to be already indexed") - print() - force = input("Re-index everything? 
(y/N): ").lower() == 'y' + force = self._show_existing_index_info(rag_dir) else: force = False @@ -227,26 +344,157 @@ class SimpleTUI: self.print_cli_command(cli_cmd, "Index project for semantic search") - print("Starting indexing...") + # Import here to avoid startup delays + sys.path.insert(0, str(Path(__file__).parent)) + from mini_rag.indexer import ProjectIndexer + + # Get file count and show preview before starting + print("šŸ” Analyzing project structure...") print("=" * 50) - # Actually run the indexing try: - # Import here to avoid startup delays - sys.path.insert(0, str(Path(__file__).parent)) - from mini_rag.indexer import ProjectIndexer - indexer = ProjectIndexer(self.project_path) + + # Get files that would be indexed + files_to_index = indexer._get_files_to_index() + total_files = len(files_to_index) + + if total_files == 0: + print("āœ… All files are already up to date!") + print(" No indexing needed.") + input("\nPress Enter to continue...") + return + + # Show file analysis + print(f"šŸ“Š Indexing Analysis:") + print(f" Files to process: {total_files}") + + # Analyze file types + file_types = {} + total_size = 0 + for file_path in files_to_index: + ext = file_path.suffix.lower() or 'no extension' + file_types[ext] = file_types.get(ext, 0) + 1 + try: + total_size += file_path.stat().st_size + except: + pass + + # Show breakdown + print(f" Total size: {total_size / (1024*1024):.1f}MB") + print(f" File types:") + for ext, count in sorted(file_types.items(), key=lambda x: x[1], reverse=True): + print(f" • {ext}: {count} files") + + # Conservative time estimate for average hardware + estimated_time = self._estimate_processing_time(total_files, total_size) + print(f" Estimated time: {estimated_time}") + + print() + print("šŸ’” What indexing does:") + print(" • Reads and analyzes each file's content (READ-ONLY)") + print(" • Breaks content into semantic chunks") + print(" • Generates embeddings for semantic search") + print(" • Stores everything in a separate .mini-rag/ database") + print() + print("šŸ›”ļø SAFETY GUARANTEE:") + print(" • Your original files are NEVER modified or touched") + print(" • Only reads files to create the search index") + print(" • All data stored separately in .mini-rag/ folder") + print(" • You can delete the .mini-rag/ folder anytime to remove all traces") + print() + + # Confirmation + confirm = input("šŸš€ Proceed with indexing? 
[Y/n]: ").strip().lower() + if confirm and confirm != 'y' and confirm != 'yes': + print("Indexing cancelled.") + input("Press Enter to continue...") + return + + print("\nšŸš€ Starting indexing...") + print("=" * 50) + + # Actually run the indexing result = indexer.index_project(force_reindex=force) print() - print("āœ… Indexing completed!") - print(f" Files processed: {result.get('files_indexed', 0)}") - print(f" Chunks created: {result.get('chunks_created', 0)}") - print(f" Time taken: {result.get('time_taken', 0):.1f}s") + print("šŸŽ‰ INDEXING COMPLETE!") + print("=" * 50) - if result.get('files_failed', 0) > 0: - print(f" āš ļø Files failed: {result['files_failed']}") + # Comprehensive performance summary + files_processed = result.get('files_indexed', 0) + chunks_created = result.get('chunks_created', 0) + time_taken = result.get('time_taken', 0) + files_failed = result.get('files_failed', 0) + files_per_second = result.get('files_per_second', 0) + + print(f"šŸ“Š PROCESSING SUMMARY:") + print(f" āœ… Files successfully processed: {files_processed:,}") + print(f" 🧩 Semantic chunks created: {chunks_created:,}") + print(f" ā±ļø Total processing time: {time_taken:.2f} seconds") + print(f" šŸš€ Processing speed: {files_per_second:.1f} files/second") + + if files_failed > 0: + print(f" āš ļø Files with issues: {files_failed}") + + # Show what we analyzed + if chunks_created > 0: + avg_chunks_per_file = chunks_created / max(files_processed, 1) + print() + print(f"šŸ” CONTENT ANALYSIS:") + print(f" • Average chunks per file: {avg_chunks_per_file:.1f}") + print(f" • Semantic boundaries detected and preserved") + print(f" • Function and class contexts captured") + print(f" • Documentation and code comments indexed") + + # Try to show embedding info + try: + embedder = indexer.embedder + embed_info = embedder.get_embedding_info() + print(f" • Embedding method: {embed_info.get('method', 'Unknown')}") + print(f" • Vector dimensions: {embedder.get_embedding_dim()}") + except: + pass + + # Database info + print() + print(f"šŸ’¾ DATABASE CREATED:") + print(f" • Location: {self.project_path}/.mini-rag/") + print(f" • Vector database with {chunks_created:,} searchable chunks") + print(f" • Optimized for fast semantic similarity search") + print(f" • Supports natural language queries") + + # Performance metrics + if time_taken > 0: + print() + print(f"⚔ PERFORMANCE METRICS:") + chunks_per_second = chunks_created / time_taken if time_taken > 0 else 0 + print(f" • {chunks_per_second:.0f} chunks processed per second") + + # Estimate search performance + estimated_search_time = max(0.1, chunks_created / 10000) # Very rough estimate + print(f" • Estimated search time: ~{estimated_search_time:.1f}s per query") + + if total_size > 0: + mb_per_second = (total_size / (1024*1024)) / time_taken + print(f" • Data processing rate: {mb_per_second:.1f} MB/second") + + # What's next + print() + print(f"šŸŽÆ READY FOR SEARCH!") + print(f" Your codebase is now fully indexed and searchable.") + print(f" Try queries like:") + print(f" • 'authentication logic'") + print(f" • 'error handling patterns'") + print(f" • 'database connection setup'") + print(f" • 'unit tests for validation'") + + if files_failed > 0: + print() + print(f"šŸ“‹ NOTES:") + print(f" • {files_failed} files couldn't be processed (binary files, encoding issues, etc.)") + print(f" • This is normal - only text-based files are indexed") + print(f" • All processable content has been successfully indexed") except Exception as e: print(f"āŒ Indexing 
failed: {e}") @@ -255,6 +503,83 @@ class SimpleTUI: print() input("Press Enter to continue...") + def _show_existing_index_info(self, rag_dir: Path) -> bool: + """Show essential info about existing index and ask about re-indexing.""" + print("šŸ“Š EXISTING INDEX FOUND") + print("=" * 50) + print() + print("šŸ›”ļø Your original files are safe and unmodified.") + print() + + try: + manifest_path = rag_dir / 'manifest.json' + if manifest_path.exists(): + import json + from datetime import datetime + + with open(manifest_path, 'r') as f: + manifest = json.load(f) + + file_count = manifest.get('file_count', 0) + chunk_count = manifest.get('chunk_count', 0) + indexed_at = manifest.get('indexed_at', 'Unknown') + + print(f"• Files indexed: {file_count:,}") + print(f"• Chunks created: {chunk_count:,}") + + # Show when it was last indexed + if indexed_at != 'Unknown': + try: + dt = datetime.fromisoformat(indexed_at.replace('Z', '+00:00')) + time_ago = datetime.now() - dt.replace(tzinfo=None) + + if time_ago.days > 0: + age_str = f"{time_ago.days} day(s) ago" + elif time_ago.seconds > 3600: + age_str = f"{time_ago.seconds // 3600} hour(s) ago" + else: + age_str = f"{time_ago.seconds // 60} minute(s) ago" + + print(f"• Last indexed: {age_str}") + except: + print(f"• Last indexed: {indexed_at}") + else: + print("• Last indexed: Unknown") + + # Simple recommendation + if time_ago.days >= 7: + print(f"\nšŸ’” RECOMMEND: Re-index (index is {time_ago.days} days old)") + elif time_ago.days >= 1: + print(f"\nšŸ’” MAYBE: Re-index if you've made changes ({time_ago.days} day(s) old)") + else: + print(f"\nšŸ’” RECOMMEND: Skip (index is recent)") + + estimate = self._estimate_processing_time(file_count, 0) + print(f"• Re-indexing would take: {estimate}") + + else: + print("āš ļø Index corrupted - recommend re-indexing") + + except Exception: + print("āš ļø Could not read index info - recommend re-indexing") + + print() + choice = input("šŸš€ Re-index everything? [y/N]: ").strip().lower() + return choice in ['y', 'yes'] + + def _estimate_processing_time(self, file_count: int, total_size_bytes: int) -> str: + """Conservative time estimates for average hardware (not high-end dev machines).""" + # Conservative: 2 seconds per file for average hardware (4x buffer from fast machines) + estimated_seconds = file_count * 2.0 + 15 # +15s startup overhead + + if estimated_seconds < 60: + return "1-2 minutes" + elif estimated_seconds < 300: # 5 minutes + minutes = int(estimated_seconds / 60) + return f"{minutes}-{minutes + 1} minutes" + else: + minutes = int(estimated_seconds / 60) + return f"{minutes}+ minutes" def search_interactive(self): """Interactive search interface.""" if not self.project_path: @@ -279,51 +604,54 @@ class SimpleTUI: print(f"Project: {self.project_path.name}") print() - # Show sample questions for beginners - relevant to FSS-Mini-RAG - print("šŸ’” Not sure what to search for? Try these questions about FSS-Mini-RAG:") - print() - sample_questions = [ - "chunking strategy", - "ollama integration", - "indexing performance", - "why does indexing take long", - "how to improve search results", - "embedding generation" - ] - - for i, question in enumerate(sample_questions[:3], 1): - print(f" {i}. {question}") - print(" 4. 
Enter your own question") + # More prominent search input + print("šŸŽÆ ENTER YOUR SEARCH QUERY:") + print(" Ask any question about your codebase using natural language") + print(" Examples: 'chunking strategy', 'ollama integration', 'embedding generation'") print() - # Let user choose a sample or enter their own - choice_str = self.get_input("Choose a number (1-4) or press Enter for custom", "4") + # Primary input - direct query entry + query = self.get_input("Search query", "").strip() - try: - choice = int(choice_str) - if 1 <= choice <= 3: - query = sample_questions[choice - 1] - print(f"Selected: '{query}'") - print() - else: - query = self.get_input("Enter your search query", "").strip() - except ValueError: - query = self.get_input("Enter your search query", "").strip() + # If they didn't enter anything, show sample options + if not query: + print() + print("šŸ’” Need inspiration? Try one of these sample queries:") + print() + sample_questions = [ + "chunking strategy", + "ollama integration", + "indexing performance", + "why does indexing take long", + "how to improve search results", + "embedding generation" + ] + + for i, question in enumerate(sample_questions[:3], 1): + print(f" {i}. {question}") + print() + + choice_str = self.get_input("Select a sample query (1-3) or press Enter to go back", "") + + if choice_str.isdigit(): + choice = int(choice_str) + if 1 <= choice <= 3: + query = sample_questions[choice - 1] + print(f"āœ… Using: '{query}'") + print() + + # If still no query, return to menu if not query: return - # Get result limit - try: - limit = int(self.get_input("Number of results", "10")) - limit = max(1, min(20, limit)) # Clamp between 1-20 - except ValueError: - limit = 10 + # Use a sensible default for results to streamline UX + top_k = 10 # Good default, advanced users can use CLI for more options # Show CLI command cli_cmd = f"./rag-mini search {self.project_path} \"{query}\"" - if limit != 10: - cli_cmd += f" --limit {limit}" + if top_k != 10: + cli_cmd += f" --top-k {top_k}" self.print_cli_command(cli_cmd, "Search for semantic matches") @@ -338,7 +666,7 @@ class SimpleTUI: searcher = CodeSearcher(self.project_path) # Enable query expansion in TUI for better results searcher.config.search.expand_queries = True - results = searcher.search(query, top_k=limit) + results = searcher.search(query, top_k=top_k) if not results: print("āŒ No results found") @@ -352,9 +680,18 @@ class SimpleTUI: print() for i, result in enumerate(results, 1): + # Add divider and whitespace before each result (except first) + if i > 1: + print() + print("-" * 60) + print() + # Clean up file path try: - rel_path = result.file_path.relative_to(self.project_path) + if hasattr(result.file_path, 'relative_to'): + rel_path = result.file_path.relative_to(self.project_path) + else: + rel_path = Path(result.file_path).relative_to(self.project_path) except: rel_path = result.file_path @@ -392,6 +729,13 @@ class SimpleTUI: for i, question in enumerate(follow_up_questions, 1): print(f" {i}. {question}") + # Show additional CLI commands + print() + print("šŸ’» CLI Commands:") + print(f" ./rag-mini search {self.project_path} \"{query}\" --top-k 20 # More results") + print(f" ./rag-mini explore {self.project_path} # Interactive mode") + print(f" ./rag-mini search {self.project_path} \"{query}\" --synthesize # With AI summary") + # Ask if they want to run a follow-up search print() choice = input("Run a follow-up search? 
Enter number (1-3) or press Enter to continue: ").strip() @@ -407,8 +751,17 @@ class SimpleTUI: print(f"āœ… Found {len(follow_results)} follow-up results:") print() for i, result in enumerate(follow_results[:3], 1): # Show top 3 + # Add divider for follow-up results too + if i > 1: + print() + print("-" * 40) + print() + try: - rel_path = result.file_path.relative_to(self.project_path) + if hasattr(result.file_path, 'relative_to'): + rel_path = result.file_path.relative_to(self.project_path) + else: + rel_path = Path(result.file_path).relative_to(self.project_path) except: rel_path = result.file_path print(f"{i}. {rel_path} (Score: {result.score:.3f})") @@ -448,12 +801,19 @@ class SimpleTUI: print(f"\nSwitching to full project: {parent_dir}") print("Starting full indexing...") # Note: This would trigger full indexing in real implementation - print(f" Or: ./rag-mini-enhanced context {self.project_path} \"{query}\"") - print() - + except Exception as e: print(f"āŒ Search failed: {e}") - print(" Try running the CLI command directly for more details") + print() + print("šŸ’” Try these CLI commands for more details:") + print(f" ./rag-mini search {self.project_path} \"{query}\" --verbose") + print(f" ./rag-mini status {self.project_path}") + print(" ./rag-mini --help") + print() + print("šŸ”§ Common solutions:") + print(" • Make sure the project is indexed first") + print(" • Check if Ollama is running: ollama serve") + print(" • Try a simpler search query") print() input("Press Enter to continue...") @@ -485,8 +845,15 @@ class SimpleTUI: if results: file_extensions = set() for result in results[:3]: # Check first 3 results - ext = result.file_path.suffix.lower() - file_extensions.add(ext) + try: + # Handle both Path objects and strings + if hasattr(result.file_path, 'suffix'): + ext = result.file_path.suffix.lower() + else: + ext = Path(result.file_path).suffix.lower() + file_extensions.add(ext) + except: + continue # Skip if we can't get extension if '.py' in file_extensions: follow_ups.append("Python module dependencies") @@ -549,61 +916,173 @@ class SimpleTUI: input("Press Enter to continue...") return - print("\nšŸ¤” Ask your first question about the codebase:") - print(" (Type 'help' for commands, 'quit' to return to menu)") + print("\nšŸ¤” Ask questions about the codebase:") + print(" Quick: 0=quit, 1=summary, 2=history, 3=suggest next question") while True: try: question = input("\n> ").strip() + # Handle numbered options + if question == '0': + print(explorer.end_session()) + break + elif question == '1': + print("\n" + explorer.get_session_summary()) + continue + elif question == '2': + if hasattr(explorer.current_session, 'conversation_history') and explorer.current_session.conversation_history: + print("\nšŸ” Recent questions:") + for i, exchange in enumerate(explorer.current_session.conversation_history[-3:], 1): + q = exchange["question"][:50] + "..." if len(exchange["question"]) > 50 else exchange["question"] + print(f" {i}. {q}") + else: + print("\nšŸ“ No questions asked yet") + continue + elif question == '3': + # Generate smart suggestion + suggested_question = self._generate_smart_suggestion(explorer) + if suggested_question: + print(f"\nšŸ’” Suggested question: {suggested_question}") + print(" Press Enter to use this, or type your own question:") + next_input = input("> ").strip() + if not next_input: # User pressed Enter to use suggestion + question = suggested_question + else: + question = next_input + else: + print("\nšŸ’” No suggestions available yet. 
Ask a question first!") + continue + + # Simple exit handling if question.lower() in ['quit', 'exit', 'q', 'back']: - print("\n" + explorer.end_session()) + print(explorer.end_session()) break + # Skip empty input if not question: continue - if question.lower() in ['help', 'h']: - print(""" -🧠 EXPLORATION MODE HELP: - • Ask any question about the codebase - • I remember our conversation for follow-up questions - • Use 'why', 'how', 'explain' for detailed reasoning - • Type 'summary' to see session overview - • Type 'quit' to return to main menu - -šŸ’” Example questions: - • "How does authentication work?" - • "Why is this function slow?" - • "Explain the database connection logic" - • "What are the security concerns here?" -""") + # Simple help + if question.lower() in ['help', 'h', '?']: + print("\nšŸ’” Just ask any question about the codebase!") + print(" Examples: 'how does search work?' or 'explain the indexing'") + print(" Quick: 0=quit, 1=summary, 2=history, 3=suggest") continue - if question.lower() == 'summary': - print("\n" + explorer.get_session_summary()) - continue - - print("\nšŸ” Analyzing...") + # Process the question immediately + print("šŸ” Thinking...") response = explorer.explore_question(question) if response: - print(f"\n{response}") + print(f"\n{response}\n") else: - print("āŒ Sorry, I couldn't process that question. Please try again.") + print("āŒ Sorry, I couldn't process that question.\n") except KeyboardInterrupt: - print(f"\n\n{explorer.end_session()}") + print(f"\n{explorer.end_session()}") break except EOFError: - print(f"\n\n{explorer.end_session()}") + print(f"\n{explorer.end_session()}") break except Exception as e: print(f"āŒ Exploration mode failed: {e}") print(" Try running the CLI command directly for more details") + input("\nPress Enter to continue...") + return - input("\nPress Enter to continue...") + # Exploration session completed successfully, return to menu without extra prompt + + def _generate_smart_suggestion(self, explorer): + """Generate a smart follow-up question based on conversation context.""" + if not explorer.current_session or not explorer.current_session.conversation_history: + return None + + try: + # Get recent conversation context + recent_exchanges = explorer.current_session.conversation_history[-2:] # Last 2 exchanges + context_summary = "" + + for i, exchange in enumerate(recent_exchanges, 1): + q = exchange["question"] + summary = exchange["response"]["summary"][:100] + "..." if len(exchange["response"]["summary"]) > 100 else exchange["response"]["summary"] + context_summary += f"Q{i}: {q}\nA{i}: {summary}\n\n" + + # Create a very focused prompt that encourages short responses + prompt = f"""Based on this recent conversation about a codebase, suggest ONE short follow-up question (under 10 words). + +Recent conversation: +{context_summary.strip()} + +Respond with ONLY a single short question that would logically explore deeper or connect to what was discussed. Examples: +- "Why does this approach work better?" +- "What could go wrong here?" +- "How is this tested?" +- "Where else is this pattern used?" + +Your suggested question (under 10 words):""" + + # Use the synthesizer to generate suggestion + response = explorer.synthesizer._call_ollama(prompt, temperature=0.3, disable_thinking=True) + + if response: + # Clean up the response - extract just the question + lines = response.strip().split('\n') + for line in lines: + line = line.strip() + if line and ('?' 
in line or line.lower().startswith(('what', 'how', 'why', 'where', 'when', 'which', 'who'))):
+                        # Remove any prefixes like "Question:" or numbers
+                        cleaned = line.split(':', 1)[-1].strip()
+                        if len(cleaned) < 80 and ('?' in cleaned or cleaned.lower().startswith(('what', 'how', 'why', 'where', 'when', 'which', 'who'))):
+                            return cleaned
+
+                # Fallback: use first non-empty line if it looks like a question
+                first_line = lines[0].strip() if lines else ""
+                if first_line and len(first_line) < 80:
+                    return first_line
+
+            # Fallback: pattern-based suggestions if LLM fails
+            return self._get_fallback_suggestion(recent_exchanges)
+
+        except Exception as e:
+            # Silent fail with pattern-based fallback
+            recent_exchanges = explorer.current_session.conversation_history[-2:] if explorer.current_session.conversation_history else []
+            return self._get_fallback_suggestion(recent_exchanges)
+
+    def _get_fallback_suggestion(self, recent_exchanges):
+        """Generate pattern-based suggestions as fallback."""
+        if not recent_exchanges:
+            return None
+
+        last_question = recent_exchanges[-1]["question"].lower()
+
+        # Simple pattern matching for common follow-ups
+        if "how" in last_question and "work" in last_question:
+            return "What could go wrong with this approach?"
+        elif "what" in last_question and ("is" in last_question or "does" in last_question):
+            return "How is this implemented?"
+        elif "implement" in last_question or "code" in last_question:
+            return "How is this tested?"
+        elif "error" in last_question or "bug" in last_question:
+            return "How can this be prevented?"
+        elif "performance" in last_question or "speed" in last_question:
+            return "What are the bottlenecks here?"
+        elif "security" in last_question or "safe" in last_question:
+            return "What other security concerns exist?"
+        elif "test" in last_question:
+            return "What edge cases should be considered?"
+        else:
+            # Generic follow-ups
+            fallbacks = [
+                "How is this used elsewhere?",
+                "What are the alternatives?",
+                "Why was this approach chosen?",
+                "What happens when this fails?",
+                "How can this be improved?"
+ ] + import random + return random.choice(fallbacks) def show_status(self): """Show project and system status.""" @@ -735,7 +1214,7 @@ class SimpleTUI: print(" • chunking.strategy - 'semantic' (smart) vs 'fixed' (simple)") print(" • files.exclude_patterns - Skip files matching these patterns") print(" • embedding.preferred_method - 'ollama', 'ml', 'hash', or 'auto'") - print(" • search.default_limit - Default number of search results") + print(" • search.default_top_k - Default number of search results (top-k)") print() print("šŸ“š References:") @@ -796,7 +1275,7 @@ class SimpleTUI: print("āš™ļø Options:") print(" --force # Force complete re-index") - print(" --limit N # Limit search results") + print(" --top-k N # Number of top results to return") print(" --verbose # Show detailed output") print() @@ -812,11 +1291,44 @@ class SimpleTUI: self.clear_screen() self.print_header() - # Show current project status + # Show current project status prominently if self.project_path: rag_dir = self.project_path / '.mini-rag' - status = "āœ… Indexed" if rag_dir.exists() else "āŒ Not indexed" - print(f"šŸ“ Current project: {self.project_path.name} ({status})") + is_indexed = rag_dir.exists() + status_icon = "āœ…" if is_indexed else "āŒ" + status_text = "Ready for search" if is_indexed else "Needs indexing" + + # Check LLM status + llm_status, llm_model = self._get_llm_status() + + print("╔════════════════════════════════════════════════════╗") + # Calculate exact spacing for 50-char content width + project_line = f" Current Project: {self.project_path.name}" + print(f"ā•‘{project_line:<50}ā•‘") + + status_line = f" Index Status: {status_icon} {status_text}" + print(f"ā•‘{status_line:<50}ā•‘") + + llm_line = f" LLM Status: {llm_status}" + print(f"ā•‘{llm_line:<50}ā•‘") + + if llm_model: + model_line = f" Model: {llm_model}" + print(f"ā•‘{model_line:<50}ā•‘") + + if is_indexed: + # Show quick stats if indexed + try: + manifest = rag_dir / 'manifest.json' + if manifest.exists(): + with open(manifest) as f: + data = json.load(f) + file_count = data.get('file_count', 0) + files_line = f" Files indexed: {file_count}" + print(f"ā•‘{files_line:<50}ā•‘") + except: + pass + print("ā•šā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•") print() else: # Show beginner tips when no project selected @@ -825,20 +1337,50 @@ class SimpleTUI: print(" Start by selecting a project directory below.") print() - options = [ - "Select project directory", - "Index project for search", - "Search project (Fast synthesis)", - "Explore project (Deep thinking)", - "View status", - "Configuration", - "CLI command reference", - "Exit" - ] + # Create options with visual cues based on project status + if self.project_path: + rag_dir = self.project_path / '.mini-rag' + is_indexed = rag_dir.exists() + + if is_indexed: + options = [ + "Select project directory", + "\033[2mIndex project for search (already indexed)\033[0m", + "Search project (Fast synthesis)", + "Explore project (Deep thinking)", + "View status", + "Configuration", + "CLI command reference" + ] + else: + options = [ + "Select project directory", + "Index project for search", + "\033[2mSearch project (needs indexing first)\033[0m", + "\033[2mExplore project (needs indexing first)\033[0m", + "View status", + "Configuration", + "CLI command reference" + ] + else: + # No project selected - gray out project-dependent options + options = [ + "Select project 
directory", + "\033[2mIndex project for search (select project first)\033[0m", + "\033[2mSearch project (select project first)\033[0m", + "\033[2mExplore project (select project first)\033[0m", + "\033[2mView status (select project first)\033[0m", + "Configuration", + "CLI command reference" + ] - choice = self.show_menu("Main Menu", options) + choice = self.show_menu("Main Menu", options, back_option="Exit") - if choice == 0: + if choice == -1: # Exit (0 option) + print("\nThanks for using FSS-Mini-RAG! šŸš€") + print("Try the CLI commands for even more power!") + break + elif choice == 0: self.select_project() elif choice == 1: self.index_project_interactive() @@ -852,17 +1394,35 @@ class SimpleTUI: self.show_configuration() elif choice == 6: self.show_cli_reference() - elif choice == 7: - print("\nThanks for using FSS-Mini-RAG! šŸš€") - print("Try the CLI commands for even more power!") - break def main(): """Main entry point.""" try: + # Check if we can import dependencies + try: + sys.path.insert(0, str(Path(__file__).parent)) + from mini_rag.venv_checker import check_and_warn_venv + check_and_warn_venv("rag-tui", force_exit=False) + except ImportError as e: + # Dependencies missing - show helpful message + script_dir = Path(__file__).parent + print("āŒ FSS-Mini-RAG dependencies not found!") + print("") + print("šŸ”§ To fix this:") + print(f" 1. Run the installer: {script_dir}/install_mini_rag.sh") + print(f" 2. Or use the wrapper script: {script_dir}/rag-tui") + print(" 3. Or activate the virtual environment first:") + print(f" cd {script_dir}") + print(" source .venv/bin/activate") + print(f" python3 {script_dir}/rag-tui.py") + print("") + print(f"šŸ’” Dependencies missing: {e}") + input("\nPress Enter to exit...") + return + tui = SimpleTUI() tui.main_menu() - except KeyboardInterrupt: + except (KeyboardInterrupt, EOFError): print("\n\nGoodbye! šŸ‘‹") except Exception as e: print(f"\nUnexpected error: {e}") diff --git a/test_fixes.py b/test_fixes.py new file mode 100644 index 0000000..cdcbc3f --- /dev/null +++ b/test_fixes.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python3 +""" +Quick test script to verify our key fixes without heavy dependencies. 
+ +āš ļø IMPORTANT: This test requires the virtual environment to be activated: + source .venv/bin/activate + python test_fixes.py + +Or run directly with venv: + source .venv/bin/activate && python test_fixes.py +""" + +import sys +import os +import tempfile +from pathlib import Path + +# Check if virtual environment is activated +def check_venv(): + if 'VIRTUAL_ENV' not in os.environ: + print("āš ļø WARNING: Virtual environment not detected!") + print(" This test requires the virtual environment to be activated.") + print(" Run: source .venv/bin/activate && python test_fixes.py") + print(" Continuing anyway...\n") + +check_venv() + +# Add current directory to Python path +sys.path.insert(0, '.') + +def test_config_model_rankings(): + """Test that model rankings are properly configured.""" + print("=" * 60) + print("TESTING CONFIG AND MODEL RANKINGS") + print("=" * 60) + + try: + # Test config loading without heavy dependencies + from mini_rag.config import ConfigManager, LLMConfig + + # Create a temporary directory for testing + with tempfile.TemporaryDirectory() as tmpdir: + config_manager = ConfigManager(tmpdir) + config = config_manager.load_config() + + print("āœ“ Config loads successfully") + + # Check LLM config and model rankings + if hasattr(config, 'llm'): + llm_config = config.llm + print(f"āœ“ LLM config found: {type(llm_config)}") + + if hasattr(llm_config, 'model_rankings'): + rankings = llm_config.model_rankings + print(f"āœ“ Model rankings: {rankings}") + + if rankings and rankings[0] == "qwen3:1.7b": + print("āœ“ qwen3:1.7b is FIRST priority - CORRECT!") + return True + else: + print(f"āœ— WRONG: First model is {rankings[0] if rankings else 'None'}, should be qwen3:1.7b") + return False + else: + print("āœ— Model rankings not found in LLM config") + return False + else: + print("āœ— LLM config not found") + return False + + except ImportError as e: + print(f"āœ— Import error: {e}") + return False + except Exception as e: + print(f"āœ— Error: {e}") + return False + +def test_context_length_fix(): + """Test that context length is correctly set to 32K.""" + print("\n" + "=" * 60) + print("TESTING CONTEXT LENGTH FIXES") + print("=" * 60) + + try: + # Read the synthesizer file and check for 32000 + with open('mini_rag/llm_synthesizer.py', 'r') as f: + synthesizer_content = f.read() + + if '"num_ctx": 32000' in synthesizer_content: + print("āœ“ LLM Synthesizer: num_ctx is correctly set to 32000") + elif '"num_ctx": 80000' in synthesizer_content: + print("āœ— LLM Synthesizer: num_ctx is still 80000 - NEEDS FIX") + return False + else: + print("? LLM Synthesizer: num_ctx setting not found clearly") + + # Read the safeguards file and check for 32000 + with open('mini_rag/llm_safeguards.py', 'r') as f: + safeguards_content = f.read() + + if 'context_window: int = 32000' in safeguards_content: + print("āœ“ Safeguards: context_window is correctly set to 32000") + return True + elif 'context_window: int = 80000' in safeguards_content: + print("āœ— Safeguards: context_window is still 80000 - NEEDS FIX") + return False + else: + print("? 
Safeguards: context_window setting not found clearly") + return False + + except Exception as e: + print(f"āœ— Error checking context length: {e}") + return False + +def test_safeguard_preservation(): + """Test that safeguards preserve content instead of dropping it.""" + print("\n" + "=" * 60) + print("TESTING SAFEGUARD CONTENT PRESERVATION") + print("=" * 60) + + try: + # Read the synthesizer file and check for the preservation method + with open('mini_rag/llm_synthesizer.py', 'r') as f: + synthesizer_content = f.read() + + if '_create_safeguard_response_with_content' in synthesizer_content: + print("āœ“ Safeguard content preservation method exists") + else: + print("āœ— Safeguard content preservation method missing") + return False + + # Check for the specific preservation logic + if 'AI Response (use with caution):' in synthesizer_content: + print("āœ“ Content preservation warning format found") + else: + print("āœ— Content preservation warning format missing") + return False + + # Check that it's being called instead of dropping content + if 'return self._create_safeguard_response_with_content(issue_type, explanation, raw_response)' in synthesizer_content: + print("āœ“ Preservation method is called when safeguards trigger") + return True + else: + print("āœ— Preservation method not called properly") + return False + + except Exception as e: + print(f"āœ— Error checking safeguard preservation: {e}") + return False + +def test_import_fixes(): + """Test that import statements are fixed from claude_rag to mini_rag.""" + print("\n" + "=" * 60) + print("TESTING IMPORT STATEMENT FIXES") + print("=" * 60) + + test_files = [ + 'tests/test_rag_integration.py', + 'tests/01_basic_integration_test.py', + 'tests/test_hybrid_search.py', + 'tests/test_context_retrieval.py' + ] + + all_good = True + + for test_file in test_files: + if Path(test_file).exists(): + try: + with open(test_file, 'r') as f: + content = f.read() + + if 'claude_rag' in content: + print(f"āœ— {test_file}: Still contains 'claude_rag' imports") + all_good = False + elif 'mini_rag' in content: + print(f"āœ“ {test_file}: Uses correct 'mini_rag' imports") + else: + print(f"? {test_file}: No rag imports found") + + except Exception as e: + print(f"āœ— Error reading {test_file}: {e}") + all_good = False + else: + print(f"? 
{test_file}: File not found") + + return all_good + +def main(): + """Run all tests.""" + print("FSS-Mini-RAG Fix Verification Tests") + print("Testing all the critical fixes...") + + tests = [ + ("Model Rankings", test_config_model_rankings), + ("Context Length", test_context_length_fix), + ("Safeguard Preservation", test_safeguard_preservation), + ("Import Fixes", test_import_fixes) + ] + + results = {} + + for test_name, test_func in tests: + try: + results[test_name] = test_func() + except Exception as e: + print(f"āœ— {test_name} test crashed: {e}") + results[test_name] = False + + # Summary + print("\n" + "=" * 60) + print("TEST SUMMARY") + print("=" * 60) + + passed = sum(1 for result in results.values() if result) + total = len(results) + + for test_name, result in results.items(): + status = "āœ“ PASS" if result else "āœ— FAIL" + print(f"{status} {test_name}") + + print(f"\nOverall: {passed}/{total} tests passed") + + if passed == total: + print("šŸŽ‰ ALL TESTS PASSED - System should be working properly!") + return 0 + else: + print("āŒ SOME TESTS FAILED - System needs more fixes!") + return 1 + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/tests/01_basic_integration_test.py b/tests/01_basic_integration_test.py index 281322a..4fec7a7 100644 --- a/tests/01_basic_integration_test.py +++ b/tests/01_basic_integration_test.py @@ -1,5 +1,12 @@ """ Comprehensive demo of the RAG system showing all integrated features. + +āš ļø IMPORTANT: This test requires the virtual environment to be activated: + source .venv/bin/activate + PYTHONPATH=. python tests/01_basic_integration_test.py + +Or run directly with venv: + source .venv/bin/activate && PYTHONPATH=. python tests/01_basic_integration_test.py """ import os @@ -7,6 +14,16 @@ import sys import tempfile from pathlib import Path +# Check if virtual environment is activated +def check_venv(): + if 'VIRTUAL_ENV' not in os.environ: + print("āš ļø WARNING: Virtual environment not detected!") + print(" This test requires the virtual environment to be activated.") + print(" Run: source .venv/bin/activate && PYTHONPATH=. python tests/01_basic_integration_test.py") + print(" Continuing anyway...\n") + +check_venv() + # Fix Windows encoding if sys.platform == 'win32': os.environ['PYTHONUTF8'] = '1' @@ -15,7 +32,7 @@ if sys.platform == 'win32': from mini_rag.chunker import CodeChunker from mini_rag.indexer import ProjectIndexer from mini_rag.search import CodeSearcher -from mini_rag.embeddings import CodeEmbedder +from mini_rag.ollama_embeddings import OllamaEmbedder as CodeEmbedder def main(): print("=" * 60) @@ -189,17 +206,17 @@ if __name__ == "__main__": # Test different search types print("\n a) Semantic search for 'calculate average':") - results = searcher.search("calculate average", limit=3) + results = searcher.search("calculate average", top_k=3) for i, result in enumerate(results, 1): print(f" {i}. {result.chunk_type} '{result.name}' in {result.file_path} (score: {result.score:.3f})") print("\n b) BM25-weighted search for 'divide zero':") - results = searcher.search("divide zero", limit=3, semantic_weight=0.2, bm25_weight=0.8) + results = searcher.search("divide zero", top_k=3, semantic_weight=0.2, bm25_weight=0.8) for i, result in enumerate(results, 1): print(f" {i}. 
{result.chunk_type} '{result.name}' in {result.file_path} (score: {result.score:.3f})") print("\n c) Search with context for 'test addition':") - results = searcher.search("test addition", limit=2, include_context=True) + results = searcher.search("test addition", top_k=2, include_context=True) for i, result in enumerate(results, 1): print(f" {i}. {result.chunk_type} '{result.name}'") if result.parent_chunk: diff --git a/tests/02_search_examples.py b/tests/02_search_examples.py index b478d97..271c1ab 100644 --- a/tests/02_search_examples.py +++ b/tests/02_search_examples.py @@ -37,25 +37,25 @@ def demo_search(project_path: Path): 'title': 'Keyword-Heavy Search', 'query': 'BM25Okapi rank_bm25 search scoring', 'description': 'This query has specific technical keywords that BM25 excels at finding', - 'limit': 5 + 'top_k': 5 }, { 'title': 'Natural Language Query', 'query': 'how to build search index from database chunks', 'description': 'This semantic query benefits from transformer embeddings understanding intent', - 'limit': 5 + 'top_k': 5 }, { 'title': 'Mixed Technical Query', 'query': 'vector embeddings for semantic code search with transformers', 'description': 'This hybrid query combines technical terms with conceptual understanding', - 'limit': 5 + 'top_k': 5 }, { 'title': 'Function Search', 'query': 'search method implementation with filters', 'description': 'Looking for specific function implementations', - 'limit': 5 + 'top_k': 5 } ] @@ -67,7 +67,7 @@ def demo_search(project_path: Path): # Run search with hybrid mode results = searcher.search( query=demo['query'], - limit=demo['limit'], + top_k=demo['top_k'], semantic_weight=0.7, bm25_weight=0.3 ) diff --git a/tests/03_system_validation.py b/tests/03_system_validation.py index 6293c6f..ea47134 100644 --- a/tests/03_system_validation.py +++ b/tests/03_system_validation.py @@ -244,7 +244,7 @@ def compute_median(numbers): searcher = CodeSearcher(project_path) # Test BM25 integration - results = searcher.search("multiply numbers", limit=5, + results = searcher.search("multiply numbers", top_k=5, semantic_weight=0.3, bm25_weight=0.7) if results: @@ -283,7 +283,7 @@ def compute_median(numbers): print(f" - No parent chunk") # Test include_context in search - results_with_context = searcher.search("add", include_context=True, limit=2) + results_with_context = searcher.search("add", include_context=True, top_k=2) if results_with_context: print(f" Found {len(results_with_context)} results with context") for r in results_with_context: diff --git a/tests/test_context_retrieval.py b/tests/test_context_retrieval.py index 2db8d77..5c1a6cd 100644 --- a/tests/test_context_retrieval.py +++ b/tests/test_context_retrieval.py @@ -1,11 +1,29 @@ #!/usr/bin/env python3 """ Test script for adjacent chunk retrieval functionality. + +āš ļø IMPORTANT: This test requires the virtual environment to be activated: + source .venv/bin/activate + PYTHONPATH=. python tests/test_context_retrieval.py + +Or run directly with venv: + source .venv/bin/activate && PYTHONPATH=. 
python tests/test_context_retrieval.py
 """
 
+import os
 from pathlib import Path
 from mini_rag.search import CodeSearcher
-from mini_rag.embeddings import CodeEmbedder
+from mini_rag.ollama_embeddings import OllamaEmbedder as CodeEmbedder
+
+# Check if virtual environment is activated
+def check_venv():
+    if 'VIRTUAL_ENV' not in os.environ:
+        print("āš ļø WARNING: Virtual environment not detected!")
+        print("   This test requires the virtual environment to be activated.")
+        print("   Run: source .venv/bin/activate && PYTHONPATH=. python tests/test_context_retrieval.py")
+        print("   Continuing anyway...\n")
+
+check_venv()
 
 def test_context_retrieval():
     """Test the new context retrieval functionality."""
@@ -20,7 +38,7 @@ def test_context_retrieval():
 
     # Test 1: Search without context
     print("\n1. Search WITHOUT context:")
-    results = searcher.search("chunk metadata", limit=3, include_context=False)
+    results = searcher.search("chunk metadata", top_k=3, include_context=False)
     for i, result in enumerate(results, 1):
         print(f"   Result {i}: {result.file_path}:{result.start_line}-{result.end_line}")
         print(f"   Type: {result.chunk_type}, Name: {result.name}")
@@ -30,7 +48,7 @@ def test_context_retrieval():
 
     # Test 2: Search with context
    print("\n2. Search WITH context:")
-    results = searcher.search("chunk metadata", limit=3, include_context=True)
+    results = searcher.search("chunk metadata", top_k=3, include_context=True)
     for i, result in enumerate(results, 1):
         print(f"   Result {i}: {result.file_path}:{result.start_line}-{result.end_line}")
         print(f"   Type: {result.chunk_type}, Name: {result.name}")
diff --git a/tests/test_hybrid_search.py b/tests/test_hybrid_search.py
index 0d3f0fe..c3c526a 100644
--- a/tests/test_hybrid_search.py
+++ b/tests/test_hybrid_search.py
@@ -2,6 +2,13 @@
 """
 Test and benchmark the hybrid BM25 + semantic search system.
 Shows performance metrics and search quality comparisons.
+
+āš ļø IMPORTANT: This test requires the virtual environment to be activated:
+    source .venv/bin/activate
+    PYTHONPATH=. python tests/test_hybrid_search.py
+
+Or run directly with venv:
+    source .venv/bin/activate && PYTHONPATH=. 
python tests/test_hybrid_search.py """ import time @@ -16,7 +23,7 @@ from rich.syntax import Syntax from rich.progress import track from mini_rag.search import CodeSearcher, SearchResult -from mini_rag.embeddings import CodeEmbedder +from mini_rag.ollama_embeddings import OllamaEmbedder as CodeEmbedder console = Console() @@ -40,7 +47,7 @@ class SearchTester: if 'error' not in stats: console.print(f"[dim]Index contains {stats['total_chunks']} chunks from {stats['unique_files']} files[/dim]\n") - def run_query(self, query: str, limit: int = 10, + def run_query(self, query: str, top_k: int = 10, semantic_only: bool = False, bm25_only: bool = False) -> Dict[str, Any]: """Run a single query and return metrics.""" @@ -60,7 +67,7 @@ class SearchTester: start = time.time() results = self.searcher.search( query=query, - limit=limit, + top_k=top_k, semantic_weight=semantic_weight, bm25_weight=bm25_weight ) @@ -76,10 +83,10 @@ class SearchTester: 'avg_score': sum(r.score for r in results) / len(results) if results else 0, } - def compare_search_modes(self, query: str, limit: int = 5): + def compare_search_modes(self, query: str, top_k: int = 5): """Compare results across different search modes.""" console.print(f"\n[bold cyan]Query:[/bold cyan] '{query}'") - console.print(f"[dim]Top {limit} results per mode[/dim]\n") + console.print(f"[dim]Top {top_k} results per mode[/dim]\n") # Run searches in all modes modes = [ @@ -90,7 +97,7 @@ class SearchTester: all_results = {} for mode_name, semantic_only, bm25_only in modes: - result = self.run_query(query, limit, semantic_only, bm25_only) + result = self.run_query(query, top_k, semantic_only, bm25_only) all_results[mode_name] = result # Create comparison table @@ -191,7 +198,7 @@ class SearchTester: for test_case in test_queries: console.rule(f"\n[cyan]{test_case['description']}[/cyan]") console.print(f"[dim]{test_case['expected']}[/dim]") - self.compare_search_modes(test_case['query'], limit=3) + self.compare_search_modes(test_case['query'], top_k=3) time.sleep(0.5) # Brief pause between tests def benchmark_performance(self, num_queries: int = 50): @@ -268,7 +275,7 @@ class SearchTester: # Query that might return many results from same files query = "function implementation code search" - results = self.searcher.search(query, limit=20) + results = self.searcher.search(query, top_k=20) # Analyze diversity file_counts = {} diff --git a/tests/test_ollama_integration.py b/tests/test_ollama_integration.py index 4466d3a..65673bf 100755 --- a/tests/test_ollama_integration.py +++ b/tests/test_ollama_integration.py @@ -403,9 +403,9 @@ class TestOllamaIntegration(unittest.TestCase): # Check search config self.assertIsNotNone(self.config.search) - self.assertGreater(self.config.search.default_limit, 0) + self.assertGreater(self.config.search.default_top_k, 0) print(f" āœ… Search config valid") - print(f" Default limit: {self.config.search.default_limit}") + print(f" Default top-k: {self.config.search.default_top_k}") print(f" Query expansion: {self.config.search.expand_queries}") diff --git a/tests/test_rag_integration.py b/tests/test_rag_integration.py index 7dae3d5..00313e8 100644 --- a/tests/test_rag_integration.py +++ b/tests/test_rag_integration.py @@ -1,12 +1,32 @@ #!/usr/bin/env python3 -"""Test RAG system integration with smart chunking.""" +""" +Test RAG system integration with smart chunking. + +āš ļø IMPORTANT: This test requires the virtual environment to be activated: + source .venv/bin/activate + PYTHONPATH=. 
python tests/test_rag_integration.py + +Or run directly with venv: + source .venv/bin/activate && PYTHONPATH=. python tests/test_rag_integration.py +""" import tempfile import shutil +import os from pathlib import Path from mini_rag.indexer import ProjectIndexer from mini_rag.search import CodeSearcher +# Check if virtual environment is activated +def check_venv(): + if 'VIRTUAL_ENV' not in os.environ: + print("āš ļø WARNING: Virtual environment not detected!") + print(" This test requires the virtual environment to be activated.") + print(" Run: source .venv/bin/activate && PYTHONPATH=. python tests/test_rag_integration.py") + print(" Continuing anyway...\n") + +check_venv() + # Sample Python file with proper structure sample_code = '''""" Sample module for testing RAG system. @@ -179,8 +199,8 @@ def test_integration(): stats = indexer.index_project() print(f" - Files indexed: {stats['files_indexed']}") - print(f" - Total chunks: {stats['total_chunks']}") - print(f" - Indexing time: {stats['indexing_time']:.2f}s") + print(f" - Total chunks: {stats['chunks_created']}") + print(f" - Indexing time: {stats['time_taken']:.2f}s") # Verify chunks were created properly print("\n2. Verifying chunk metadata...") @@ -195,10 +215,10 @@ def test_integration(): results = searcher.search("data processor class unified interface", top_k=3) print(f"\n Test 1 - Class search:") for i, result in enumerate(results[:1]): - print(f" - Match {i+1}: {result['file_path']}") - print(f" Chunk type: {result['chunk_type']}") - print(f" Score: {result['score']:.3f}") - if 'This class handles' in result['content']: + print(f" - Match {i+1}: {result.file_path}") + print(f" Chunk type: {result.chunk_type}") + print(f" Score: {result.score:.3f}") + if 'This class handles' in result.content: print(" [OK] Docstring included with class") else: print(" [FAIL] Docstring not found") @@ -207,10 +227,10 @@ def test_integration(): results = searcher.search("process list of data items", top_k=3) print(f"\n Test 2 - Method search:") for i, result in enumerate(results[:1]): - print(f" - Match {i+1}: {result['file_path']}") - print(f" Chunk type: {result['chunk_type']}") - print(f" Parent class: {result.get('parent_class', 'N/A')}") - if 'Args:' in result['content'] and 'Returns:' in result['content']: + print(f" - Match {i+1}: {result.file_path}") + print(f" Chunk type: {result.chunk_type}") + print(f" Parent class: {getattr(result, 'parent_class', 'N/A')}") + if 'Args:' in result.content and 'Returns:' in result.content: print(" [OK] Docstring included with method") else: print(" [FAIL] Method docstring not complete") @@ -219,19 +239,19 @@ def test_integration(): results = searcher.search("smart chunking capabilities markdown", top_k=3) print(f"\n Test 3 - Markdown search:") for i, result in enumerate(results[:1]): - print(f" - Match {i+1}: {result['file_path']}") - print(f" Chunk type: {result['chunk_type']}") - print(f" Lines: {result['start_line']}-{result['end_line']}") + print(f" - Match {i+1}: {result.file_path}") + print(f" Chunk type: {result.chunk_type}") + print(f" Lines: {result.start_line}-{result.end_line}") # Test 4: Verify chunk navigation print(f"\n Test 4 - Chunk navigation:") all_results = searcher.search("", top_k=100) # Get all chunks - py_chunks = [r for r in all_results if r['file_path'].endswith('.py')] + py_chunks = [r for r in all_results if r.file_path.endswith('.py')] if py_chunks: first_chunk = py_chunks[0] - print(f" - First chunk: index={first_chunk.get('chunk_index', 'N/A')}") - print(f" Next chunk ID: 
{first_chunk.get('next_chunk_id', 'N/A')}")
+        print(f"   - First chunk: index={getattr(first_chunk, 'chunk_index', 'N/A')}")
+        print(f"     Next chunk ID: {getattr(first_chunk, 'next_chunk_id', 'N/A')}")
 
         # Verify chain
         valid_chain = True
@@ -239,7 +259,7 @@ def test_integration():
             curr = py_chunks[i]
             next_chunk = py_chunks[i + 1]
             expected_next = f"processor_{i+1}"
-            if curr.get('next_chunk_id') != expected_next:
+            if getattr(curr, 'next_chunk_id', None) != expected_next:
                 valid_chain = False
                 break