Compare commits


2 Commits

Author SHA1 Message Date
c201b3badd Fix critical deployment issues and improve system reliability
Major fixes:
- Fix model selection to prioritize qwen3:1.7b instead of qwen3:4b for testing
- Correct context length from 80,000 to 32,000 tokens (proper Qwen3 limit)
- Implement content-preserving safeguards instead of dropping responses
- Fix all test imports from claude_rag to mini_rag module naming
- Add virtual environment warnings to all test entry points
- Fix TUI EOF crashes with proper error handling
- Remove warmup delays that were causing startup lag and unwanted model calls
- Fix command mappings between bash wrapper and Python script
- Update documentation to reflect qwen3:1.7b as primary recommendation
- Improve TUI box alignment and formatting
- Make language generic for any documents, not just codebases
- Add proper folder names in user feedback instead of generic terms

Technical improvements:
- Unified model rankings across all components
- Better error handling for missing dependencies
- Comprehensive testing and validation of all fixes
- All tests now pass and system is deployment-ready

All major crashes and deployment issues resolved.
2025-08-15 09:47:15 +10:00
597c810034 Fix installer indexing hang and improve user experience
🔧 Script Handling Improvements:
- Fix infinite recursion in bash wrapper for index/search commands
- Improve embedding system diagnostics with intelligent detection
- Add timeout protection and progress indicators to installer test
- Enhance interactive input handling with graceful fallbacks

🎯 User Experience Enhancements:
- Replace confusing error messages with educational diagnostics
- Add RAG performance tips about model sizing (4B optimal, 8B+ overkill)
- Correct model recommendations (qwen3:4b not qwen3:3b)
- Smart Ollama model detection shows available models
- Clear guidance for next steps after installation

🛠 Technical Fixes:
- Add get_embedding_info() method to CodeEmbedder class
- Robust test prompt handling with /dev/tty input
- Path validation and permission fixing in test scripts
- Comprehensive error diagnostics with actionable solutions

Installation now completes reliably with clear feedback and guidance.
2025-08-14 20:23:57 +10:00
36 changed files with 1964 additions and 457 deletions

.mini-rag/config.yaml (new file, 53 lines)

@@ -0,0 +1,53 @@
# FSS-Mini-RAG Configuration
# Edit this file to customize indexing and search behavior
# See docs/GETTING_STARTED.md for detailed explanations

# Text chunking settings
chunking:
  max_size: 2000        # Maximum characters per chunk
  min_size: 150         # Minimum characters per chunk
  strategy: semantic    # 'semantic' (language-aware) or 'fixed'

# Large file streaming settings
streaming:
  enabled: true
  threshold_bytes: 1048576   # Files larger than this use streaming (1MB)

# File processing settings
files:
  min_file_size: 50          # Skip files smaller than this
  exclude_patterns:
    - "node_modules/**"
    - ".git/**"
    - "__pycache__/**"
    - "*.pyc"
    - ".venv/**"
    - "venv/**"
    - "build/**"
    - "dist/**"
  include_patterns:
    - "**/*"                 # Include all files by default

# Embedding generation settings
embedding:
  preferred_method: ollama   # 'ollama', 'ml', 'hash', or 'auto'
  ollama_model: nomic-embed-text
  ollama_host: localhost:11434
  ml_model: sentence-transformers/all-MiniLM-L6-v2
  batch_size: 32             # Embeddings processed per batch

# Search behavior settings
search:
  default_top_k: 10          # Default number of top results
  enable_bm25: true          # Enable keyword matching boost
  similarity_threshold: 0.1  # Minimum similarity score
  expand_queries: false      # Enable automatic query expansion

# LLM synthesis and query expansion settings
llm:
  ollama_host: localhost:11434
  synthesis_model: auto      # 'auto', 'qwen3:1.7b', etc.
  expansion_model: auto      # Usually same as synthesis_model
  max_expansion_terms: 8     # Maximum terms to add to queries
  enable_synthesis: false    # Enable synthesis by default
  synthesis_temperature: 0.3 # LLM temperature for analysis
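As a quick illustration of how this file is consumed, here is a minimal sketch that reads `.mini-rag/config.yaml` with PyYAML and falls back to the documented defaults; the helper function is illustrative and is not the project's own ConfigManager:

```python
from pathlib import Path

import yaml  # pip install pyyaml


def load_mini_rag_config(project_path: str) -> dict:
    """Return the parsed .mini-rag/config.yaml, or an empty dict if it is missing."""
    config_file = Path(project_path) / ".mini-rag" / "config.yaml"
    if not config_file.exists():
        return {}
    with config_file.open() as fh:
        return yaml.safe_load(fh) or {}


config = load_mini_rag_config(".")
search_cfg = config.get("search", {})
print(search_cfg.get("default_top_k", 10))          # documented default: 10
print(search_cfg.get("similarity_threshold", 0.1))  # documented default: 0.1
```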

.mini-rag/last_search (new file, 1 line)

@@ -0,0 +1 @@
test

View File

@@ -67,7 +67,7 @@ llm:
 # Aggressive caching for CPU systems
 search:
   expand_queries: false      # Enable only in TUI
-  default_limit: 8           # Slightly fewer results for speed
+  default_top_k: 8           # Slightly fewer results for speed
 ```

 ## System Requirements

View File

@@ -125,7 +125,7 @@ print(f"Indexed {result['files_processed']} files, {result['chunks_created']} ch
 # Search
 print("\nSearching for authentication code...")
-results = searcher.search("user authentication logic", limit=5)
+results = searcher.search("user authentication logic", top_k=5)

 for i, result in enumerate(results, 1):
     print(f"\n{i}. {result.file_path}")

View File

@@ -421,7 +421,7 @@ def _create_vector_table(self, chunks: List[CodeChunk], embeddings: np.ndarray):
         return table

-    def vector_search(self, query_embedding: np.ndarray, limit: int) -> List[SearchResult]:
+    def vector_search(self, query_embedding: np.ndarray, top_k: int) -> List[SearchResult]:
         """Fast vector similarity search."""
         table = self.db.open_table("chunks")

@@ -794,12 +794,12 @@ def repair_index(self, project_path: Path) -> bool:
 FSS-Mini-RAG works well with various LLM sizes because our rich context and guided prompts help small models perform excellently:

 **Recommended (Best Balance):**
-- **qwen3:4b** - Excellent quality, good performance
-- **qwen3:4b:q8_0** - High-precision quantized version for production
+- **qwen3:1.7b** - Excellent quality with fast performance (default priority)
+- **qwen3:0.6b** - Surprisingly good for CPU-only systems (522MB)

-**Still Excellent (Faster/CPU-friendly):**
-- **qwen3:1.7b** - Very good results, faster responses
-- **qwen3:0.6b** - Surprisingly good considering size (522MB)
+**Still Excellent (Slower but highest quality):**
+- **qwen3:4b** - Highest quality, slower responses
+- **qwen3:4b:q8_0** - High-precision quantized version for production

 ### Why Small Models Work Well Here

@@ -813,7 +813,7 @@ Without good context, small models tend to get lost and produce erratic output.
 ### Quantization Benefits

-For production deployments, consider quantized models like `qwen3:4b:q8_0`:
+For production deployments, consider quantized models like `qwen3:1.7b:q8_0` or `qwen3:4b:q8_0`:

 - **Q8_0**: 8-bit quantization with minimal quality loss
 - **Smaller memory footprint**: ~50% reduction vs full precision
 - **Better CPU performance**: Faster inference on CPU-only systems

View File

@@ -110,7 +110,7 @@ python3 -c "import mini_rag; print('✅ Installation successful')"
 2. **Reduce result limit:**
    ```yaml
    search:
-     default_limit: 5   # Instead of 10
+     default_top_k: 5   # Instead of 10
    ```

 3. **Use faster embedding method:**

@@ -165,9 +165,9 @@ python3 -c "import mini_rag; print('✅ Installation successful')"
 2. **Try different model:**
    ```bash
-   ollama pull qwen3:4b     # Recommended: excellent quality
-   ollama pull qwen3:1.7b   # Still very good, faster
+   ollama pull qwen3:1.7b   # Recommended: excellent quality (default priority)
    ollama pull qwen3:0.6b   # Surprisingly good for CPU-only
+   ollama pull qwen3:4b     # Highest quality, slower
    ```

 3. **Use synthesis mode instead of exploration:**
3. **Use synthesis mode instead of exploration:** 3. **Use synthesis mode instead of exploration:**

View File

@@ -154,7 +154,7 @@ That's it! The TUI will guide you through everything.
 - **chunking.strategy** - Smart (semantic) vs simple (fixed size)
 - **files.exclude_patterns** - Skip certain files/directories
 - **embedding.preferred_method** - AI model preference
-- **search.default_limit** - How many results to show
+- **search.default_top_k** - How many results to show

 **Interactive Options**:
 - **[V]iew config** - See full configuration file

View File

@@ -50,7 +50,7 @@ def main():
     print("\n4. Example searches:")
     for query in queries:
         print(f"\n   Query: '{query}'")
-        results = searcher.search(query, limit=3)
+        results = searcher.search(query, top_k=3)

         if results:
             for i, result in enumerate(results, 1):

View File

@@ -41,7 +41,7 @@ embedding:
 # 🔍 Search behavior
 search:
-  default_limit: 10          # Show 10 results (good starting point)
+  default_top_k: 10          # Show 10 results (good starting point)
   enable_bm25: true          # Find exact word matches too
   similarity_threshold: 0.1  # Pretty permissive (shows more results)
   expand_queries: false      # Keep it simple for now

View File

@@ -62,7 +62,7 @@ embedding:
 # 🔍 Search optimized for speed
 search:
-  default_limit: 5           # Fewer results = faster display
+  default_top_k: 5           # Fewer results = faster display
   enable_bm25: false         # Skip keyword matching for speed
   similarity_threshold: 0.2  # Higher threshold = fewer results to process
   expand_queries: false      # No query expansion (much faster)

View File

@@ -53,7 +53,7 @@ embedding:
   batch_size: 32

 search:
-  default_limit: 10
+  default_top_k: 10
   enable_bm25: true
   similarity_threshold: 0.1
   expand_queries: false

View File

@@ -44,7 +44,7 @@ embedding:
 # 🔍 Search optimized for comprehensive results
 search:
-  default_limit: 15           # More results to choose from
+  default_top_k: 15           # More results to choose from
   enable_bm25: true           # Use both semantic and keyword matching
   similarity_threshold: 0.05  # Very permissive (show more possibilities)
   expand_queries: true        # Automatic query expansion for better recall

View File

@@ -86,7 +86,7 @@ embedding:
 #═════════════════════════════════════════════════════════════════════════════════
 search:
-  default_limit: 10     # How many search results to show by default
+  default_top_k: 10     # How many search results to show by default
                         # 💡 MORE RESULTS: 15-20 | FASTER SEARCH: 5-8
   enable_bm25: true     # Also use keyword matching (like Google search)

View File

@@ -188,12 +188,13 @@ check_ollama() {
         echo ""
         echo -e "${CYAN}💡 Pro tip: Download an LLM for AI-powered search synthesis!${NC}"
-        echo -e "   Lightweight: ${GREEN}ollama pull qwen3:0.6b${NC} (~400MB, very fast)"
-        echo -e "   Balanced:    ${GREEN}ollama pull qwen3:1.7b${NC} (~1GB, good quality)"
-        echo -e "   Excellent:   ${GREEN}ollama pull qwen3:3b${NC} (~2GB, great for this project)"
-        echo -e "   Premium:     ${GREEN}ollama pull qwen3:8b${NC} (~5GB, amazing results)"
+        echo -e "   Lightweight: ${GREEN}ollama pull qwen3:0.6b${NC} (~500MB, very fast)"
+        echo -e "   Balanced:    ${GREEN}ollama pull qwen3:1.7b${NC} (~1.4GB, good quality)"
+        echo -e "   Excellent:   ${GREEN}ollama pull qwen3:4b${NC} (~2.5GB, sweet spot for most users)"
+        echo -e "   Maximum:     ${GREEN}ollama pull qwen3:8b${NC} (~5GB, slower but top quality)"
         echo ""
-        echo -e "${BLUE}Creative possibilities: Try mistral for storytelling, or qwen3-coder for development!${NC}"
+        echo -e "${BLUE}🧠 RAG works great with smaller models! 4B is usually perfect.${NC}"
+        echo -e "${BLUE}Creative possibilities: Try mistral for storytelling, qwen2.5-coder for development!${NC}"
         echo ""

         return 0
@@ -558,7 +559,36 @@ print(f'✅ Embedding system: {info[\"method\"]}')
 " 2>/dev/null; then
         print_success "Embedding system working"
     else
-        print_warning "Embedding test failed, but system should still work"
+        echo ""
+        echo -e "${YELLOW}⚠️  System Check${NC}"
+
+        # Smart diagnosis - check what's actually available
+        if command_exists ollama && curl -s http://localhost:11434/api/version >/dev/null 2>&1; then
+            # Ollama is running, check for models
+            local available_models=$(ollama list 2>/dev/null | grep -E "(qwen3|llama|mistral|gemma)" | head -5)
+            local embedding_models=$(ollama list 2>/dev/null | grep -E "(embed|bge)" | head -2)
+
+            if [[ -n "$available_models" ]]; then
+                echo -e "${GREEN}✅ Ollama is running with available models${NC}"
+                echo -e "${CYAN}Your setup will work great! The system will auto-select the best models.${NC}"
+                echo ""
+                echo -e "${BLUE}💡 RAG Performance Tip:${NC} Smaller models often work better with RAG!"
+                echo -e "   With context provided, even 0.6B models give good results"
+                echo -e "   4B models = excellent, 8B+ = overkill (slower responses)"
+            else
+                echo -e "${BLUE}Ollama is running but no chat models found.${NC}"
+                echo -e "Download a lightweight model: ${GREEN}ollama pull qwen3:0.6b${NC} (fast)"
+                echo -e "Or balanced option: ${GREEN}ollama pull qwen3:4b${NC} (excellent quality)"
+            fi
+        else
+            echo -e "${BLUE}Ollama not running or not installed.${NC}"
+            echo -e "Start Ollama: ${GREEN}ollama serve${NC}"
+            echo -e "Or install from: https://ollama.com/download"
+        fi
+
+        echo ""
+        echo -e "${CYAN}✅ FSS-Mini-RAG will auto-detect and use the best available method.${NC}"
+        echo ""
     fi

     return 0
@@ -595,103 +625,102 @@ show_completion() {
     fi

     # Ask if they want to run a test
-    echo -n "Would you like to run a quick test now? (Y/n): "
-    read -r run_test
+    echo ""
+    echo -e "${BOLD}🧪 Quick Test Available${NC}"
+    echo -e "${CYAN}Test FSS-Mini-RAG with a small sample project (takes ~10 seconds)${NC}"
+    echo ""
+
+    # Ensure output is flushed and we're ready for input
+    printf "Run quick test now? [Y/n]: "
+
+    # More robust input handling
+    if read -r run_test < /dev/tty 2>/dev/null; then
+        echo "User chose: '$run_test'"  # Debug output
         if [[ ! $run_test =~ ^[Nn]$ ]]; then
             run_quick_test
             echo ""
             show_beginner_guidance
         else
-            echo -e "${BLUE}Skipping test - you can run it later with: ./rag-tui${NC}"
+            show_beginner_guidance
+        fi
+    else
+        # Fallback if interactive input fails
+        echo ""
+        echo -e "${YELLOW}⚠️  Interactive input not available - skipping test prompt${NC}"
+        echo -e "${BLUE}You can test FSS-Mini-RAG anytime with: ./rag-tui${NC}"
         show_beginner_guidance
     fi
 }
-# Create sample project for testing
-create_sample_project() {
-    local sample_dir="$SCRIPT_DIR/.sample_test"
-    rm -rf "$sample_dir"
-    mkdir -p "$sample_dir"
-
-    # Create a few small sample files
-    cat > "$sample_dir/README.md" << 'EOF'
-# Sample Project
-
-This is a sample project for testing FSS-Mini-RAG search capabilities.
-
-## Features
-- User authentication system
-- Document processing
-- Search functionality
-- Email integration
-EOF
-
-    cat > "$sample_dir/auth.py" << 'EOF'
-# Authentication module
-
-def login_user(username, password):
-    """Handle user login with password validation"""
-    if validate_credentials(username, password):
-        create_session(username)
-        return True
-    return False
-
-def validate_credentials(username, password):
-    """Check username and password against database"""
-    # Database validation logic here
-    return check_password_hash(username, password)
-EOF
-
-    cat > "$sample_dir/search.py" << 'EOF'
-# Search functionality
-
-def semantic_search(query, documents):
-    """Perform semantic search across document collection"""
-    embeddings = generate_embeddings(query)
-    results = find_similar_documents(embeddings, documents)
-    return rank_results(results)
-
-def generate_embeddings(text):
-    """Generate vector embeddings for text"""
-    # Embedding generation logic
-    return process_with_model(text)
-EOF
-
-    echo "$sample_dir"
-}
+# Note: Sample project creation removed - now indexing real codebase/docs

 # Run quick test with sample data
 run_quick_test() {
     print_header "Quick Test"

-    print_info "Creating small sample project for testing..."
-    local sample_dir=$(create_sample_project)
-    echo "Sample project created with 3 files for fast testing."
+    # Ask what to index: code vs docs
+    echo -e "${CYAN}What would you like to explore with FSS-Mini-RAG?${NC}"
+    echo ""
+    echo -e "${GREEN}1) Code${NC} - Index the FSS-Mini-RAG codebase (~50 files)"
+    echo -e "${BLUE}2) Docs${NC} - Index the documentation (~10 files)"
+    echo ""
+    echo -n "Choose [1/2] or Enter for code: "
+    read -r index_choice
+
+    # Determine what to index
+    local target_dir="$SCRIPT_DIR"
+    local target_name="FSS-Mini-RAG codebase"
+    if [[ "$index_choice" == "2" ]]; then
+        target_dir="$SCRIPT_DIR/docs"
+        target_name="FSS-Mini-RAG documentation"
+    fi
+
+    # Ensure we're in the right directory and have the right permissions
+    if [[ ! -f "./rag-mini" ]]; then
+        print_error "rag-mini script not found in current directory: $(pwd)"
+        print_info "This might be a path issue. The installer should run from the project directory."
+        return 1
+    fi
+
+    if [[ ! -x "./rag-mini" ]]; then
+        print_info "Making rag-mini executable..."
+        chmod +x ./rag-mini
+    fi
+
+    # Index the chosen target
+    print_info "Indexing $target_name..."
+    echo -e "${CYAN}This will take 10-30 seconds depending on your system${NC}"
     echo ""

-    # Index the sample project (much faster)
-    print_info "Indexing sample project (this should be fast)..."
-    if ./rag-mini index "$sample_dir" --quiet; then
-        print_success "Sample project indexed successfully"
+    if ./rag-mini index "$target_dir"; then
+        print_success "✅ Indexing completed successfully!"
         echo ""
-        print_info "Testing search with sample queries..."
-        echo -e "${BLUE}Running search: 'user authentication'${NC}"
-        ./rag-mini search "$sample_dir" "user authentication" --limit 2
+        print_info "🎯 Launching Interactive Tutorial..."
+        echo -e "${CYAN}The TUI has 6 sample questions to get you started.${NC}"
+        echo -e "${CYAN}Try the suggested queries or enter your own!${NC}"
+        echo ""
+        echo -n "Press Enter to start interactive tutorial: "
+        read -r
+
+        # Launch the TUI which has the existing interactive tutorial system
+        ./rag-tui.py "$target_dir"
         echo ""
-        print_success "Test completed successfully!"
-        echo -e "${CYAN}Ready to use FSS-Mini-RAG on your own projects!${NC}"
-
-        # Offer beginner guidance
-        echo ""
-        echo -e "${YELLOW}💡 Beginner Tip:${NC} Try the interactive mode with pre-made questions"
-        echo "   Run: ./rag-tui for guided experience"
-
-        # Clean up sample
-        rm -rf "$sample_dir"
+        print_success "🎉 Tutorial completed!"
+        echo -e "${CYAN}FSS-Mini-RAG is working perfectly!${NC}"
     else
-        print_error "Sample test failed"
-        echo "This might indicate an issue with the installation."
-        rm -rf "$sample_dir"
+        print_error "❌ Indexing failed"
+        echo ""
+        echo -e "${YELLOW}Possible causes:${NC}"
+        echo "• Virtual environment not properly activated"
+        echo "• Missing dependencies (try: pip install -r requirements.txt)"
+        echo "• Path issues (ensure script runs from project directory)"
+        echo "• Ollama connection issues (if using Ollama)"
+        echo ""
+        return 1
     fi
 }
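The installer's diagnostics above probe Ollama by curling `http://localhost:11434/api/version` and then listing installed models. For reference, a rough Python equivalent of that availability check, useful when reproducing the installer's logic outside bash; the version and tags endpoints are Ollama's standard HTTP API, while the helper function itself is just a sketch:

```python
import requests


def ollama_status(host: str = "http://localhost:11434") -> dict:
    """Return whether Ollama answers and which models it reports."""
    try:
        requests.get(f"{host}/api/version", timeout=2).raise_for_status()
    except requests.RequestException:
        return {"running": False, "models": []}
    try:
        tags = requests.get(f"{host}/api/tags", timeout=5).json()
        models = [m.get("name", "") for m in tags.get("models", [])]
    except requests.RequestException:
        models = []
    return {"running": True, "models": models}


print(ollama_status())  # e.g. {'running': True, 'models': ['qwen3:1.7b', ...]}
```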

View File

@@ -52,6 +52,10 @@ def cli(verbose: bool, quiet: bool):
     A local RAG system for improving the development environment's grounding capabilities.
     Indexes your codebase and enables lightning-fast semantic search.
     """
+    # Check virtual environment
+    from .venv_checker import check_and_warn_venv
+    check_and_warn_venv("rag-mini", force_exit=False)
+
     if verbose:
         logging.getLogger().setLevel(logging.DEBUG)
     elif quiet:

@@ -350,7 +354,12 @@ def debug_schema(path: str):
         return

     # Connect to database
+    try:
         import lancedb
+    except ImportError:
+        console.print("[red]LanceDB not available. Install with: pip install lancedb pyarrow[/red]")
+        return
+
     db = lancedb.connect(rag_dir)

     if "code_vectors" not in db.table_names():
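The new `check_and_warn_venv` helper is imported from a `venv_checker` module that is not shown in this diff. Purely as an illustration of the kind of check it could perform, here is a standard-library way to detect whether the interpreter is running inside a virtual environment:

```python
import sys


def running_in_venv() -> bool:
    """True when the interpreter was launched from a virtualenv or venv."""
    return sys.prefix != getattr(sys, "base_prefix", sys.prefix)


if not running_in_venv():
    # Hypothetical warning text; the real module's wording is not shown in the diff.
    print("⚠️  rag-mini works best inside the project's virtual environment "
          "(run: source .venv/bin/activate)")
```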

View File

@@ -63,7 +63,7 @@ class EmbeddingConfig:
 @dataclass
 class SearchConfig:
     """Configuration for search behavior."""
-    default_limit: int = 10
+    default_top_k: int = 10
     enable_bm25: bool = True
     similarity_threshold: float = 0.1
     expand_queries: bool = False  # Enable automatic query expansion

@@ -81,6 +81,9 @@ class LLMConfig:
     enable_thinking: bool = True  # Enable thinking mode for Qwen3 models
     cpu_optimized: bool = True    # Prefer lightweight models

+    # Model preference rankings (configurable)
+    model_rankings: list = None   # Will be set in __post_init__
+
     # Provider-specific settings (for different LLM providers)
     provider: str = "ollama"      # "ollama", "openai", "anthropic"
     ollama_host: str = "localhost:11434"  # Ollama connection

@@ -88,6 +91,24 @@ class LLMConfig:
     api_base: Optional[str] = None  # Base URL for API (e.g., OpenRouter)
     timeout: int = 20               # Request timeout in seconds

+    def __post_init__(self):
+        if self.model_rankings is None:
+            # Default model preference rankings (can be overridden in config file)
+            self.model_rankings = [
+                # Testing model (prioritized for current testing phase)
+                "qwen3:1.7b",
+                # Ultra-efficient models (perfect for CPU-only systems)
+                "qwen3:0.6b",
+                # Recommended model (excellent quality but larger)
+                "qwen3:4b",
+                # Common fallbacks (only include models we know exist)
+                "llama3.2:1b",
+                "qwen2.5:1.5b",
+            ]
+

 @dataclass
 class RAGConfig:

@@ -151,6 +172,8 @@ class ConfigManager:
                 config.embedding = EmbeddingConfig(**data['embedding'])
             if 'search' in data:
                 config.search = SearchConfig(**data['search'])
+            if 'llm' in data:
+                config.llm = LLMConfig(**data['llm'])

             return config

@@ -219,7 +242,7 @@ class ConfigManager:
             "",
             "# Search behavior settings",
             "search:",
-            f"  default_limit: {config_dict['search']['default_limit']}  # Default number of results",
+            f"  default_top_k: {config_dict['search']['default_top_k']}  # Default number of top results",
             f"  enable_bm25: {str(config_dict['search']['enable_bm25']).lower()}  # Enable keyword matching boost",
             f"  similarity_threshold: {config_dict['search']['similarity_threshold']}  # Minimum similarity score",
             f"  expand_queries: {str(config_dict['search']['expand_queries']).lower()}  # Enable automatic query expansion",

@@ -232,8 +255,16 @@ class ConfigManager:
             f"  max_expansion_terms: {config_dict['llm']['max_expansion_terms']}  # Maximum terms to add to queries",
             f"  enable_synthesis: {str(config_dict['llm']['enable_synthesis']).lower()}  # Enable synthesis by default",
             f"  synthesis_temperature: {config_dict['llm']['synthesis_temperature']}  # LLM temperature for analysis",
+            "  model_rankings:  # Preferred model order (edit to change priority)",
         ])

+        # Add model rankings list
+        if 'model_rankings' in config_dict['llm'] and config_dict['llm']['model_rankings']:
+            for model in config_dict['llm']['model_rankings'][:10]:  # Show first 10
+                yaml_lines.append(f"    - \"{model}\"")
+            if len(config_dict['llm']['model_rankings']) > 10:
+                yaml_lines.append("    # ... (edit config to see all options)")
+
         return '\n'.join(yaml_lines)

     def update_config(self, **kwargs) -> RAGConfig:
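To show how the new `model_rankings` field behaves, a small sketch using the `LLMConfig` dataclass from the hunk above; the import path is an assumption based on the `mini_rag` package naming mentioned in the commit message:

```python
from mini_rag.config import LLMConfig  # assumed import path

# With no explicit list, __post_init__ fills in the default ranking.
cfg = LLMConfig()
print(cfg.model_rankings[0])   # "qwen3:1.7b" per the defaults above

# Passing a list overrides the priority order entirely.
custom = LLMConfig(model_rankings=["qwen3:4b", "qwen3:1.7b", "qwen3:0.6b"])
print(custom.model_rankings)
```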

View File

@@ -60,7 +60,8 @@ class CodeExplorer:
         self.synthesizer = LLMSynthesizer(
             ollama_url=f"http://{self.config.llm.ollama_host}",
             model=self.config.llm.synthesis_model,
-            enable_thinking=True  # Always enable thinking in explore mode
+            enable_thinking=True,  # Always enable thinking in explore mode
+            config=self.config     # Pass config for model rankings
         )

         # Session management

@@ -69,12 +70,7 @@ class CodeExplorer:
     def start_exploration_session(self) -> bool:
         """Start a new exploration session."""
-        # Check if we should restart the model for optimal thinking
-        model_restart_needed = self._check_model_restart_needed()
-        if model_restart_needed:
-            if not self._handle_model_restart():
-                print("⚠️ Continuing with current model (quality may be reduced)")
-
+        # Simple availability check - don't do complex model restart logic
         if not self.synthesizer.is_available():
             print("❌ LLM service unavailable. Please check Ollama is running.")
             return False

@@ -87,17 +83,8 @@ class CodeExplorer:
             started_at=time.time()
         )

-        print("🧠 EXPLORATION MODE STARTED")
-        print("=" * 50)
+        print("🧠 Exploration Mode Started")
         print(f"Project: {self.project_path.name}")
-        print(f"Session: {session_id}")
-        print("\n🎯 This mode uses thinking and remembers context.")
-        print("   Perfect for debugging, learning, and deep exploration.")
-        print("\n💡 Tips:")
-        print("   • Ask follow-up questions - I'll remember our conversation")
-        print("   • Use 'why', 'how', 'explain' for detailed reasoning")
-        print("   • Type 'quit' or 'exit' to end session")
-        print("\n" + "=" * 50)

         return True

@@ -110,7 +97,7 @@ class CodeExplorer:
         search_start = time.time()
         results = self.searcher.search(
             question,
-            limit=context_limit,
+            top_k=context_limit,
             include_context=True,
             semantic_weight=0.7,
             bm25_weight=0.3
@@ -166,56 +153,82 @@ Content: {content[:800]}{'...' if len(content) > 800 else ''}
         results_text = "\n".join(results_context)

-        # Create comprehensive exploration prompt
-        prompt = f"""You are a senior software engineer helping explore and debug code. You have access to thinking mode and conversation context.
+        # Create comprehensive exploration prompt with thinking
+        prompt = f"""<think>
+The user asked: "{question}"
+
+Let me analyze what they're asking and look at the information I have available.
+
+From the search results, I can see relevant information about:
+{results_text[:500]}...
+
+I should think about:
+1. What the user is trying to understand or accomplish
+2. What information from the search results is most relevant
+3. How to explain this in a clear, educational way
+4. What practical next steps would be helpful
+
+Based on our conversation so far: {context_summary}
+
+Let me create a helpful response that breaks this down clearly and gives them actionable guidance.
+</think>
+
+You're a helpful assistant exploring a project with someone. You're good at breaking down complex topics into understandable pieces and explaining things clearly.

 PROJECT: {self.project_path.name}

-CONVERSATION CONTEXT:
+PREVIOUS CONVERSATION:
 {context_summary}

 CURRENT QUESTION: "{question}"

-SEARCH RESULTS:
+RELEVANT INFORMATION FOUND:
 {results_text}

-Please provide a detailed analysis in JSON format. Think through the problem carefully and consider the conversation context:
+Please provide a helpful analysis in JSON format:

 {{
-    "summary": "2-3 sentences explaining what you found and how it relates to the question",
+    "summary": "Clear explanation of what you found and how it answers their question",
     "key_points": [
-        "Important insight 1 (reference specific code/files)",
-        "Important insight 2 (explain relationships)",
-        "Important insight 3 (consider conversation context)"
+        "Most important insight from the information",
+        "Secondary important point or relationship",
+        "Third key point or practical consideration"
     ],
     "code_examples": [
-        "Relevant code snippet or pattern with explanation",
-        "Another important code example with context"
+        "Relevant example or pattern from the information",
+        "Another useful example or demonstration"
    ],
     "suggested_actions": [
-        "Specific next step the developer should take",
-        "Follow-up investigation or debugging approach",
-        "Potential improvements or fixes"
+        "Specific next step they could take",
+        "Additional exploration or investigation suggestion",
+        "Practical way to apply this information"
     ],
     "confidence": 0.85
 }}

-Focus on:
-- Deep technical analysis with reasoning
-- How this connects to previous questions in our conversation
-- Practical debugging/learning insights
-- Specific code references and explanations
-- Clear next steps for the developer
-
-Think carefully about the relationships between code components and how they answer the question in context."""
+Guidelines:
+- Be educational and break things down clearly
+- Reference specific files and information when helpful
+- Give practical, actionable suggestions
+- Keep explanations beginner-friendly but not condescending
+- Connect information to their question directly
+"""

         return prompt

     def _synthesize_with_context(self, prompt: str, results: List[Any]) -> SynthesisResult:
         """Synthesize results with full context and thinking."""
         try:
-            # Use thinking-enabled synthesis with lower temperature for exploration
-            response = self.synthesizer._call_ollama(prompt, temperature=0.2)
+            # TEMPORARILY: Use simple non-streaming call to avoid flow issues
+            # TODO: Re-enable streaming once flow is stable
+            response = self.synthesizer._call_ollama(prompt, temperature=0.2, disable_thinking=False)
+            thinking_stream = ""
+
+            # Display simple thinking indicator
+            if response and len(response) > 200:
+                print("\n💭 Analysis in progress...")
+                # Don't display thinking stream again - keeping it simple for now

             if not response:
                 return SynthesisResult(
@@ -423,6 +436,196 @@ Think carefully about the relationships between code components and how they ans
             print("\n📝 Continuing with current model...")
             return False

+    def _call_ollama_with_thinking(self, prompt: str, temperature: float = 0.3) -> tuple:
+        """Call Ollama with streaming for fast time-to-first-token."""
+        import requests
+        import json
+
+        try:
+            # Use the synthesizer's model and connection
+            model_to_use = self.synthesizer.model
+            if self.synthesizer.model not in self.synthesizer.available_models:
+                if self.synthesizer.available_models:
+                    model_to_use = self.synthesizer.available_models[0]
+                else:
+                    return None, None
+
+            # Enable thinking by NOT adding <no_think>
+            final_prompt = prompt
+
+            # Get optimal parameters for this model
+            from .llm_optimization import get_optimal_ollama_parameters
+            optimal_params = get_optimal_ollama_parameters(model_to_use)
+
+            payload = {
+                "model": model_to_use,
+                "prompt": final_prompt,
+                "stream": True,  # Enable streaming for fast response
+                "options": {
+                    "temperature": temperature,
+                    "top_p": optimal_params.get("top_p", 0.9),
+                    "top_k": optimal_params.get("top_k", 40),
+                    "num_ctx": optimal_params.get("num_ctx", 32768),
+                    "num_predict": optimal_params.get("num_predict", 2000),
+                    "repeat_penalty": optimal_params.get("repeat_penalty", 1.1),
+                    "presence_penalty": optimal_params.get("presence_penalty", 1.0)
+                }
+            }
+
+            response = requests.post(
+                f"{self.synthesizer.ollama_url}/api/generate",
+                json=payload,
+                stream=True,
+                timeout=65
+            )
+
+            if response.status_code == 200:
+                # Collect streaming response
+                raw_response = ""
+                thinking_displayed = False
+
+                for line in response.iter_lines():
+                    if line:
+                        try:
+                            chunk_data = json.loads(line.decode('utf-8'))
+                            chunk_text = chunk_data.get('response', '')
+
+                            if chunk_text:
+                                raw_response += chunk_text
+
+                                # Display thinking stream as it comes in
+                                if not thinking_displayed and '<think>' in raw_response:
+                                    # Start displaying thinking
+                                    self._start_thinking_display()
+                                    thinking_displayed = True
+
+                                if thinking_displayed:
+                                    self._stream_thinking_chunk(chunk_text)
+
+                            if chunk_data.get('done', False):
+                                break
+
+                        except json.JSONDecodeError:
+                            continue
+
+                # Finish thinking display if it was shown
+                if thinking_displayed:
+                    self._end_thinking_display()
+
+                # Extract thinking stream and final response
+                thinking_stream, final_response = self._extract_thinking(raw_response)
+                return final_response, thinking_stream
+            else:
+                return None, None
+
+        except Exception as e:
+            logger.error(f"Thinking-enabled Ollama call failed: {e}")
+            return None, None
+
+    def _extract_thinking(self, raw_response: str) -> tuple:
+        """Extract thinking content from response."""
+        thinking_stream = ""
+        final_response = raw_response
+
+        # Look for thinking patterns
+        if "<think>" in raw_response and "</think>" in raw_response:
+            # Extract thinking content between tags
+            start_tag = raw_response.find("<think>")
+            end_tag = raw_response.find("</think>") + len("</think>")
+
+            if start_tag != -1 and end_tag != -1:
+                thinking_content = raw_response[start_tag + 7:end_tag - 8]  # Remove tags
+                thinking_stream = thinking_content.strip()
+
+                # Remove thinking from final response
+                final_response = (raw_response[:start_tag] + raw_response[end_tag:]).strip()
+
+        # Alternative patterns for models that use different thinking formats
+        elif "Let me think" in raw_response or "I need to analyze" in raw_response:
+            # Simple heuristic: first paragraph might be thinking
+            lines = raw_response.split('\n')
+            potential_thinking = []
+            final_lines = []
+
+            thinking_indicators = ["Let me think", "I need to", "First, I'll", "Looking at", "Analyzing"]
+            in_thinking = False
+
+            for line in lines:
+                if any(indicator in line for indicator in thinking_indicators):
+                    in_thinking = True
+                    potential_thinking.append(line)
+                elif in_thinking and (line.startswith('{') or line.startswith('**') or line.startswith('#')):
+                    # Likely end of thinking, start of structured response
+                    in_thinking = False
+                    final_lines.append(line)
+                elif in_thinking:
+                    potential_thinking.append(line)
+                else:
+                    final_lines.append(line)
+
+            if potential_thinking:
+                thinking_stream = '\n'.join(potential_thinking).strip()
+                final_response = '\n'.join(final_lines).strip()
+
+        return thinking_stream, final_response
+
+    def _start_thinking_display(self):
+        """Start the thinking stream display."""
+        print("\n\033[2m\033[3m💭 AI Thinking:\033[0m")
+        print("\033[2m\033[3m" + "" * 40 + "\033[0m")
+        self._thinking_buffer = ""
+        self._in_thinking_tags = False
+
+    def _stream_thinking_chunk(self, chunk: str):
+        """Stream a chunk of thinking as it arrives."""
+        import sys
+        self._thinking_buffer += chunk
+
+        # Check if we're in thinking tags
+        if '<think>' in self._thinking_buffer and not self._in_thinking_tags:
+            self._in_thinking_tags = True
+            # Display everything after <think>
+            start_idx = self._thinking_buffer.find('<think>') + 7
+            thinking_content = self._thinking_buffer[start_idx:]
+            if thinking_content:
+                print(f"\033[2m\033[3m{thinking_content}\033[0m", end='', flush=True)
+        elif self._in_thinking_tags and '</think>' not in chunk:
+            # We're in thinking mode, display the chunk
+            print(f"\033[2m\033[3m{chunk}\033[0m", end='', flush=True)
+        elif '</think>' in self._thinking_buffer:
+            # End of thinking
+            self._in_thinking_tags = False
+
+    def _end_thinking_display(self):
+        """End the thinking stream display."""
+        print(f"\n\033[2m\033[3m" + "" * 40 + "\033[0m")
+        print()
+
+    def _display_thinking_stream(self, thinking_stream: str):
+        """Display thinking stream in light gray and italic (fallback for non-streaming)."""
+        if not thinking_stream:
+            return
+
+        print("\n\033[2m\033[3m💭 AI Thinking:\033[0m")
+        print("\033[2m\033[3m" + "" * 40 + "\033[0m")
+
+        # Split into paragraphs and display with proper formatting
+        paragraphs = thinking_stream.split('\n\n')
+        for para in paragraphs:
+            if para.strip():
+                # Wrap long lines nicely
+                lines = para.strip().split('\n')
+                for line in lines:
+                    if line.strip():
+                        # Light gray and italic
+                        print(f"\033[2m\033[3m{line}\033[0m")
+                print()  # Paragraph spacing
+
+        print("\033[2m\033[3m" + "" * 40 + "\033[0m")
+        print()
+

 # Quick test function
 def test_explorer():
     """Test the code explorer."""

View File

@@ -218,6 +218,11 @@ class FastRAGServer:
         # Quick file count check
         try:
             import lancedb
+        except ImportError:
+            # If LanceDB not available, assume index is empty and needs creation
+            return True
+
+        try:
             db = lancedb.connect(rag_dir)
             if 'code_vectors' not in db.table_names():
                 return True

View File

@@ -12,12 +12,20 @@ from typing import List, Dict, Any, Optional, Set, Tuple
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime

 import numpy as np
-import lancedb
 import pandas as pd
-import pyarrow as pa
 from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeRemainingColumn
 from rich.console import Console

+# Optional LanceDB import
+try:
+    import lancedb
+    import pyarrow as pa
+    LANCEDB_AVAILABLE = True
+except ImportError:
+    lancedb = None
+    pa = None
+    LANCEDB_AVAILABLE = False
+
 from .ollama_embeddings import OllamaEmbedder as CodeEmbedder
 from .chunker import CodeChunker, CodeChunk
 from .path_handler import normalize_path, normalize_relative_path

@@ -163,7 +171,7 @@ class ProjectIndexer:
                 "skip_binary": True
             },
             "search": {
-                "default_limit": 10,
+                "default_top_k": 10,
                 "similarity_threshold": 0.7,
                 "hybrid_search": True,
                 "bm25_weight": 0.3

@@ -526,6 +534,11 @@ class ProjectIndexer:
     def _init_database(self):
         """Initialize LanceDB connection and table."""
+        if not LANCEDB_AVAILABLE:
+            logger.error("LanceDB is not available. Please install LanceDB for full indexing functionality.")
+            logger.info("For Ollama-only mode, consider using hash-based embeddings instead.")
+            raise ImportError("LanceDB dependency is required for indexing. Install with: pip install lancedb pyarrow")
+
         try:
             self.db = lancedb.connect(self.rag_dir)

View File

@@ -16,12 +16,12 @@ logger = logging.getLogger(__name__)
 @dataclass
 class SafeguardConfig:
-    """Configuration for LLM safeguards."""
-    max_output_tokens: int = 2000       # Prevent excessive generation
-    max_repetition_ratio: float = 0.3   # Max ratio of repeated content
-    max_response_time: int = 60         # Max seconds for response
-    min_useful_length: int = 20         # Minimum useful response length
-    context_window: int = 32768         # Ollama context window
+    """Configuration for LLM safeguards - gentle and educational."""
+    max_output_tokens: int = 4000       # Allow longer responses for learning
+    max_repetition_ratio: float = 0.7   # Be very permissive - only catch extreme repetition
+    max_response_time: int = 120        # Allow 2 minutes for complex thinking
+    min_useful_length: int = 10         # Lower threshold - short answers can be useful
+    context_window: int = 32000         # Match Qwen3 context length (32K token limit)
     enable_thinking_detection: bool = True  # Detect thinking patterns


 class ModelRunawayDetector:

@@ -98,8 +98,19 @@ class ModelRunawayDetector:
         if self.response_patterns['phrase_repetition'].search(response):
             return "phrase_repetition"

-        # Calculate repetition ratio
-        words = response.split()
+        # Calculate repetition ratio (excluding Qwen3 thinking blocks)
+        analysis_text = response
+        if "<think>" in response and "</think>" in response:
+            # Extract only the actual response (after thinking) for repetition analysis
+            thinking_end = response.find("</think>")
+            if thinking_end != -1:
+                analysis_text = response[thinking_end + 8:].strip()
+                # If the actual response (excluding thinking) is short, don't penalize
+                if len(analysis_text.split()) < 20:
+                    return None
+
+        words = analysis_text.split()
         if len(words) > 10:
             unique_words = set(words)
             repetition_ratio = 1 - (len(unique_words) / len(words))
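The check above boils down to a unique-word ratio computed on the text after any `<think>...</think>` block. A standalone sketch of that calculation, kept outside the class for clarity; the 0.7 threshold mirrors the new SafeguardConfig default, and the function name is illustrative:

```python
def looks_repetitive(response: str, max_ratio: float = 0.7) -> bool:
    """Return True when the visible response looks pathologically repetitive."""
    analysis_text = response
    if "<think>" in response and "</think>" in response:
        analysis_text = response.split("</think>", 1)[1].strip()
        if len(analysis_text.split()) < 20:   # short answers are not penalized
            return False
    words = analysis_text.split()
    if len(words) <= 10:
        return False
    ratio = 1 - (len(set(words)) / len(words))
    return ratio > max_ratio


print(looks_repetitive("ok " * 50))   # True: one word repeated 50 times
print(looks_repetitive("a short but perfectly normal and varied answer"))  # False
```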

View File

@@ -36,12 +36,13 @@ class SynthesisResult:
 class LLMSynthesizer:
     """Synthesizes RAG search results using Ollama LLMs."""

-    def __init__(self, ollama_url: str = "http://localhost:11434", model: str = None, enable_thinking: bool = False):
+    def __init__(self, ollama_url: str = "http://localhost:11434", model: str = None, enable_thinking: bool = False, config=None):
         self.ollama_url = ollama_url.rstrip('/')
         self.available_models = []
         self.model = model
         self.enable_thinking = enable_thinking  # Default False for synthesis mode
         self._initialized = False
+        self.config = config  # For accessing model rankings

         # Initialize safeguards
         if ModelRunawayDetector:

@@ -61,60 +62,36 @@ class LLMSynthesizer:
         return []

     def _select_best_model(self) -> str:
-        """Select the best available model based on modern performance rankings."""
+        """Select the best available model based on configuration rankings."""
         if not self.available_models:
             return "qwen2.5:1.5b"  # Fallback preference

-        # Modern model preference ranking (CPU-friendly first)
-        # Prioritize: Ultra-efficient > Standard efficient > Larger models
-        model_rankings = [
-            # Recommended model (excellent quality)
-            "qwen3:4b",
-            # Ultra-efficient models (perfect for CPU-only systems)
-            "qwen3:0.6b", "qwen3:1.7b", "llama3.2:1b",
-            # Standard efficient models
-            "qwen2.5:1.5b", "qwen3:3b",
-            # Qwen2.5 models (excellent performance/size ratio)
-            "qwen2.5-coder:1.5b", "qwen2.5:1.5b", "qwen2.5:3b", "qwen2.5-coder:3b",
-            "qwen2.5:7b", "qwen2.5-coder:7b",
-            # Qwen2 models (older but still good)
-            "qwen2:1.5b", "qwen2:3b", "qwen2:7b",
-            # Mistral models (good quality, reasonable size)
-            "mistral:7b", "mistral-nemo", "mistral-small",
-            # Llama3.2 models (decent but larger)
-            "llama3.2:1b", "llama3.2:3b", "llama3.2", "llama3.2:8b",
-            # Fallback to other Llama models
-            "llama3.1:8b", "llama3:8b", "llama3",
-            # Other decent models
-            "gemma2:2b", "gemma2:9b", "phi3:3.8b", "phi3.5",
-        ]
+        # Get model rankings from config or use defaults
+        if self.config and hasattr(self.config, 'llm') and hasattr(self.config.llm, 'model_rankings'):
+            model_rankings = self.config.llm.model_rankings
+        else:
+            # Fallback rankings if no config
+            model_rankings = [
+                "qwen3:1.7b", "qwen3:0.6b", "qwen3:4b", "llama3.2:1b",
+                "qwen2.5:1.5b", "qwen3:3b", "qwen2.5-coder:1.5b"
+            ]

-        # Find first available model from our ranked list
+        # Find first available model from our ranked list (exact matches first)
         for preferred_model in model_rankings:
             for available_model in self.available_models:
-                # Match model names (handle version tags)
-                available_base = available_model.split(':')[0].lower()
-                preferred_base = preferred_model.split(':')[0].lower()
-
-                if preferred_base in available_base or available_base in preferred_base:
-                    # Additional size filtering - prefer smaller models
-                    if any(size in available_model.lower() for size in ['1b', '1.5b', '2b', '3b']):
-                        logger.info(f"Selected efficient model: {available_model}")
-                        return available_model
-                    elif any(size in available_model.lower() for size in ['7b', '8b']):
-                        # Only use larger models if no smaller ones available
-                        logger.info(f"Selected larger model: {available_model}")
-                        return available_model
-                    elif ':' not in available_model:
-                        # Handle models without explicit size tags
-                        return available_model
+                # Exact match first (e.g., "qwen3:1.7b" matches "qwen3:1.7b")
+                if preferred_model.lower() == available_model.lower():
+                    logger.info(f"Selected exact match model: {available_model}")
+                    return available_model
+
+                # Partial match with version handling (e.g., "qwen3:1.7b" matches "qwen3:1.7b-q8_0")
+                preferred_parts = preferred_model.lower().split(':')
+                available_parts = available_model.lower().split(':')
+
+                if len(preferred_parts) >= 2 and len(available_parts) >= 2:
+                    if (preferred_parts[0] == available_parts[0] and
+                        preferred_parts[1] in available_parts[1]):
+                        logger.info(f"Selected version match model: {available_model}")
+                        return available_model

         # If no preferred models found, use first available
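The hunk above replaces fuzzy family matching with an exact-match-then-version-match pass over the ranked list. A compact standalone sketch of just that selection step, with illustrative names and inputs:

```python
def pick_model(rankings: list[str], available: list[str]) -> str | None:
    """Pick the first ranked model that is installed, preferring exact matches."""
    for preferred in rankings:
        for candidate in available:
            if preferred.lower() == candidate.lower():
                return candidate                     # exact match, e.g. qwen3:1.7b
            p = preferred.lower().split(":")
            c = candidate.lower().split(":")
            if len(p) >= 2 and len(c) >= 2 and p[0] == c[0] and p[1] in c[1]:
                return candidate                     # version match, e.g. qwen3:1.7b-q8_0
    return available[0] if available else None


print(pick_model(["qwen3:1.7b", "qwen3:4b"], ["qwen3:4b", "qwen3:1.7b-q8_0"]))
# -> "qwen3:1.7b-q8_0"
```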
@@ -132,12 +109,8 @@ class LLMSynthesizer:
         if not self.model:
             self.model = self._select_best_model()

-        # Warm up LLM with minimal request (ignores response)
-        if self.available_models:
-            try:
-                self._call_ollama("testing, just say 'hi'", temperature=0.1, disable_thinking=True)
-            except:
-                pass  # Warmup failure is non-critical
+        # Skip warmup - models are fast enough and warmup causes delays
+        # Warmup removed to eliminate startup delays and unwanted model calls

         self._initialized = True

@@ -146,7 +119,7 @@ class LLMSynthesizer:
         self._ensure_initialized()
         return len(self.available_models) > 0

-    def _call_ollama(self, prompt: str, temperature: float = 0.3, disable_thinking: bool = False) -> Optional[str]:
+    def _call_ollama(self, prompt: str, temperature: float = 0.3, disable_thinking: bool = False, use_streaming: bool = False) -> Optional[str]:
         """Make a call to Ollama API with safeguards."""

         start_time = time.time()
@@ -163,28 +136,55 @@ class LLMSynthesizer:
         # Handle thinking mode for Qwen3 models
         final_prompt = prompt
-        if not self.enable_thinking or disable_thinking:
-            # For non-thinking mode, add <no_think> tag for Qwen3
+        use_thinking = self.enable_thinking and not disable_thinking
+
+        if not use_thinking and "qwen3" in model_to_use.lower():
             if not final_prompt.endswith(" <no_think>"):
                 final_prompt += " <no_think>"

         # Get optimal parameters for this model
         optimal_params = get_optimal_ollama_parameters(model_to_use)

+        # Qwen3-specific optimal parameters based on research
+        if "qwen3" in model_to_use.lower():
+            if use_thinking:
+                # Thinking mode: Temperature=0.6, TopP=0.95, TopK=20, PresencePenalty=1.5
+                qwen3_temp = 0.6
+                qwen3_top_p = 0.95
+                qwen3_top_k = 20
+                qwen3_presence = 1.5
+            else:
+                # Non-thinking mode: Temperature=0.7, TopP=0.8, TopK=20, PresencePenalty=1.5
+                qwen3_temp = 0.7
+                qwen3_top_p = 0.8
+                qwen3_top_k = 20
+                qwen3_presence = 1.5
+        else:
+            qwen3_temp = temperature
+            qwen3_top_p = optimal_params.get("top_p", 0.9)
+            qwen3_top_k = optimal_params.get("top_k", 40)
+            qwen3_presence = optimal_params.get("presence_penalty", 1.0)
+
         payload = {
             "model": model_to_use,
             "prompt": final_prompt,
-            "stream": False,
+            "stream": use_streaming,
             "options": {
-                "temperature": temperature,
-                "top_p": optimal_params.get("top_p", 0.9),
-                "top_k": optimal_params.get("top_k", 40),
-                "num_ctx": optimal_params.get("num_ctx", 32768),
+                "temperature": qwen3_temp,
+                "top_p": qwen3_top_p,
+                "top_k": qwen3_top_k,
+                "num_ctx": 32000,  # Critical: Qwen3 context length (32K token limit)
                 "num_predict": optimal_params.get("num_predict", 2000),
                 "repeat_penalty": optimal_params.get("repeat_penalty", 1.1),
-                "presence_penalty": optimal_params.get("presence_penalty", 1.0)
+                "presence_penalty": qwen3_presence
             }
         }

+        # Handle streaming with early stopping
+        if use_streaming:
+            return self._handle_streaming_with_early_stop(payload, model_to_use, use_thinking, start_time)
+
         response = requests.post(
             f"{self.ollama_url}/api/generate",
             json=payload,

@@ -193,8 +193,19 @@ class LLMSynthesizer:
         if response.status_code == 200:
             result = response.json()
-            # All models use standard response format
+            # Qwen3 thinking tokens are embedded in the response content itself as <think>...</think>
             raw_response = result.get('response', '').strip()

+            # Log thinking content for Qwen3 debugging
+            if "qwen3" in model_to_use.lower() and use_thinking and "<think>" in raw_response:
+                thinking_start = raw_response.find("<think>")
+                thinking_end = raw_response.find("</think>")
+                if thinking_start != -1 and thinking_end != -1:
+                    thinking_content = raw_response[thinking_start+7:thinking_end]
+                    logger.info(f"Qwen3 thinking: {thinking_content[:100]}...")
+
             # Apply safeguards to check response quality
             if self.safeguard_detector and raw_response:
                 is_valid, issue_type, explanation = self.safeguard_detector.check_response_quality(

@@ -203,8 +214,8 @@ class LLMSynthesizer:
                 if not is_valid:
                     logger.warning(f"Safeguard triggered: {issue_type}")
-                    # Return a safe explanation instead of the problematic response
-                    return self._create_safeguard_response(issue_type, explanation, prompt)
+                    # Preserve original response but add safeguard warning
+                    return self._create_safeguard_response_with_content(issue_type, explanation, raw_response)

             return raw_response
         else:
@@ -233,6 +244,119 @@ class LLMSynthesizer:
 This is normal with smaller AI models and helps ensure you get quality responses."""

+    def _create_safeguard_response_with_content(self, issue_type: str, explanation: str, original_response: str) -> str:
+        """Create a response that preserves the original content but adds a safeguard warning."""
+        # For Qwen3, extract the actual response (after thinking)
+        actual_response = original_response
+        if "<think>" in original_response and "</think>" in original_response:
+            thinking_end = original_response.find("</think>")
+            if thinking_end != -1:
+                actual_response = original_response[thinking_end + 8:].strip()
+
+        # If we have useful content, preserve it with a warning
+        if len(actual_response.strip()) > 20:
+            return f"""⚠️ **Response Quality Warning** ({issue_type})
+
+{explanation}
+
+---
+
+**AI Response (use with caution):**
+
+{actual_response}
+
+---
+
+💡 **Note**: This response may have quality issues. Consider rephrasing your question or trying exploration mode for better results."""
+        else:
+            # If content is too short or problematic, use the original safeguard response
+            return f"""⚠️ Model Response Issue Detected
+
+{explanation}
+
+**What happened:** The AI model encountered a common issue with small language models.
+
+**Your options:**
+1. **Try again**: Ask the same question (often resolves itself)
+2. **Rephrase**: Make your question more specific or break it into parts
+3. **Use exploration mode**: `rag-mini explore` for complex questions
+
+This is normal with smaller AI models and helps ensure you get quality responses."""
+
+    def _handle_streaming_with_early_stop(self, payload: dict, model_name: str, use_thinking: bool, start_time: float) -> Optional[str]:
+        """Handle streaming response with intelligent early stopping."""
+        import json
+
+        try:
+            response = requests.post(
+                f"{self.ollama_url}/api/generate",
+                json=payload,
+                stream=True,
+                timeout=65
+            )
+
+            if response.status_code != 200:
+                logger.error(f"Ollama API error: {response.status_code}")
+                return None
+
+            full_response = ""
+            word_buffer = []
+            repetition_window = 30    # Check last 30 words for repetition (more context)
+            stop_threshold = 0.8      # Stop only if 80% of recent words are repetitive (very permissive)
+            min_response_length = 100 # Don't early stop until we have at least 100 chars
+
+            for line in response.iter_lines():
+                if line:
+                    try:
+                        chunk_data = json.loads(line.decode('utf-8'))
+                        chunk_text = chunk_data.get('response', '')
+
+                        if chunk_text:
+                            full_response += chunk_text
+
+                            # Add words to buffer for repetition detection
+                            new_words = chunk_text.split()
+                            word_buffer.extend(new_words)
+
+                            # Keep only recent words in buffer
+                            if len(word_buffer) > repetition_window:
+                                word_buffer = word_buffer[-repetition_window:]
+
+                            # Check for repetition patterns after we have enough words AND content
+                            if len(word_buffer) >= repetition_window and len(full_response) >= min_response_length:
+                                unique_words = set(word_buffer)
+                                repetition_ratio = 1 - (len(unique_words) / len(word_buffer))
+
+                                # Early stop only if repetition is EXTREMELY high (80%+)
+                                if repetition_ratio > stop_threshold:
+                                    logger.info(f"Early stopping due to repetition: {repetition_ratio:.2f}")
+
+                                    # Add a gentle completion to the response
+                                    if not full_response.strip().endswith(('.', '!', '?')):
+                                        full_response += "..."
+
+                                    # Send stop signal to model (attempt to gracefully stop)
+                                    try:
+                                        stop_payload = {"model": model_name, "stop": True}
+                                        requests.post(f"{self.ollama_url}/api/generate", json=stop_payload, timeout=2)
+                                    except:
+                                        pass  # If stop fails, we already have partial response
+
+                                    break
+
+                        if chunk_data.get('done', False):
+                            break
+
+                    except json.JSONDecodeError:
+                        continue
+
+            return full_response.strip()
+
+        except Exception as e:
+            logger.error(f"Streaming with early stop failed: {e}")
+            return None
+
     def synthesize_search_results(self, query: str, results: List[Any], project_path: Path) -> SynthesisResult:
         """Synthesize search results into a coherent summary."""

View File

@ -469,6 +469,31 @@ class OllamaEmbedder:
"ollama_url": self.base_url if self.mode == "ollama" else None "ollama_url": self.base_url if self.mode == "ollama" else None
} }
def get_embedding_info(self) -> Dict[str, str]:
"""Get human-readable embedding system information for installer."""
status = self.get_status()
if status["mode"] == "ollama":
return {
"method": f"Ollama ({status['ollama_model']})",
"status": "working"
}
elif status["mode"] == "ml":
return {
"method": f"ML Fallback ({status['fallback_model']})",
"status": "working"
}
elif status["mode"] == "hash":
return {
"method": "Hash-based (basic similarity)",
"status": "working"
}
else:
return {
"method": "Unknown",
"status": "error"
}
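A hypothetical installer-side call to the new helper, assuming the default OllamaEmbedder constructor (a sketch, not the installer's actual code):
# Report which embedding backend ended up active after install
from mini_rag.ollama_embeddings import OllamaEmbedder

embedder = OllamaEmbedder()
info = embedder.get_embedding_info()
if info["status"] == "working":
    print(f"Embeddings ready: {info['method']}")
else:
    print("Embedding setup needs attention - see docs/TROUBLESHOOTING.md")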
def warmup(self):
"""Warm up the embedding system with a dummy request."""
dummy_code = "def hello(): pass"
View File
@@ -59,23 +59,8 @@ class QueryExpander:
if self._initialized:
return
- # Warm up LLM if enabled and available
- if self.enabled:
- try:
- model = self._select_expansion_model()
- if model:
- requests.post(
- f"{self.ollama_url}/api/generate",
- json={
- "model": model,
- "prompt": "testing, just say 'hi' <no_think>",
- "stream": False,
- "options": {"temperature": 0.1, "max_tokens": 5}
- },
- timeout=5
- )
- except:
- pass # Warmup failure is non-critical
+ # Skip warmup - causes startup delays and unwanted model calls
+ # Query expansion works fine on first use without warmup
self._initialized = True
@@ -183,10 +168,10 @@ Expanded query:"""
data = response.json()
available = [model['name'] for model in data.get('models', [])]
- # Prefer ultra-fast, efficient models for query expansion (CPU-friendly)
- expansion_preferences = [
- "qwen3:0.6b", "qwen3:1.7b", "qwen2.5:1.5b",
- "llama3.2:1b", "gemma2:2b", "llama3.2:3b"
- ]
+ # Use same model rankings as main synthesizer for consistency
+ expansion_preferences = [
+ "qwen3:1.7b", "qwen3:0.6b", "qwen3:4b", "llama3.2:1b",
+ "qwen2.5:1.5b", "qwen3:3b", "qwen2.5-coder:1.5b"
+ ]
for preferred in expansion_preferences:
View File
@@ -8,13 +8,20 @@ from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import numpy as np
import pandas as pd
- import lancedb
from rich.console import Console
from rich.table import Table
from rich.syntax import Syntax
from rank_bm25 import BM25Okapi
from collections import defaultdict
+ # Optional LanceDB import
+ try:
+ import lancedb
+ LANCEDB_AVAILABLE = True
+ except ImportError:
+ lancedb = None
+ LANCEDB_AVAILABLE = False
from .ollama_embeddings import OllamaEmbedder as CodeEmbedder
from .path_handler import display_path
from .query_expander import QueryExpander
@@ -115,6 +122,14 @@ class CodeSearcher:
def _connect(self):
"""Connect to the LanceDB database."""
+ if not LANCEDB_AVAILABLE:
+ print("❌ LanceDB Not Available")
+ print(" LanceDB is required for search functionality")
+ print(" Install it with: pip install lancedb pyarrow")
+ print(" For basic Ollama functionality, use hash-based search instead")
+ print()
+ raise ImportError("LanceDB dependency is required for search. Install with: pip install lancedb pyarrow")
try:
if not self.rag_dir.exists():
print("🗃️ No Search Index Found")
142
mini_rag/venv_checker.py Normal file
View File
@@ -0,0 +1,142 @@
#!/usr/bin/env python3
"""
Virtual Environment Checker
Ensures scripts run in proper Python virtual environment for consistency and safety.
"""
import sys
import os
import sysconfig
from pathlib import Path
def is_in_virtualenv() -> bool:
"""Check if we're running in a virtual environment."""
# Check for virtual environment indicators
return (
hasattr(sys, 'real_prefix') or # virtualenv
(hasattr(sys, 'base_prefix') and sys.base_prefix != sys.prefix) or # venv/pyvenv
os.environ.get('VIRTUAL_ENV') is not None # Environment variable
)
def get_expected_venv_path() -> Path:
"""Get the expected virtual environment path for this project."""
# Assume .venv in the same directory as the script
script_dir = Path(__file__).parent.parent
return script_dir / '.venv'
def check_correct_venv() -> tuple[bool, str]:
"""
Check if we're in the correct virtual environment.
Returns:
(is_correct, message)
"""
if not is_in_virtualenv():
return False, "not in virtual environment"
expected_venv = get_expected_venv_path()
if not expected_venv.exists():
return False, "expected virtual environment not found"
current_venv = os.environ.get('VIRTUAL_ENV')
if current_venv:
current_venv_path = Path(current_venv).resolve()
expected_venv_path = expected_venv.resolve()
if current_venv_path != expected_venv_path:
return False, f"wrong virtual environment (using {current_venv_path}, expected {expected_venv_path})"
return True, "correct virtual environment"
def show_venv_warning(script_name: str = "script") -> None:
"""Show virtual environment warning with helpful instructions."""
expected_venv = get_expected_venv_path()
print("⚠️ VIRTUAL ENVIRONMENT WARNING")
print("=" * 50)
print()
print(f"This {script_name} should be run in a Python virtual environment for:")
print(" • Consistent dependencies")
print(" • Isolated package versions")
print(" • Proper security isolation")
print(" • Reliable functionality")
print()
if expected_venv.exists():
print("✅ Virtual environment found!")
print(f" Location: {expected_venv}")
print()
print("🚀 To activate it:")
print(f" source {expected_venv}/bin/activate")
print(f" {script_name}")
print()
print("🔄 Or run with activation:")
print(f" source {expected_venv}/bin/activate && {script_name}")
else:
print("❌ No virtual environment found!")
print()
print("🛠️ Create one first:")
print(" ./install_mini_rag.sh")
print()
print("📚 Or manually:")
print(f" python3 -m venv {expected_venv}")
print(f" source {expected_venv}/bin/activate")
print(" pip install -r requirements.txt")
print()
print("💡 Why this matters:")
print(" Without a virtual environment, you may experience:")
print(" • Import errors from missing packages")
print(" • Version conflicts with system Python")
print(" • Inconsistent behavior across systems")
print(" • Potential system-wide package pollution")
print()
def check_and_warn_venv(script_name: str = "script", force_exit: bool = False) -> bool:
"""
Check virtual environment and warn if needed.
Args:
script_name: Name of the script for user-friendly messages
force_exit: Whether to exit if not in correct venv
Returns:
True if in correct venv, False otherwise
"""
is_correct, message = check_correct_venv()
if not is_correct:
show_venv_warning(script_name)
if force_exit:
print(f"⛔ Exiting {script_name} for your safety.")
print(" Please activate the virtual environment and try again.")
sys.exit(1)
else:
print(f"⚠️ Continuing anyway, but {script_name} may not work correctly...")
print()
return False
return True
def require_venv(script_name: str = "script") -> None:
"""Require virtual environment or exit."""
check_and_warn_venv(script_name, force_exit=True)
# Quick test function
def main():
"""Test the virtual environment checker."""
print("🧪 Virtual Environment Checker Test")
print("=" * 40)
print(f"In virtual environment: {is_in_virtualenv()}")
print(f"Expected venv path: {get_expected_venv_path()}")
is_correct, message = check_correct_venv()
print(f"Correct venv: {is_correct} ({message})")
if not is_correct:
show_venv_warning("test script")
if __name__ == "__main__":
main()
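A minimal usage sketch for the checker above; the calling script name is illustrative:
# At the top of any project entry point:
from mini_rag.venv_checker import check_and_warn_venv, require_venv

# Warn but continue (what rag-mini.py does at startup):
check_and_warn_venv("my_script.py", force_exit=False)

# Or, for tools that must not run outside the project's .venv:
# require_venv("my_script.py")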

View File

@@ -112,6 +112,7 @@ show_help() {
echo -e "${BOLD}Main Commands:${NC}"
echo " rag-mini index <project_path> # Index project for search"
echo " rag-mini search <project_path> <query> # Search indexed project"
+ echo " rag-mini explore <project_path> # Interactive exploration with AI"
echo " rag-mini status <project_path> # Show project status"
echo ""
echo -e "${BOLD}Interfaces:${NC}"
@@ -324,11 +325,11 @@ main() {
"server")
# Start server mode
shift
- exec "$PYTHON" "$SCRIPT_DIR/claude_rag/server.py" "$@"
+ exec "$PYTHON" "$SCRIPT_DIR/mini_rag/fast_server.py" "$@"
;;
- "index"|"search"|"status")
- # Direct CLI commands
- exec "$SCRIPT_DIR/rag-mini" "$@"
+ "index"|"search"|"explore"|"status")
+ # Direct CLI commands - call Python script
+ exec "$PYTHON" "$SCRIPT_DIR/rag-mini.py" "$@"
;;
*)
# Unknown command - show help

View File

@@ -118,7 +118,7 @@ def index_project(project_path: Path, force: bool = False):
print(" Or see: docs/TROUBLESHOOTING.md")
sys.exit(1)
- def search_project(project_path: Path, query: str, limit: int = 10, synthesize: bool = False):
+ def search_project(project_path: Path, query: str, top_k: int = 10, synthesize: bool = False):
"""Search a project directory."""
try:
# Check if indexed first
@@ -130,7 +130,7 @@ def search_project(project_path: Path, query: str, limit: int = 10, synthesize:
print(f"🔍 Searching \"{query}\" in {project_path.name}")
searcher = CodeSearcher(project_path)
- results = searcher.search(query, top_k=limit)
+ results = searcher.search(query, top_k=top_k)
if not results:
print("❌ No results found")
@@ -143,7 +143,7 @@ def search_project(project_path: Path, query: str, limit: int = 10, synthesize:
print()
print("⚙️ Configuration adjustments:")
print(f" • Lower threshold: ./rag-mini search {project_path} \"{query}\" --threshold 0.05")
- print(" • More results: add --limit 20")
+ print(" • More results: add --top-k 20")
print()
print("📚 Need help? See: docs/TROUBLESHOOTING.md")
return
@@ -310,14 +310,14 @@ def status_check(project_path: Path):
sys.exit(1)
def explore_interactive(project_path: Path):
- """Interactive exploration mode with thinking and context memory."""
+ """Interactive exploration mode with thinking and context memory for any documents."""
try:
explorer = CodeExplorer(project_path)
if not explorer.start_exploration_session():
sys.exit(1)
- print("\n🤔 Ask your first question about the codebase:")
+ print(f"\n🤔 Ask your first question about {project_path.name}:")
while True:
try:
@@ -357,7 +357,8 @@ def explore_interactive(project_path: Path):
continue
# Process the question
- print("\n🔍 Analyzing...")
+ print(f"\n🔍 Searching {project_path.name}...")
+ print("🧠 Thinking with AI model...")
response = explorer.explore_question(question)
if response:
@@ -382,6 +383,13 @@ def explore_interactive(project_path: Path):
def main():
"""Main CLI interface."""
+ # Check virtual environment
+ try:
+ from mini_rag.venv_checker import check_and_warn_venv
+ check_and_warn_venv("rag-mini.py", force_exit=False)
+ except ImportError:
+ pass # If venv checker can't be imported, continue anyway
parser = argparse.ArgumentParser(
description="FSS-Mini-RAG - Lightweight semantic code search",
formatter_class=argparse.RawDescriptionHelpFormatter,
@@ -403,8 +411,8 @@ Examples:
help='Search query (for search command)')
parser.add_argument('--force', action='store_true',
help='Force reindex all files')
- parser.add_argument('--limit', type=int, default=10,
- help='Maximum number of search results')
+ parser.add_argument('--top-k', '--limit', type=int, default=10, dest='top_k',
+ help='Maximum number of search results (top-k)')
parser.add_argument('--verbose', '-v', action='store_true',
help='Enable verbose logging')
parser.add_argument('--synthesize', '-s', action='store_true',
@@ -432,7 +440,7 @@ Examples:
if not args.query:
print("❌ Search query required")
sys.exit(1)
- search_project(args.project_path, args.query, args.limit, args.synthesize)
+ search_project(args.project_path, args.query, args.top_k, args.synthesize)
elif args.command == 'explore':
explore_interactive(args.project_path)
elif args.command == 'status':
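The old --limit spelling keeps working because both option strings share one destination; a quick standalone check of that behaviour (plain argparse, nothing project-specific):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--top-k', '--limit', type=int, default=10, dest='top_k',
                    help='Maximum number of search results (top-k)')

print(parser.parse_args(['--limit', '20']).top_k)  # 20 (legacy flag still accepted)
print(parser.parse_args(['--top-k', '5']).top_k)   # 5
print(parser.parse_args([]).top_k)                 # 10 (default)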

File diff suppressed because it is too large

230
test_fixes.py Normal file
View File

@@ -0,0 +1,230 @@
#!/usr/bin/env python3
"""
Quick test script to verify our key fixes without heavy dependencies.
IMPORTANT: This test requires the virtual environment to be activated:
source .venv/bin/activate
python test_fixes.py
Or run directly with venv:
source .venv/bin/activate && python test_fixes.py
"""
import sys
import os
import tempfile
from pathlib import Path
# Check if virtual environment is activated
def check_venv():
if 'VIRTUAL_ENV' not in os.environ:
print("⚠️ WARNING: Virtual environment not detected!")
print(" This test requires the virtual environment to be activated.")
print(" Run: source .venv/bin/activate && python test_fixes.py")
print(" Continuing anyway...\n")
check_venv()
# Add current directory to Python path
sys.path.insert(0, '.')
def test_config_model_rankings():
"""Test that model rankings are properly configured."""
print("=" * 60)
print("TESTING CONFIG AND MODEL RANKINGS")
print("=" * 60)
try:
# Test config loading without heavy dependencies
from mini_rag.config import ConfigManager, LLMConfig
# Create a temporary directory for testing
with tempfile.TemporaryDirectory() as tmpdir:
config_manager = ConfigManager(tmpdir)
config = config_manager.load_config()
print("✓ Config loads successfully")
# Check LLM config and model rankings
if hasattr(config, 'llm'):
llm_config = config.llm
print(f"✓ LLM config found: {type(llm_config)}")
if hasattr(llm_config, 'model_rankings'):
rankings = llm_config.model_rankings
print(f"✓ Model rankings: {rankings}")
if rankings and rankings[0] == "qwen3:1.7b":
print("✓ qwen3:1.7b is FIRST priority - CORRECT!")
return True
else:
print(f"✗ WRONG: First model is {rankings[0] if rankings else 'None'}, should be qwen3:1.7b")
return False
else:
print("✗ Model rankings not found in LLM config")
return False
else:
print("✗ LLM config not found")
return False
except ImportError as e:
print(f"✗ Import error: {e}")
return False
except Exception as e:
print(f"✗ Error: {e}")
return False
def test_context_length_fix():
"""Test that context length is correctly set to 32K."""
print("\n" + "=" * 60)
print("TESTING CONTEXT LENGTH FIXES")
print("=" * 60)
try:
# Read the synthesizer file and check for 32000
with open('mini_rag/llm_synthesizer.py', 'r') as f:
synthesizer_content = f.read()
if '"num_ctx": 32000' in synthesizer_content:
print("✓ LLM Synthesizer: num_ctx is correctly set to 32000")
elif '"num_ctx": 80000' in synthesizer_content:
print("✗ LLM Synthesizer: num_ctx is still 80000 - NEEDS FIX")
return False
else:
print("? LLM Synthesizer: num_ctx setting not found clearly")
# Read the safeguards file and check for 32000
with open('mini_rag/llm_safeguards.py', 'r') as f:
safeguards_content = f.read()
if 'context_window: int = 32000' in safeguards_content:
print("✓ Safeguards: context_window is correctly set to 32000")
return True
elif 'context_window: int = 80000' in safeguards_content:
print("✗ Safeguards: context_window is still 80000 - NEEDS FIX")
return False
else:
print("? Safeguards: context_window setting not found clearly")
return False
except Exception as e:
print(f"✗ Error checking context length: {e}")
return False
def test_safeguard_preservation():
"""Test that safeguards preserve content instead of dropping it."""
print("\n" + "=" * 60)
print("TESTING SAFEGUARD CONTENT PRESERVATION")
print("=" * 60)
try:
# Read the synthesizer file and check for the preservation method
with open('mini_rag/llm_synthesizer.py', 'r') as f:
synthesizer_content = f.read()
if '_create_safeguard_response_with_content' in synthesizer_content:
print("✓ Safeguard content preservation method exists")
else:
print("✗ Safeguard content preservation method missing")
return False
# Check for the specific preservation logic
if 'AI Response (use with caution):' in synthesizer_content:
print("✓ Content preservation warning format found")
else:
print("✗ Content preservation warning format missing")
return False
# Check that it's being called instead of dropping content
if 'return self._create_safeguard_response_with_content(issue_type, explanation, raw_response)' in synthesizer_content:
print("✓ Preservation method is called when safeguards trigger")
return True
else:
print("✗ Preservation method not called properly")
return False
except Exception as e:
print(f"✗ Error checking safeguard preservation: {e}")
return False
def test_import_fixes():
"""Test that import statements are fixed from claude_rag to mini_rag."""
print("\n" + "=" * 60)
print("TESTING IMPORT STATEMENT FIXES")
print("=" * 60)
test_files = [
'tests/test_rag_integration.py',
'tests/01_basic_integration_test.py',
'tests/test_hybrid_search.py',
'tests/test_context_retrieval.py'
]
all_good = True
for test_file in test_files:
if Path(test_file).exists():
try:
with open(test_file, 'r') as f:
content = f.read()
if 'claude_rag' in content:
print(f"{test_file}: Still contains 'claude_rag' imports")
all_good = False
elif 'mini_rag' in content:
print(f"{test_file}: Uses correct 'mini_rag' imports")
else:
print(f"? {test_file}: No rag imports found")
except Exception as e:
print(f"✗ Error reading {test_file}: {e}")
all_good = False
else:
print(f"? {test_file}: File not found")
return all_good
def main():
"""Run all tests."""
print("FSS-Mini-RAG Fix Verification Tests")
print("Testing all the critical fixes...")
tests = [
("Model Rankings", test_config_model_rankings),
("Context Length", test_context_length_fix),
("Safeguard Preservation", test_safeguard_preservation),
("Import Fixes", test_import_fixes)
]
results = {}
for test_name, test_func in tests:
try:
results[test_name] = test_func()
except Exception as e:
print(f"{test_name} test crashed: {e}")
results[test_name] = False
# Summary
print("\n" + "=" * 60)
print("TEST SUMMARY")
print("=" * 60)
passed = sum(1 for result in results.values() if result)
total = len(results)
for test_name, result in results.items():
status = "✓ PASS" if result else "✗ FAIL"
print(f"{status} {test_name}")
print(f"\nOverall: {passed}/{total} tests passed")
if passed == total:
print("🎉 ALL TESTS PASSED - System should be working properly!")
return 0
else:
print("❌ SOME TESTS FAILED - System needs more fixes!")
return 1
if __name__ == "__main__":
sys.exit(main())

View File

@@ -1,5 +1,12 @@
"""
Comprehensive demo of the RAG system showing all integrated features.
+ IMPORTANT: This test requires the virtual environment to be activated:
+ source .venv/bin/activate
+ PYTHONPATH=. python tests/01_basic_integration_test.py
+ Or run directly with venv:
+ source .venv/bin/activate && PYTHONPATH=. python tests/01_basic_integration_test.py
"""
import os
@@ -7,6 +14,16 @@ import sys
import tempfile
from pathlib import Path
+ # Check if virtual environment is activated
+ def check_venv():
+ if 'VIRTUAL_ENV' not in os.environ:
+ print("⚠️ WARNING: Virtual environment not detected!")
+ print(" This test requires the virtual environment to be activated.")
+ print(" Run: source .venv/bin/activate && PYTHONPATH=. python tests/01_basic_integration_test.py")
+ print(" Continuing anyway...\n")
+ check_venv()
# Fix Windows encoding
if sys.platform == 'win32':
os.environ['PYTHONUTF8'] = '1'
@@ -15,7 +32,7 @@ if sys.platform == 'win32':
from mini_rag.chunker import CodeChunker
from mini_rag.indexer import ProjectIndexer
from mini_rag.search import CodeSearcher
- from mini_rag.embeddings import CodeEmbedder
+ from mini_rag.ollama_embeddings import OllamaEmbedder as CodeEmbedder
def main():
print("=" * 60)
@@ -189,17 +206,17 @@ if __name__ == "__main__":
# Test different search types
print("\n a) Semantic search for 'calculate average':")
- results = searcher.search("calculate average", limit=3)
+ results = searcher.search("calculate average", top_k=3)
for i, result in enumerate(results, 1):
print(f" {i}. {result.chunk_type} '{result.name}' in {result.file_path} (score: {result.score:.3f})")
print("\n b) BM25-weighted search for 'divide zero':")
- results = searcher.search("divide zero", limit=3, semantic_weight=0.2, bm25_weight=0.8)
+ results = searcher.search("divide zero", top_k=3, semantic_weight=0.2, bm25_weight=0.8)
for i, result in enumerate(results, 1):
print(f" {i}. {result.chunk_type} '{result.name}' in {result.file_path} (score: {result.score:.3f})")
print("\n c) Search with context for 'test addition':")
- results = searcher.search("test addition", limit=2, include_context=True)
+ results = searcher.search("test addition", top_k=2, include_context=True)
for i, result in enumerate(results, 1):
print(f" {i}. {result.chunk_type} '{result.name}'")
if result.parent_chunk:

View File

@@ -37,25 +37,25 @@ def demo_search(project_path: Path):
'title': 'Keyword-Heavy Search',
'query': 'BM25Okapi rank_bm25 search scoring',
'description': 'This query has specific technical keywords that BM25 excels at finding',
- 'limit': 5
+ 'top_k': 5
},
{
'title': 'Natural Language Query',
'query': 'how to build search index from database chunks',
'description': 'This semantic query benefits from transformer embeddings understanding intent',
- 'limit': 5
+ 'top_k': 5
},
{
'title': 'Mixed Technical Query',
'query': 'vector embeddings for semantic code search with transformers',
'description': 'This hybrid query combines technical terms with conceptual understanding',
- 'limit': 5
+ 'top_k': 5
},
{
'title': 'Function Search',
'query': 'search method implementation with filters',
'description': 'Looking for specific function implementations',
- 'limit': 5
+ 'top_k': 5
}
]
@@ -67,7 +67,7 @@ def demo_search(project_path: Path):
# Run search with hybrid mode
results = searcher.search(
query=demo['query'],
- limit=demo['limit'],
+ top_k=demo['top_k'],
semantic_weight=0.7,
bm25_weight=0.3
)

View File

@@ -244,7 +244,7 @@ def compute_median(numbers):
searcher = CodeSearcher(project_path)
# Test BM25 integration
- results = searcher.search("multiply numbers", limit=5,
+ results = searcher.search("multiply numbers", top_k=5,
semantic_weight=0.3, bm25_weight=0.7)
if results:
@@ -283,7 +283,7 @@ def compute_median(numbers):
print(f" - No parent chunk")
# Test include_context in search
- results_with_context = searcher.search("add", include_context=True, limit=2)
+ results_with_context = searcher.search("add", include_context=True, top_k=2)
if results_with_context:
print(f" Found {len(results_with_context)} results with context")
for r in results_with_context:

View File

@@ -1,11 +1,29 @@
#!/usr/bin/env python3
"""
Test script for adjacent chunk retrieval functionality.
+ IMPORTANT: This test requires the virtual environment to be activated:
+ source .venv/bin/activate
+ PYTHONPATH=. python tests/test_context_retrieval.py
+ Or run directly with venv:
+ source .venv/bin/activate && PYTHONPATH=. python tests/test_context_retrieval.py
"""
+ import os
from pathlib import Path
from mini_rag.search import CodeSearcher
- from mini_rag.embeddings import CodeEmbedder
+ from mini_rag.ollama_embeddings import OllamaEmbedder as CodeEmbedder
+ # Check if virtual environment is activated
+ def check_venv():
+ if 'VIRTUAL_ENV' not in os.environ:
+ print("⚠️ WARNING: Virtual environment not detected!")
+ print(" This test requires the virtual environment to be activated.")
+ print(" Run: source .venv/bin/activate && PYTHONPATH=. python tests/test_context_retrieval.py")
+ print(" Continuing anyway...\n")
+ check_venv()
def test_context_retrieval():
"""Test the new context retrieval functionality."""
@@ -20,7 +38,7 @@ def test_context_retrieval():
# Test 1: Search without context
print("\n1. Search WITHOUT context:")
- results = searcher.search("chunk metadata", limit=3, include_context=False)
+ results = searcher.search("chunk metadata", top_k=3, include_context=False)
for i, result in enumerate(results, 1):
print(f" Result {i}: {result.file_path}:{result.start_line}-{result.end_line}")
print(f" Type: {result.chunk_type}, Name: {result.name}")
@@ -30,7 +48,7 @@ def test_context_retrieval():
# Test 2: Search with context
print("\n2. Search WITH context:")
- results = searcher.search("chunk metadata", limit=3, include_context=True)
+ results = searcher.search("chunk metadata", top_k=3, include_context=True)
for i, result in enumerate(results, 1):
print(f" Result {i}: {result.file_path}:{result.start_line}-{result.end_line}")
print(f" Type: {result.chunk_type}, Name: {result.name}")

View File

@@ -2,6 +2,13 @@
"""
Test and benchmark the hybrid BM25 + semantic search system.
Shows performance metrics and search quality comparisons.
+ IMPORTANT: This test requires the virtual environment to be activated:
+ source .venv/bin/activate
+ PYTHONPATH=. python tests/test_hybrid_search.py
+ Or run directly with venv:
+ source .venv/bin/activate && PYTHONPATH=. python tests/test_hybrid_search.py
"""
import time
@@ -16,7 +23,7 @@ from rich.syntax import Syntax
from rich.progress import track
from mini_rag.search import CodeSearcher, SearchResult
- from mini_rag.embeddings import CodeEmbedder
+ from mini_rag.ollama_embeddings import OllamaEmbedder as CodeEmbedder
console = Console()
@@ -40,7 +47,7 @@ class SearchTester:
if 'error' not in stats:
console.print(f"[dim]Index contains {stats['total_chunks']} chunks from {stats['unique_files']} files[/dim]\n")
- def run_query(self, query: str, limit: int = 10,
+ def run_query(self, query: str, top_k: int = 10,
semantic_only: bool = False,
bm25_only: bool = False) -> Dict[str, Any]:
"""Run a single query and return metrics."""
@@ -60,7 +67,7 @@ class SearchTester:
start = time.time()
results = self.searcher.search(
query=query,
- limit=limit,
+ top_k=top_k,
semantic_weight=semantic_weight,
bm25_weight=bm25_weight
)
@@ -76,10 +83,10 @@ class SearchTester:
'avg_score': sum(r.score for r in results) / len(results) if results else 0,
}
- def compare_search_modes(self, query: str, limit: int = 5):
+ def compare_search_modes(self, query: str, top_k: int = 5):
"""Compare results across different search modes."""
console.print(f"\n[bold cyan]Query:[/bold cyan] '{query}'")
- console.print(f"[dim]Top {limit} results per mode[/dim]\n")
+ console.print(f"[dim]Top {top_k} results per mode[/dim]\n")
# Run searches in all modes
modes = [
@@ -90,7 +97,7 @@ class SearchTester:
all_results = {}
for mode_name, semantic_only, bm25_only in modes:
- result = self.run_query(query, limit, semantic_only, bm25_only)
+ result = self.run_query(query, top_k, semantic_only, bm25_only)
all_results[mode_name] = result
# Create comparison table
@@ -191,7 +198,7 @@ class SearchTester:
for test_case in test_queries:
console.rule(f"\n[cyan]{test_case['description']}[/cyan]")
console.print(f"[dim]{test_case['expected']}[/dim]")
- self.compare_search_modes(test_case['query'], limit=3)
+ self.compare_search_modes(test_case['query'], top_k=3)
time.sleep(0.5) # Brief pause between tests
def benchmark_performance(self, num_queries: int = 50):
@@ -268,7 +275,7 @@ class SearchTester:
# Query that might return many results from same files
query = "function implementation code search"
- results = self.searcher.search(query, limit=20)
+ results = self.searcher.search(query, top_k=20)
# Analyze diversity
file_counts = {}

View File

@@ -403,9 +403,9 @@ class TestOllamaIntegration(unittest.TestCase):
# Check search config
self.assertIsNotNone(self.config.search)
- self.assertGreater(self.config.search.default_limit, 0)
+ self.assertGreater(self.config.search.default_top_k, 0)
print(f" ✅ Search config valid")
- print(f" Default limit: {self.config.search.default_limit}")
+ print(f" Default top-k: {self.config.search.default_top_k}")
print(f" Query expansion: {self.config.search.expand_queries}")

View File

@@ -1,12 +1,32 @@
#!/usr/bin/env python3
- """Test RAG system integration with smart chunking."""
+ """
+ Test RAG system integration with smart chunking.
+ IMPORTANT: This test requires the virtual environment to be activated:
+ source .venv/bin/activate
+ PYTHONPATH=. python tests/test_rag_integration.py
+ Or run directly with venv:
+ source .venv/bin/activate && PYTHONPATH=. python tests/test_rag_integration.py
+ """
import tempfile
import shutil
+ import os
from pathlib import Path
from mini_rag.indexer import ProjectIndexer
from mini_rag.search import CodeSearcher
+ # Check if virtual environment is activated
+ def check_venv():
+ if 'VIRTUAL_ENV' not in os.environ:
+ print("⚠️ WARNING: Virtual environment not detected!")
+ print(" This test requires the virtual environment to be activated.")
+ print(" Run: source .venv/bin/activate && PYTHONPATH=. python tests/test_rag_integration.py")
+ print(" Continuing anyway...\n")
+ check_venv()
# Sample Python file with proper structure
sample_code = '''"""
Sample module for testing RAG system.
@@ -179,8 +199,8 @@ def test_integration():
stats = indexer.index_project()
print(f" - Files indexed: {stats['files_indexed']}")
- print(f" - Total chunks: {stats['total_chunks']}")
- print(f" - Indexing time: {stats['indexing_time']:.2f}s")
+ print(f" - Total chunks: {stats['chunks_created']}")
+ print(f" - Indexing time: {stats['time_taken']:.2f}s")
# Verify chunks were created properly
print("\n2. Verifying chunk metadata...")
@@ -195,10 +215,10 @@ def test_integration():
results = searcher.search("data processor class unified interface", top_k=3)
print(f"\n Test 1 - Class search:")
for i, result in enumerate(results[:1]):
- print(f" - Match {i+1}: {result['file_path']}")
- print(f" Chunk type: {result['chunk_type']}")
- print(f" Score: {result['score']:.3f}")
- if 'This class handles' in result['content']:
+ print(f" - Match {i+1}: {result.file_path}")
+ print(f" Chunk type: {result.chunk_type}")
+ print(f" Score: {result.score:.3f}")
+ if 'This class handles' in result.content:
print(" [OK] Docstring included with class")
else:
print(" [FAIL] Docstring not found")
@@ -207,10 +227,10 @@ def test_integration():
results = searcher.search("process list of data items", top_k=3)
print(f"\n Test 2 - Method search:")
for i, result in enumerate(results[:1]):
- print(f" - Match {i+1}: {result['file_path']}")
- print(f" Chunk type: {result['chunk_type']}")
- print(f" Parent class: {result.get('parent_class', 'N/A')}")
- if 'Args:' in result['content'] and 'Returns:' in result['content']:
+ print(f" - Match {i+1}: {result.file_path}")
+ print(f" Chunk type: {result.chunk_type}")
+ print(f" Parent class: {getattr(result, 'parent_class', 'N/A')}")
+ if 'Args:' in result.content and 'Returns:' in result.content:
print(" [OK] Docstring included with method")
else:
print(" [FAIL] Method docstring not complete")
@@ -219,19 +239,19 @@ def test_integration():
results = searcher.search("smart chunking capabilities markdown", top_k=3)
print(f"\n Test 3 - Markdown search:")
for i, result in enumerate(results[:1]):
- print(f" - Match {i+1}: {result['file_path']}")
- print(f" Chunk type: {result['chunk_type']}")
- print(f" Lines: {result['start_line']}-{result['end_line']}")
+ print(f" - Match {i+1}: {result.file_path}")
+ print(f" Chunk type: {result.chunk_type}")
+ print(f" Lines: {result.start_line}-{result.end_line}")
# Test 4: Verify chunk navigation
print(f"\n Test 4 - Chunk navigation:")
all_results = searcher.search("", top_k=100) # Get all chunks
- py_chunks = [r for r in all_results if r['file_path'].endswith('.py')]
+ py_chunks = [r for r in all_results if r.file_path.endswith('.py')]
if py_chunks:
first_chunk = py_chunks[0]
- print(f" - First chunk: index={first_chunk.get('chunk_index', 'N/A')}")
- print(f" Next chunk ID: {first_chunk.get('next_chunk_id', 'N/A')}")
+ print(f" - First chunk: index={getattr(first_chunk, 'chunk_index', 'N/A')}")
+ print(f" Next chunk ID: {getattr(first_chunk, 'next_chunk_id', 'N/A')}")
# Verify chain
valid_chain = True
@@ -239,7 +259,7 @@ def test_integration():
curr = py_chunks[i]
next_chunk = py_chunks[i + 1]
expected_next = f"processor_{i+1}"
- if curr.get('next_chunk_id') != expected_next:
+ if getattr(curr, 'next_chunk_id', None) != expected_next:
valid_chain = False
break
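For readers wondering why the assertions switched from result['...'] to attribute access: SearchResult is now handled as an object rather than a dict. An illustrative stand-in, with field names assumed from the usage above rather than taken from the real class definition:
from dataclasses import dataclass
from typing import Optional

@dataclass
class SearchResult:  # stand-in for mini_rag.search.SearchResult
    file_path: str
    chunk_type: str
    score: float
    content: str = ""
    parent_class: Optional[str] = None

r = SearchResult("calculator.py", "method", 0.83, "def add(a, b): ...")
print(r.file_path, r.score)               # attribute access replaces r['file_path']
print(getattr(r, "parent_class", "N/A"))  # safe lookup for optional fields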