Fss-Rag-Mini/tests/test_hybrid_search.py
#!/usr/bin/env python3
"""
Test and benchmark the hybrid BM25 + semantic search system.
Shows performance metrics and search quality comparisons.
"""
import time
from pathlib import Path
from typing import Any, Dict

from rich.console import Console
from rich.progress import track
from rich.table import Table

from mini_rag.search import CodeSearcher

console = Console()


class SearchTester:
    """Test harness for hybrid search evaluation."""

    def __init__(self, project_path: Path):
        self.project_path = project_path
        console.print(f"\n[cyan]Initializing search system for: {project_path}[/cyan]")

        # Initialize searcher
        start = time.time()
        self.searcher = CodeSearcher(project_path)
        init_time = time.time() - start
        console.print(f"[green]✓ Initialized in {init_time:.2f}s[/green]")

        # Get statistics
        stats = self.searcher.get_statistics()
        if 'error' not in stats:
            console.print(
                f"[dim]Index contains {stats['total_chunks']} chunks "
                f"from {stats['unique_files']} files[/dim]\n"
            )

    def run_query(self, query: str, limit: int = 10,
                  semantic_only: bool = False,
                  bm25_only: bool = False) -> Dict[str, Any]:
        """Run a single query and return metrics."""
        # Set weights based on mode
        if semantic_only:
            semantic_weight, bm25_weight = 1.0, 0.0
            mode = "Semantic Only"
        elif bm25_only:
            semantic_weight, bm25_weight = 0.0, 1.0
            mode = "BM25 Only"
        else:
            semantic_weight, bm25_weight = 0.7, 0.3
            mode = "Hybrid (70/30)"

        # Run search
        start = time.time()
        results = self.searcher.search(
            query=query,
            limit=limit,
            semantic_weight=semantic_weight,
            bm25_weight=bm25_weight
        )
        search_time = time.time() - start

        return {
            'query': query,
            'mode': mode,
            'results': results,
            'search_time_ms': search_time * 1000,
            'num_results': len(results),
            'top_score': results[0].score if results else 0,
            'avg_score': sum(r.score for r in results) / len(results) if results else 0,
        }

    def compare_search_modes(self, query: str, limit: int = 5):
        """Compare results across different search modes."""
        console.print(f"\n[bold cyan]Query:[/bold cyan] '{query}'")
        console.print(f"[dim]Top {limit} results per mode[/dim]\n")

        # Run searches in all modes
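        # Each tuple is (mode_name, semantic_only, bm25_only), matching the
        # flags accepted by run_query above.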
        modes = [
            ('hybrid', False, False),
            ('semantic', True, False),
            ('bm25', False, True)
        ]
        all_results = {}
        for mode_name, semantic_only, bm25_only in modes:
            result = self.run_query(query, limit, semantic_only, bm25_only)
            all_results[mode_name] = result

        # Create comparison table
        table = Table(title="Search Mode Comparison")
        table.add_column("Metric", style="cyan", width=20)
        table.add_column("Hybrid (70/30)", style="green")
        table.add_column("Semantic Only", style="blue")
        table.add_column("BM25 Only", style="magenta")

        # Add metrics
        table.add_row(
            "Search Time (ms)",
            f"{all_results['hybrid']['search_time_ms']:.1f}",
            f"{all_results['semantic']['search_time_ms']:.1f}",
            f"{all_results['bm25']['search_time_ms']:.1f}"
        )
        table.add_row(
            "Results Found",
            str(all_results['hybrid']['num_results']),
            str(all_results['semantic']['num_results']),
            str(all_results['bm25']['num_results'])
        )
        table.add_row(
            "Top Score",
            f"{all_results['hybrid']['top_score']:.3f}",
            f"{all_results['semantic']['top_score']:.3f}",
            f"{all_results['bm25']['top_score']:.3f}"
        )
        table.add_row(
            "Avg Score",
            f"{all_results['hybrid']['avg_score']:.3f}",
            f"{all_results['semantic']['avg_score']:.3f}",
            f"{all_results['bm25']['avg_score']:.3f}"
        )
        console.print(table)

        # Show top results from each mode
        console.print("\n[bold]Top Results by Mode:[/bold]")
        for mode_name, result_data in all_results.items():
            console.print(f"\n[bold cyan]{result_data['mode']}:[/bold cyan]")
            for i, result in enumerate(result_data['results'][:3], 1):
                console.print(f"\n{i}. [green]{result.file_path}[/green]:{result.start_line}-{result.end_line}")
                console.print(f"   [dim]Type: {result.chunk_type} | Name: {result.name} | Score: {result.score:.3f}[/dim]")
                # Show snippet
                lines = result.content.splitlines()[:5]
                for line in lines:
                    console.print(f"   [dim]{line[:80]}{'...' if len(line) > 80 else ''}[/dim]")

    def test_query_types(self):
        """Test different types of queries to show system capabilities."""
        test_queries = [
            # Keyword-heavy queries (should benefit from BM25)
            {
                'query': 'class CodeSearcher search method',
                'description': 'Specific class and method names',
                'expected': 'Should find exact matches with BM25 boost'
            },
            {
                'query': 'import pandas numpy torch',
                'description': 'Multiple import keywords',
                'expected': 'BM25 should excel at finding import statements'
            },
            # Semantic queries (should benefit from embeddings)
            {
                'query': 'find similar code chunks using vector similarity',
                'description': 'Natural language description',
                'expected': 'Semantic search should understand intent'
            },
            {
                'query': 'how to initialize database connection',
                'description': 'How-to question',
                'expected': 'Semantic search should find relevant implementations'
            },
            # Mixed queries (benefit from hybrid)
            {
                'query': 'BM25 scoring implementation for search ranking',
                'description': 'Technical terms + intent',
                'expected': 'Hybrid should balance keyword and semantic matching'
            },
            {
                'query': 'embedding vectors for code search with transformers',
                'description': 'Domain-specific terminology',
                'expected': 'Hybrid should leverage both approaches'
            }
        ]

        console.print("\n[bold yellow]Query Type Analysis[/bold yellow]")
        console.print("[dim]Testing different query patterns to demonstrate hybrid search benefits[/dim]\n")

        for test_case in test_queries:
            console.print()
            console.rule(f"[cyan]{test_case['description']}[/cyan]")
            console.print(f"[dim]{test_case['expected']}[/dim]")
            self.compare_search_modes(test_case['query'], limit=3)
            time.sleep(0.5)  # Brief pause between tests

    def benchmark_performance(self, num_queries: int = 50):
        """Run performance benchmarks."""
        console.print("\n[bold yellow]Performance Benchmark[/bold yellow]")
        console.print(f"[dim]Running {num_queries} queries to measure performance[/dim]\n")

        # Sample queries for benchmarking
        benchmark_queries = [
            "search function implementation",
            "class definition with methods",
            "import statements and dependencies",
            "error handling try except",
            "database connection setup",
            "api endpoint handler",
            "test cases unit testing",
            "configuration settings",
            "logging and debugging",
            "performance optimization"
        ] * (num_queries // 10 + 1)
        benchmark_queries = benchmark_queries[:num_queries]

        # Benchmark each mode
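        # Each tuple is (mode_name, semantic_weight, bm25_weight); the weights
        # are passed straight through to searcher.search() below.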
        modes = [
            ('Hybrid (70/30)', 0.7, 0.3),
            ('Semantic Only', 1.0, 0.0),
            ('BM25 Only', 0.0, 1.0)
        ]

        results_table = Table(title="Performance Benchmark Results")
        results_table.add_column("Mode", style="cyan")
        results_table.add_column("Avg Time (ms)", style="green")
        results_table.add_column("Min Time (ms)", style="blue")
        results_table.add_column("Max Time (ms)", style="red")
        results_table.add_column("Total Time (s)", style="magenta")

        for mode_name, sem_weight, bm25_weight in modes:
            times = []
            console.print(f"[cyan]Testing {mode_name}...[/cyan]")
            for query in track(benchmark_queries, description=f"Running {mode_name}"):
                start = time.time()
                self.searcher.search(
                    query=query,
                    limit=10,
                    semantic_weight=sem_weight,
                    bm25_weight=bm25_weight
                )
                elapsed = (time.time() - start) * 1000
                times.append(elapsed)

            # Calculate statistics
            avg_time = sum(times) / len(times)
            min_time = min(times)
            max_time = max(times)
            total_time = sum(times) / 1000

            results_table.add_row(
                mode_name,
                f"{avg_time:.2f}",
                f"{min_time:.2f}",
                f"{max_time:.2f}",
                f"{total_time:.2f}"
            )

        console.print("\n")
        console.print(results_table)

    def test_diversity_constraints(self):
        """Test the diversity constraints in search results."""
        console.print("\n[bold yellow]Diversity Constraints Test[/bold yellow]")
        console.print("[dim]Verifying max 2 chunks per file and chunk type diversity[/dim]\n")

        # Query that might return many results from same files
        query = "function implementation code search"
        results = self.searcher.search(query, limit=20)

        # Analyze diversity
        file_counts = {}
        chunk_types = {}
        for result in results:
            file_counts[result.file_path] = file_counts.get(result.file_path, 0) + 1
            chunk_types[result.chunk_type] = chunk_types.get(result.chunk_type, 0) + 1

        # Create diversity report
        table = Table(title="Result Diversity Analysis")
        table.add_column("Metric", style="cyan")
        table.add_column("Value", style="green")
        table.add_row("Total Results", str(len(results)))
        table.add_row("Unique Files", str(len(file_counts)))
        table.add_row("Max Chunks per File", str(max(file_counts.values()) if file_counts else 0))
        table.add_row("Unique Chunk Types", str(len(chunk_types)))
        console.print(table)

        # Show file distribution
        if len(file_counts) > 0:
            console.print("\n[bold]File Distribution:[/bold]")
            for file_path, count in sorted(file_counts.items(), key=lambda x: x[1], reverse=True)[:5]:
                console.print(f"  {count}x {file_path}")

        # Show chunk type distribution
        if len(chunk_types) > 0:
            console.print("\n[bold]Chunk Type Distribution:[/bold]")
            for chunk_type, count in sorted(chunk_types.items(), key=lambda x: x[1], reverse=True):
                console.print(f"  {chunk_type}: {count} chunks")

        # Verify constraints
        console.print("\n[bold]Constraint Verification:[/bold]")
        max_per_file = max(file_counts.values()) if file_counts else 0
        if max_per_file <= 2:
            console.print("  [green]✓ Max 2 chunks per file constraint satisfied[/green]")
        else:
            console.print(f"  [red]✗ Max chunks per file exceeded: {max_per_file}[/red]")


def main():
    """Run comprehensive hybrid search tests."""
    import sys

    if len(sys.argv) > 1:
        project_path = Path(sys.argv[1])
    else:
        project_path = Path.cwd()

    if not (project_path / '.mini-rag').exists():
        console.print("[red]Error: No RAG index found. Run 'mini-rag index' first.[/red]")
        return

    # Create tester
    tester = SearchTester(project_path)

    # Run all tests
    console.print("\n" + "=" * 80)
    console.print("[bold green]Mini RAG Hybrid Search Test Suite[/bold green]")
    console.print("=" * 80)

    # Test 1: Query type analysis
    tester.test_query_types()

    # Test 2: Performance benchmark
    console.print("\n" + "-" * 80)
    tester.benchmark_performance(num_queries=30)

    # Test 3: Diversity constraints
    console.print("\n" + "-" * 80)
    tester.test_diversity_constraints()

    # Summary
    console.print("\n" + "=" * 80)
    console.print("[bold green]Test Suite Complete![/bold green]")
    console.print("\n[dim]The hybrid search combines:[/dim]")
    console.print("[dim]  • Semantic understanding from transformer embeddings[/dim]")
    console.print("[dim]  • Keyword relevance from BM25 scoring[/dim]")
    console.print("[dim]  • Result diversity through intelligent filtering[/dim]")
    console.print("[dim]  • Performance optimization through concurrent processing[/dim]")
    console.print("=" * 80 + "\n")


if __name__ == "__main__":
    main()