🎯 Complete transformation from 5.9GB bloated system to 70MB optimized solution

✨ Key Features:
- Hybrid embedding system (Ollama + ML fallback + hash backup)
- Intelligent chunking with language-aware parsing
- Semantic + BM25 hybrid search with rich context
- Zero-config portable design with graceful degradation
- Beautiful TUI for beginners + powerful CLI for experts
- Comprehensive documentation with 8+ Mermaid diagrams
- Professional animated demo (183KB optimized GIF)

🏗️ Architecture Highlights:
- LanceDB vector storage with streaming indexing
- Smart file tracking (size/mtime) to avoid expensive rehashing
- Progressive chunking: Markdown headers → Python functions → fixed-size
- Quality filtering: 200+ chars, 20+ words, 30% alphanumeric content
- Concurrent batch processing with error recovery

📦 Package Contents:
- Core engine: claude_rag/ (11 modules, 2,847 lines)
- Entry points: rag-mini (unified), rag-tui (beginner interface)
- Documentation: README + 6 guides with visual diagrams
- Assets: 3D icon, optimized demo GIF, recording tools
- Tests: 8 comprehensive integration and validation tests
- Examples: Usage patterns, config templates, dependency analysis

🎥 Demo System:
- Scripted demonstration showing 12 files → 58 chunks indexing
- Semantic search with multi-line result previews
- Complete workflow from TUI startup to CLI mastery
- Professional recording pipeline with asciinema + GIF conversion

🛡️ Security & Quality:
- Complete .gitignore with personal data protection
- Dependency optimization (removed python-dotenv)
- Code quality validation and educational test suite
- Agent-reviewed architecture and documentation

Ready for production use - copy folder, run ./rag-mini, start searching!
355 lines
12 KiB
Python
355 lines
12 KiB
Python
"""
|
||
Integration test to verify all three agents' work integrates properly.
|
||
"""
|
||
|
||
import sys
|
||
import os
|
||
import tempfile
|
||
from pathlib import Path
|
||
|
||
# Fix Windows encoding so the script's output is emitted as UTF-8.
if sys.platform == 'win32':
    # PYTHONUTF8 is consulted at interpreter startup, so setting it here
    # cannot change the running process; it is inherited by any child
    # Python processes started afterwards.
    os.environ['PYTHONUTF8'] = '1'
    # stdout may have been replaced by an object without reconfigure()
    # (for example while output is being captured), so probe before calling.
    if hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')
|
||
|
||
from claude_rag.chunker import CodeChunker
|
||
from claude_rag.indexer import ProjectIndexer
|
||
from claude_rag.search import CodeSearcher
|
||
from claude_rag.embeddings import CodeEmbedder
|
||
|
||
def test_chunker():
    """Check that the chunker emits chunks carrying all required metadata.

    Chunks an in-memory Python fixture with ``CodeChunker(min_chunk_size=1)``
    and verifies, per chunk, that ``chunk_index``, ``total_chunks`` and
    ``file_lines`` are set, that neighbouring chunks are linked through
    ``prev_chunk_id`` / ``next_chunk_id``, and that every chunk whose type is
    ``'method'`` has a ``parent_class``.

    Returns:
        bool: True when at least one chunk was produced and no metadata issue
        was found, False otherwise.
    """
    print("1. Testing Chunker...")

    # Fixture: a small but realistic module (two classes, two functions).
    # The literal starts at column 0 so the chunker sees valid Python.
    test_code = '''"""Test module for integration testing the chunker."""

import os
import sys

class TestClass:
    """A test class with multiple methods."""

    def __init__(self):
        """Initialize the test class."""
        self.value = 42
        self.name = "test"

    def method_one(self):
        """First method with some logic."""
        result = self.value * 2
        return result

    def method_two(self, x):
        """Second method that takes a parameter."""
        if x > 0:
            return self.value + x
        else:
            return self.value - x

    def method_three(self):
        """Third method for testing."""
        data = []
        for i in range(10):
            data.append(i * self.value)
        return data

class AnotherClass:
    """Another test class."""

    def __init__(self, name):
        """Initialize with name."""
        self.name = name

    def process(self):
        """Process something."""
        return f"Processing {self.name}"

def standalone_function(arg1, arg2):
    """A standalone function that does something."""
    result = arg1 + arg2
    return result * 2

def another_function():
    """Another standalone function."""
    data = {"key": "value", "number": 123}
    return data
'''

    chunker = CodeChunker(min_chunk_size=1)  # Use small chunk size for testing
    chunks = chunker.chunk_file(Path("test.py"), test_code)

    print(f" Created {len(chunks)} chunks")

    # Debug: show what chunks were created.
    print(" Chunks created:")
    for chunk in chunks:
        print(f" - Type: {chunk.chunk_type}, Name: {chunk.name}, Lines: {chunk.start_line}-{chunk.end_line}")

    issues = []
    # Without this, an empty result would pass because there is nothing to check.
    if not chunks:
        issues.append("No chunks were created")

    last_index = len(chunks) - 1
    for i, chunk in enumerate(chunks):
        # Positional metadata must be present on every chunk.
        for field in ('chunk_index', 'total_chunks', 'file_lines'):
            if getattr(chunk, field) is None:
                issues.append(f"Chunk {i} missing {field}")

        # Neighbour links: the first chunk has no predecessor and the last
        # has no successor, so those two ends are exempt.
        if i > 0 and chunk.prev_chunk_id is None:
            issues.append(f"Chunk {i} missing prev_chunk_id")
        if i < last_index and chunk.next_chunk_id is None:
            issues.append(f"Chunk {i} missing next_chunk_id")

        # Methods must record the class they belong to.
        if chunk.chunk_type == 'method' and chunk.parent_class is None:
            issues.append(f"Method chunk {chunk.name} missing parent_class")

        print(f" - Chunk {i}: {chunk.chunk_type} '{chunk.name}' "
              f"[{chunk.chunk_index}/{chunk.total_chunks}] "
              f"prev={chunk.prev_chunk_id} next={chunk.next_chunk_id}")

    if issues:
        print(" Issues found:")
        for issue in issues:
            print(f" - {issue}")
    else:
        print(" All metadata present")

    return not issues
|
||
|
||
def test_indexer_storage():
    """Check that the indexer persists the chunk-linking metadata columns.

    Indexes a one-file temporary project and inspects the indexer's table
    (via ``to_pandas()``) for the columns ``chunk_id``, ``prev_chunk_id``,
    ``next_chunk_id`` and ``parent_class``.

    Returns:
        bool: True when the table exists and contains every required column;
        False when the table is unavailable or a column is missing.
    """
    print("\n2. Testing Indexer Storage...")

    required_fields = ['chunk_id', 'prev_chunk_id', 'next_chunk_id', 'parent_class']

    with tempfile.TemporaryDirectory() as tmpdir:
        project_path = Path(tmpdir)

        # Create test file (a class with a single method).
        test_file = project_path / "test.py"
        test_file.write_text('''
class MyClass:
    def my_method(self):
        return 42
''', encoding="utf-8")

        # Index the project with small chunk size for testing.
        chunker = CodeChunker(min_chunk_size=1)
        indexer = ProjectIndexer(project_path, chunker=chunker)
        stats = indexer.index_project()

        print(f" Indexed {stats['chunks_created']} chunks")

        # Nothing to inspect: report failure explicitly instead of falling
        # through to a result that was never computed.
        if not indexer.table:
            print(" No index table available to inspect")
            return False

        df = indexer.table.to_pandas()
        columns = df.columns.tolist()

        missing_fields = [f for f in required_fields if f not in columns]
        if missing_fields:
            print(f" Missing fields in database: {missing_fields}")
            print(f" Current fields: {columns}")
            return False

        print(" All required fields in database schema")

        # Show the first stored row so the values can be checked by eye.
        if len(df) > 0:
            sample = df.iloc[0]
            for field in required_fields:
                print(f" Sample {field}: {sample.get(field, 'MISSING')}")

        return True
|
||
|
||
def test_search_integration():
    """Check that search works end to end and can attach surrounding context.

    Indexes a temporary project containing ``math_utils.py``, then:

    1. runs a weighted search (``semantic_weight=0.3``, ``bm25_weight=0.7``)
       and requires at least one result;
    2. looks up the context of the first chunk whose type is ``'method'``
       through ``get_chunk_context`` and prints what was found;
    3. runs a search with ``include_context=True`` and requires at least one
       result carrying ``context_before``, ``context_after`` or
       ``parent_chunk``.

    Returns:
        bool: True when the checks pass, False as soon as one of them fails.
    """
    print("\n3. Testing Search Integration...")

    with tempfile.TemporaryDirectory() as tmpdir:
        project_path = Path(tmpdir)

        # Create test files with proper content that will create multiple
        # chunks. The literal starts at column 0 so it is valid Python.
        (project_path / "math_utils.py").write_text('''"""Math utilities module."""

import math

class Calculator:
    """A simple calculator class."""

    def __init__(self):
        """Initialize calculator."""
        self.result = 0

    def add(self, a, b):
        """Add two numbers."""
        self.result = a + b
        return self.result

    def multiply(self, a, b):
        """Multiply two numbers."""
        self.result = a * b
        return self.result

    def divide(self, a, b):
        """Divide two numbers."""
        if b == 0:
            raise ValueError("Cannot divide by zero")
        self.result = a / b
        return self.result

class AdvancedCalculator(Calculator):
    """Advanced calculator with more operations."""

    def power(self, a, b):
        """Raise a to power b."""
        self.result = a ** b
        return self.result

    def sqrt(self, a):
        """Calculate square root."""
        self.result = math.sqrt(a)
        return self.result

def compute_average(numbers):
    """Compute average of a list."""
    if not numbers:
        return 0
    return sum(numbers) / len(numbers)

def compute_median(numbers):
    """Compute median of a list."""
    if not numbers:
        return 0
    sorted_nums = sorted(numbers)
    n = len(sorted_nums)
    if n % 2 == 0:
        return (sorted_nums[n//2-1] + sorted_nums[n//2]) / 2
    return sorted_nums[n//2]
''', encoding="utf-8")

        # Index with small chunk size for testing.
        chunker = CodeChunker(min_chunk_size=1)
        indexer = ProjectIndexer(project_path, chunker=chunker)
        indexer.index_project()

        searcher = CodeSearcher(project_path)

        # Test BM25 integration.
        results = searcher.search("multiply numbers", limit=5,
                                  semantic_weight=0.3, bm25_weight=0.7)
        if not results:
            print(" No search results returned")
            return False

        print(f" BM25 + semantic search returned {len(results)} results")
        for r in results[:2]:
            print(f" - {r.chunk_type} '{r.name}' score={r.score:.3f}")

        # Test context retrieval.
        print("\n Testing context retrieval...")
        if not searcher.table:
            # NOTE(review): with no table the context checks are skipped and
            # the test still passes - confirm this is intended.
            return True

        df = searcher.table.to_pandas()
        print(f" Total chunks in DB: {len(df)}")

        # Find a method chunk to test parent context.
        method_chunks = df[df['chunk_type'] == 'method']
        if method_chunks.empty:
            print(" No method chunks found in database")
            return False

        method_chunk_id = method_chunks.iloc[0]['chunk_id']
        context = searcher.get_chunk_context(method_chunk_id)

        if context['chunk']:
            print(f" Got main chunk: {context['chunk'].name}")
        # Informational only: a missing neighbour or parent is not a failure.
        for key, label, absent_message in (
            ('prev', 'previous', " - No previous chunk (might be first)"),
            ('next', 'next', " - No next chunk (might be last)"),
            ('parent', 'parent', " - No parent chunk"),
        ):
            related = context[key]
            if related:
                print(f" Got {label} chunk: {related.name}")
            else:
                print(absent_message)

        # Test include_context in search.
        results_with_context = searcher.search("add", include_context=True, limit=2)
        if not results_with_context:
            print(" No search results returned")
            return False

        print(f" Found {len(results_with_context)} results with context")
        any_context = False
        for r in results_with_context:
            has_context = bool(r.context_before or r.context_after or r.parent_chunk)
            any_context = any_context or has_context
            print(f" - {r.name}: context_before={bool(r.context_before)}, "
                  f"context_after={bool(r.context_after)}, parent={bool(r.parent_chunk)}")

        # At least one result must carry some context.
        if any_context:
            print(" Search with context working")
            return True

        print(" Search returned results but no context attached")
        return False
|
||
|
||
def test_server():
    """Smoke-test the server by importing and instantiating ``RAGServer``.

    The server is only constructed (for the current directory, port 7778).

    Returns:
        bool: True when the import and construction succeed, False when
        either raises an ``Exception`` (the error is printed).
    """
    print("\n4. Testing Server...")

    # Import lazily, inside the try, so an import failure is reported as a
    # failed check rather than an error while loading this script.
    try:
        from claude_rag.server import RAGServer
        _server = RAGServer(Path("."), port=7778)
        print(" Server can be instantiated")
        outcome = True
    except Exception as exc:
        print(f" Server error: {exc}")
        outcome = False
    return outcome
|
||
|
||
def main():
    """Run all integration checks and print a PASS/FAIL summary.

    The checks run in a fixed order: chunker, indexer, search, server. A
    check that raises is reported as failed (with its traceback) instead of
    aborting the run, so the summary is always printed.

    Returns:
        bool: True when every check passed, False otherwise.
    """
    import traceback

    banner = "=" * 50
    print(banner)
    print("RAG System Integration Check")
    print(banner)

    checks = (
        ("Chunker", test_chunker),
        ("Indexer", test_indexer_storage),
        ("Search", test_search_integration),
        ("Server", test_server),
    )

    results = {}
    for component, check in checks:
        try:
            results[component] = bool(check())
        except Exception as e:
            # Keep going: one crashing check must not hide the others.
            print(f" {component} error: {e}")
            traceback.print_exc()
            results[component] = False

    print("\n" + banner)
    print("SUMMARY:")
    print(banner)

    for component, passed in results.items():
        status = " PASS" if passed else " FAIL"
        print(f"{component}: {status}")

    all_passed = all(results.values())
    if all_passed:
        print("\n All integration tests passed!")
    else:
        print("\n Some tests failed - fixes needed!")

    return all_passed
|
||
|
||
if __name__ == "__main__":
    # Exit status: 0 when every check passed, 1 otherwise.
    sys.exit(int(not main()))