Fss-Rag-Mini/tests/test_rag_integration.py
BobAi 4166d0a362 Initial release: FSS-Mini-RAG - Lightweight semantic code search system
🎯 Complete transformation from 5.9GB bloated system to 70MB optimized solution

 Key Features:
- Hybrid embedding system (Ollama + ML fallback + hash backup)
- Intelligent chunking with language-aware parsing
- Semantic + BM25 hybrid search with rich context
- Zero-config portable design with graceful degradation
- Beautiful TUI for beginners + powerful CLI for experts
- Comprehensive documentation with 8+ Mermaid diagrams
- Professional animated demo (183KB optimized GIF)

🏗️ Architecture Highlights:
- LanceDB vector storage with streaming indexing
- Smart file tracking (size/mtime) to avoid expensive rehashing
- Progressive chunking: Markdown headers → Python functions → fixed-size
- Quality filtering: 200+ chars, 20+ words, 30% alphanumeric content
- Concurrent batch processing with error recovery

📦 Package Contents:
- Core engine: claude_rag/ (11 modules, 2,847 lines)
- Entry points: rag-mini (unified), rag-tui (beginner interface)
- Documentation: README + 6 guides with visual diagrams
- Assets: 3D icon, optimized demo GIF, recording tools
- Tests: 8 comprehensive integration and validation tests
- Examples: Usage patterns, config templates, dependency analysis

🎥 Demo System:
- Scripted demonstration showing 12 files → 58 chunks indexing
- Semantic search with multi-line result previews
- Complete workflow from TUI startup to CLI mastery
- Professional recording pipeline with asciinema + GIF conversion

🛡️ Security & Quality:
- Complete .gitignore with personal data protection
- Dependency optimization (removed python-dotenv)
- Code quality validation and educational test suite
- Agent-reviewed architecture and documentation

Ready for production use - copy folder, run ./rag-mini, start searching!
2025-08-12 16:38:28 +10:00

257 lines
7.3 KiB
Python

#!/usr/bin/env python3
"""Test RAG system integration with smart chunking."""
import tempfile
import shutil
from pathlib import Path
from claude_rag.indexer import ProjectIndexer
from claude_rag.search import CodeSearcher
# Sample Python file with proper structure, written to the temp project as
# an indexing fixture. It must be syntactically valid Python so the
# language-aware chunker can split it on class/function boundaries.
sample_code = '''"""
Sample module for testing RAG system.

This module demonstrates various Python constructs.
"""

import os
import sys
from typing import List, Dict, Optional
from dataclasses import dataclass

# Module-level constants
DEFAULT_TIMEOUT = 30
MAX_RETRIES = 3


@dataclass
class Config:
    """Configuration dataclass."""
    timeout: int = DEFAULT_TIMEOUT
    retries: int = MAX_RETRIES


class DataProcessor:
    """
    Main data processor class.

    This class handles the processing of various data types
    and provides a unified interface for data operations.
    """

    def __init__(self, config: Config):
        """
        Initialize the processor with configuration.

        Args:
            config: Configuration object
        """
        self.config = config
        self._cache = {}
        self._initialized = False

    def process(self, data: List[Dict]) -> List[Dict]:
        """
        Process a list of data items.

        Args:
            data: List of dictionaries to process

        Returns:
            Processed data list
        """
        if not self._initialized:
            self._initialize()
        results = []
        for item in data:
            processed = self._process_item(item)
            results.append(processed)
        return results

    def _initialize(self):
        """Initialize internal state."""
        self._cache.clear()
        self._initialized = True

    def _process_item(self, item: Dict) -> Dict:
        """Process a single item."""
        # Implementation details
        return {**item, 'processed': True}


def main():
    """Main entry point."""
    config = Config()
    processor = DataProcessor(config)
    test_data = [
        {'id': 1, 'value': 'test1'},
        {'id': 2, 'value': 'test2'},
    ]
    results = processor.process(test_data)
    print(f"Processed {len(results)} items")


if __name__ == "__main__":
    main()
'''
# Sample markdown file used to exercise header-based markdown chunking.
# The Python snippets inside the code fences are indented normally so the
# fixture resembles real documentation.
sample_markdown = '''# RAG System Documentation

## Overview

This is the documentation for the RAG system that demonstrates
smart chunking capabilities.

## Features

### Smart Code Chunking

The system intelligently chunks code files by:

- Keeping docstrings with their functions/classes
- Creating logical boundaries at function and class definitions
- Preserving context through parent-child relationships

### Markdown Support

Markdown files are chunked by sections with:

- Header-based splitting
- Context overlap between chunks
- Preservation of document structure

## Usage

### Basic Example

```python
from claude_rag import ProjectIndexer

indexer = ProjectIndexer("/path/to/project")
indexer.index_project()
```

### Advanced Configuration

You can customize the chunking behavior:

```python
from claude_rag import CodeChunker

chunker = CodeChunker(
    max_chunk_size=1000,
    min_chunk_size=50
)
```

## API Reference

### ProjectIndexer

Main class for indexing projects.

### CodeSearcher

Provides semantic search capabilities.
'''
def test_integration():
    """Exercise the complete RAG pipeline with smart chunking.

    Builds a throwaway project containing one Python file and one markdown
    file, indexes it, then runs a series of searches verifying chunk
    content, chunk metadata, and next-chunk navigation links. Outcomes are
    reported with printed [OK]/[FAIL] markers rather than asserts, so every
    check runs even when an earlier one fails.
    """
    # Temporary project directory; removed automatically on exit.
    with tempfile.TemporaryDirectory() as tmpdir:
        project_path = Path(tmpdir)

        # Create test files
        (project_path / "processor.py").write_text(sample_code)
        (project_path / "README.md").write_text(sample_markdown)

        print("=" * 60)
        print("TESTING RAG SYSTEM INTEGRATION")
        print("=" * 60)

        # 1. Index the project and report basic stats.
        print("\n1. Indexing project...")
        indexer = ProjectIndexer(project_path)
        stats = indexer.index_project()
        print(f" - Files indexed: {stats['files_indexed']}")
        print(f" - Total chunks: {stats['total_chunks']}")
        print(f" - Indexing time: {stats['indexing_time']:.2f}s")

        # 2. Verify chunks were created properly.
        print("\n2. Verifying chunk metadata...")

        # Initialize searcher over the freshly built index.
        searcher = CodeSearcher(project_path)

        print("\n3. Testing search functionality...")

        # Test 1: a class chunk should carry its docstring along with it.
        results = searcher.search("data processor class unified interface", top_k=3)
        print(f"\n Test 1 - Class search:")
        for i, result in enumerate(results[:1]):
            print(f" - Match {i+1}: {result['file_path']}")
            print(f" Chunk type: {result['chunk_type']}")
            print(f" Score: {result['score']:.3f}")
            if 'This class handles' in result['content']:
                print(" [OK] Docstring included with class")
            else:
                print(" [FAIL] Docstring not found")

        # Test 2: a method chunk should include its full docstring
        # (Args/Returns sections) and expose its parent class.
        results = searcher.search("process list of data items", top_k=3)
        print(f"\n Test 2 - Method search:")
        for i, result in enumerate(results[:1]):
            print(f" - Match {i+1}: {result['file_path']}")
            print(f" Chunk type: {result['chunk_type']}")
            print(f" Parent class: {result.get('parent_class', 'N/A')}")
            if 'Args:' in result['content'] and 'Returns:' in result['content']:
                print(" [OK] Docstring included with method")
            else:
                print(" [FAIL] Method docstring not complete")

        # Test 3: markdown sections should be retrievable with line spans.
        results = searcher.search("smart chunking capabilities markdown", top_k=3)
        print(f"\n Test 3 - Markdown search:")
        for i, result in enumerate(results[:1]):
            print(f" - Match {i+1}: {result['file_path']}")
            print(f" Chunk type: {result['chunk_type']}")
            print(f" Lines: {result['start_line']}-{result['end_line']}")

        # Test 4: verify the next_chunk_id navigation chain.
        print(f"\n Test 4 - Chunk navigation:")
        all_results = searcher.search("", top_k=100)  # Get all chunks
        py_chunks = [r for r in all_results if r['file_path'].endswith('.py')]
        # Search results come back in relevance order, not file order; sort
        # by chunk_index so the sequential-ID chain check below is meaningful.
        py_chunks.sort(key=lambda r: r.get('chunk_index', 0))
        if py_chunks:
            first_chunk = py_chunks[0]
            print(f" - First chunk: index={first_chunk.get('chunk_index', 'N/A')}")
            print(f" Next chunk ID: {first_chunk.get('next_chunk_id', 'N/A')}")

            # Each chunk should link to its successor: processor_1, _2, ...
            valid_chain = True
            for i in range(len(py_chunks) - 1):
                curr = py_chunks[i]
                expected_next = f"processor_{i+1}"
                if curr.get('next_chunk_id') != expected_next:
                    valid_chain = False
                    break

            if valid_chain:
                print(" [OK] Chunk navigation chain is valid")
            else:
                print(" [FAIL] Chunk navigation chain broken")

        print("\n" + "=" * 60)
        print("INTEGRATION TEST COMPLETED")
        print("=" * 60)
# Allow running this integration test directly as a script (it also runs
# under pytest via the test_ prefix).
if __name__ == "__main__":
    test_integration()