🎯 Complete transformation from 5.9GB bloated system to 70MB optimized solution ✨ Key Features: - Hybrid embedding system (Ollama + ML fallback + hash backup) - Intelligent chunking with language-aware parsing - Semantic + BM25 hybrid search with rich context - Zero-config portable design with graceful degradation - Beautiful TUI for beginners + powerful CLI for experts - Comprehensive documentation with 8+ Mermaid diagrams - Professional animated demo (183KB optimized GIF) 🏗️ Architecture Highlights: - LanceDB vector storage with streaming indexing - Smart file tracking (size/mtime) to avoid expensive rehashing - Progressive chunking: Markdown headers → Python functions → fixed-size - Quality filtering: 200+ chars, 20+ words, 30% alphanumeric content - Concurrent batch processing with error recovery 📦 Package Contents: - Core engine: claude_rag/ (11 modules, 2,847 lines) - Entry points: rag-mini (unified), rag-tui (beginner interface) - Documentation: README + 6 guides with visual diagrams - Assets: 3D icon, optimized demo GIF, recording tools - Tests: 8 comprehensive integration and validation tests - Examples: Usage patterns, config templates, dependency analysis 🎥 Demo System: - Scripted demonstration showing 12 files → 58 chunks indexing - Semantic search with multi-line result previews - Complete workflow from TUI startup to CLI mastery - Professional recording pipeline with asciinema + GIF conversion 🛡️ Security & Quality: - Complete .gitignore with personal data protection - Dependency optimization (removed python-dotenv) - Code quality validation and educational test suite - Agent-reviewed architecture and documentation Ready for production use - copy folder, run ./rag-mini, start searching!
150 lines
5.2 KiB
Python
150 lines
5.2 KiB
Python
"""
|
|
Smart language-aware chunking strategies for FSS-Mini-RAG.
|
|
Automatically adapts chunking based on file type and content patterns.
|
|
"""
|
|
|
|
from typing import Dict, Any, List
|
|
from pathlib import Path
|
|
import json
|
|
|
|
class SmartChunkingStrategy:
    """Intelligent chunking that adapts to file types and content.

    Attributes:
        language_configs: Chunking settings keyed by language name.
        default_config: Settings used when a language has no dedicated entry.
    """

    # Files below this size are skipped entirely (units presumably bytes,
    # matching the 50KB `max_file_size` note on the JSON config). Kept in one
    # place so `should_skip_file` and `get_smart_defaults` cannot drift apart.
    TINY_FILE_THRESHOLD = 30

    def __init__(self):
        self.language_configs = {
            'python': {
                'max_size': 3000,  # Larger for better function context
                'min_size': 200,
                'strategy': 'function',
                'prefer_semantic': True,
            },
            'javascript': {
                'max_size': 2500,
                'min_size': 150,
                'strategy': 'function',
                'prefer_semantic': True,
            },
            'markdown': {
                'max_size': 2500,
                'min_size': 300,  # Larger minimum for complete thoughts
                'strategy': 'header',
                'preserve_structure': True,
            },
            'json': {
                'max_size': 1000,  # Smaller for config files
                'min_size': 50,
                'skip_if_large': True,  # Skip huge config JSONs
                'max_file_size': 50000,  # 50KB limit
            },
            'yaml': {
                'max_size': 1500,
                'min_size': 100,
                'strategy': 'key_block',
            },
            'text': {
                'max_size': 2000,
                'min_size': 200,
                'strategy': 'paragraph',
            },
            'bash': {
                'max_size': 1500,
                'min_size': 100,
                'strategy': 'function',
            },
        }

        # Smart defaults for unknown languages
        self.default_config = {
            'max_size': 2000,
            'min_size': 150,
            'strategy': 'semantic',
        }

    def get_config_for_language(self, language: str, file_size: int = 0) -> Dict[str, Any]:
        """Get optimal chunking config for a specific language.

        Args:
            language: Language name; unknown names fall back to
                ``default_config``.
            file_size: Size of the file; ``0`` (the default) disables the
                size-based adjustments.

        Returns:
            A copy of the matching config, so callers may modify it without
            affecting the stored settings.
        """
        config = self.language_configs.get(language, self.default_config).copy()

        # Smart adjustments based on file size
        if file_size > 0:
            if file_size < 500:  # Very small files
                # Halve the chunk ceiling, but never drop below 200.
                config['max_size'] = max(config['max_size'] // 2, 200)
                config['min_size'] = 50
            elif file_size > 20000:  # Large files
                # Allow bigger chunks, capped at 4000.
                config['max_size'] = min(config['max_size'] + 1000, 4000)

        return config

    def should_skip_file(self, language: str, file_size: int) -> bool:
        """Determine if a file should be skipped entirely.

        Args:
            language: Language name of the file.
            file_size: Size of the file.

        Returns:
            ``True`` for JSON files larger than the configured
            ``max_file_size`` and for any file smaller than
            ``TINY_FILE_THRESHOLD``; ``False`` otherwise.
        """
        lang_config = self.language_configs.get(language, {})

        # Skip huge JSON config files
        if language == 'json' and lang_config.get('skip_if_large'):
            if file_size > lang_config.get('max_file_size', 50000):
                return True

        # Skip tiny files that won't provide good context
        return file_size < self.TINY_FILE_THRESHOLD

    def get_smart_defaults(self, project_stats: Dict[str, Any]) -> Dict[str, Any]:
        """Generate smart defaults based on project language distribution.

        Args:
            project_stats: Mapping that may contain ``'languages'`` (language
                name -> file count) and ``'large_files'`` (a count).

        Returns:
            A dict with ``"chunking"``, ``"streaming"`` and ``"files"``
            sections. The per-language entries are copies, so modifying the
            result never alters ``language_configs``.
        """
        languages = project_stats.get('languages', {})

        # Determine primary language (most files); 'python' when nothing is known.
        primary_lang = max(languages, key=languages.get) if languages else 'python'
        primary_config = self.language_configs.get(primary_lang, self.default_config)

        # Smart streaming threshold based on large files
        large_files = project_stats.get('large_files', 0)
        streaming_threshold = 5120 if large_files > 5 else 1048576  # 5KB vs 1MB

        return {
            "chunking": {
                "max_size": primary_config['max_size'],
                "min_size": primary_config['min_size'],
                "strategy": primary_config.get('strategy', 'semantic'),
                # Copy each config: handing out the stored dicts would let a
                # caller's edits silently change this strategy's settings.
                "language_specific": {
                    lang: dict(config)
                    for lang, config in self.language_configs.items()
                    if languages.get(lang, 0) > 0
                },
            },
            "streaming": {
                "enabled": True,
                "threshold_bytes": streaming_threshold,
                "chunk_size_kb": 64,
            },
            "files": {
                "skip_tiny_files": True,
                "tiny_threshold": self.TINY_FILE_THRESHOLD,
                "smart_json_filtering": True,
            },
        }
|
|
|
|
# Example usage
|
|
def analyze_and_suggest(manifest_data: Dict[str, Any]) -> Dict[str, Any]:
    """Summarise a project manifest and suggest a matching configuration.

    Args:
        manifest_data: Mapping whose optional ``'files'`` entry maps each file
            to an info dict; the ``'language'`` and ``'size'`` keys of each
            info dict are read when present.

    Returns:
        The result of ``SmartChunkingStrategy.get_smart_defaults`` for the
        statistics gathered from the manifest.
    """
    from collections import Counter

    file_entries = manifest_data.get('files', {})

    # Tally files per language; entries without a language count as 'unknown'.
    language_counts = Counter(
        entry.get('language', 'unknown') for entry in file_entries.values()
    )

    # An entry counts as large when its recorded size exceeds 10000.
    big_file_count = sum(
        1 for entry in file_entries.values() if entry.get('size', 0) > 10000
    )

    project_stats = {
        'languages': dict(language_counts),
        'large_files': big_file_count,
        'total_files': len(file_entries),
    }

    return SmartChunkingStrategy().get_smart_defaults(project_stats)