fss-mini-rag-github/claude_rag/smart_chunking.py
Initial release: FSS-Mini-RAG - Lightweight semantic code search system
🎯 Complete transformation from 5.9GB bloated system to 70MB optimized solution

 Key Features:
- Hybrid embedding system (Ollama + ML fallback + hash backup)
- Intelligent chunking with language-aware parsing
- Semantic + BM25 hybrid search with rich context
- Zero-config portable design with graceful degradation
- Beautiful TUI for beginners + powerful CLI for experts
- Comprehensive documentation with 8+ Mermaid diagrams
- Professional animated demo (183KB optimized GIF)

🏗️ Architecture Highlights:
- LanceDB vector storage with streaming indexing
- Smart file tracking (size/mtime) to avoid expensive rehashing
- Progressive chunking: Markdown headers → Python functions → fixed-size
- Quality filtering: 200+ chars, 20+ words, 30% alphanumeric content (sketched below)
- Concurrent batch processing with error recovery
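
A minimal sketch of that quality filter, assuming the three thresholds above are the only checks; passes_quality_filter() is a hypothetical name, since the real check lives elsewhere in the claude_rag chunking code:

    def passes_quality_filter(chunk_text: str) -> bool:
        """Reject chunks too short or too noisy to make useful search results."""
        words = chunk_text.split()
        if len(chunk_text) < 200 or len(words) < 20:
            return False
        alphanumeric = sum(ch.isalnum() for ch in chunk_text)
        return alphanumeric / max(len(chunk_text), 1) >= 0.30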

📦 Package Contents:
- Core engine: claude_rag/ (11 modules, 2,847 lines)
- Entry points: rag-mini (unified), rag-tui (beginner interface)
- Documentation: README + 6 guides with visual diagrams
- Assets: 3D icon, optimized demo GIF, recording tools
- Tests: 8 comprehensive integration and validation tests
- Examples: Usage patterns, config templates, dependency analysis

🎥 Demo System:
- Scripted demonstration showing 12 files → 58 chunks indexing
- Semantic search with multi-line result previews
- Complete workflow from TUI startup to CLI mastery
- Professional recording pipeline with asciinema + GIF conversion

🛡️ Security & Quality:
- Complete .gitignore with personal data protection
- Dependency optimization (removed python-dotenv)
- Code quality validation and educational test suite
- Agent-reviewed architecture and documentation

Ready for production use - copy folder, run ./rag-mini, start searching!

"""
Smart language-aware chunking strategies for FSS-Mini-RAG.
Automatically adapts chunking based on file type and content patterns.
"""

from typing import Dict, Any, List
from pathlib import Path
import json


class SmartChunkingStrategy:
    """Intelligent chunking that adapts to file types and content."""

    def __init__(self):
        self.language_configs = {
            'python': {
                'max_size': 3000,  # Larger for better function context
                'min_size': 200,
                'strategy': 'function',
                'prefer_semantic': True
            },
            'javascript': {
                'max_size': 2500,
                'min_size': 150,
                'strategy': 'function',
                'prefer_semantic': True
            },
            'markdown': {
                'max_size': 2500,
                'min_size': 300,  # Larger minimum for complete thoughts
                'strategy': 'header',
                'preserve_structure': True
            },
            'json': {
                'max_size': 1000,  # Smaller for config files
                'min_size': 50,
                'skip_if_large': True,  # Skip huge config JSONs
                'max_file_size': 50000  # 50KB limit
            },
            'yaml': {
                'max_size': 1500,
                'min_size': 100,
                'strategy': 'key_block'
            },
            'text': {
                'max_size': 2000,
                'min_size': 200,
                'strategy': 'paragraph'
            },
            'bash': {
                'max_size': 1500,
                'min_size': 100,
                'strategy': 'function'
            }
        }

        # Smart defaults for unknown languages
        self.default_config = {
            'max_size': 2000,
            'min_size': 150,
            'strategy': 'semantic'
        }

    def get_config_for_language(self, language: str, file_size: int = 0) -> Dict[str, Any]:
        """Get optimal chunking config for a specific language."""
        config = self.language_configs.get(language, self.default_config).copy()

        # Smart adjustments based on file size
        if file_size > 0:
            if file_size < 500:  # Very small files
                config['max_size'] = max(config['max_size'] // 2, 200)
                config['min_size'] = 50
            elif file_size > 20000:  # Large files
                config['max_size'] = min(config['max_size'] + 1000, 4000)

        return config
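
    # Illustration (values follow directly from the thresholds above): a 300-byte
    # Python file is halved from max_size 3000 to 1500 with min_size dropped to 50,
    # while a 30 KB Python file is raised to the 4000-character ceiling.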

    def should_skip_file(self, language: str, file_size: int) -> bool:
        """Determine if a file should be skipped entirely."""
        lang_config = self.language_configs.get(language, {})

        # Skip huge JSON config files
        if language == 'json' and lang_config.get('skip_if_large'):
            max_size = lang_config.get('max_file_size', 50000)
            if file_size > max_size:
                return True

        # Skip tiny files that won't provide good context
        if file_size < 30:
            return True

        return False
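
    # Illustration: a 60 KB JSON lockfile exceeds max_file_size (50 KB) and is
    # skipped, and any file under 30 bytes is skipped regardless of language.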

    def get_smart_defaults(self, project_stats: Dict[str, Any]) -> Dict[str, Any]:
        """Generate smart defaults based on project language distribution."""
        languages = project_stats.get('languages', {})
        total_files = sum(languages.values())

        # Determine primary language
        primary_lang = max(languages.items(), key=lambda x: x[1])[0] if languages else 'python'
        primary_config = self.language_configs.get(primary_lang, self.default_config)

        # Smart streaming threshold based on large files
        large_files = project_stats.get('large_files', 0)
        streaming_threshold = 5120 if large_files > 5 else 1048576  # 5KB vs 1MB

        return {
            "chunking": {
                "max_size": primary_config['max_size'],
                "min_size": primary_config['min_size'],
                "strategy": primary_config.get('strategy', 'semantic'),
                "language_specific": {
                    lang: config for lang, config in self.language_configs.items()
                    if languages.get(lang, 0) > 0
                }
            },
            "streaming": {
                "enabled": True,
                "threshold_bytes": streaming_threshold,
                "chunk_size_kb": 64
            },
            "files": {
                "skip_tiny_files": True,
                "tiny_threshold": 30,
                "smart_json_filtering": True
            }
        }


# Example usage
def analyze_and_suggest(manifest_data: Dict[str, Any]) -> Dict[str, Any]:
    """Analyze project and suggest optimal configuration."""
    from collections import Counter

    files = manifest_data.get('files', {})
    languages = Counter()
    large_files = 0

    for info in files.values():
        lang = info.get('language', 'unknown')
        languages[lang] += 1
        if info.get('size', 0) > 10000:
            large_files += 1

    stats = {
        'languages': dict(languages),
        'large_files': large_files,
        'total_files': len(files)
    }

    strategy = SmartChunkingStrategy()
    return strategy.get_smart_defaults(stats)
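

# Hypothetical usage sketch (an assumption, not part of the shipped CLI): feed
# analyze_and_suggest() a manifest-like dict with per-file 'language' and 'size'
# fields (the only keys the function actually reads) to see the configuration
# suggested for a small, mostly-Python project.
if __name__ == "__main__":
    sample_manifest = {
        'files': {
            'claude_rag/search.py': {'language': 'python', 'size': 14200},
            'README.md': {'language': 'markdown', 'size': 6100},
            'config.json': {'language': 'json', 'size': 800},
        }
    }
    print(json.dumps(analyze_and_suggest(sample_manifest), indent=2))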