"""
Smart language-aware chunking strategies for FSS-Mini-RAG.

Automatically adapts chunking based on file type and content patterns.
"""

import json
from collections import Counter
from pathlib import Path
from typing import Any, Dict, List


class SmartChunkingStrategy:
    """Intelligent chunking that adapts to file types and content.

    Holds a table of per-language chunking settings (``language_configs``)
    and a fallback (``default_config``) used for languages not in the table.
    """

    # Files smaller than this are skipped as too small to provide good
    # context. Shared by should_skip_file() and get_smart_defaults() so the
    # two can never disagree.
    TINY_FILE_THRESHOLD = 30

    def __init__(self):
        """Populate the per-language config table and the default config."""
        self.language_configs = {
            'python': {
                'max_size': 3000,  # Larger for better function context
                'min_size': 200,
                'strategy': 'function',
                'prefer_semantic': True,
            },
            'javascript': {
                'max_size': 2500,
                'min_size': 150,
                'strategy': 'function',
                'prefer_semantic': True,
            },
            'markdown': {
                'max_size': 2500,
                'min_size': 300,  # Larger minimum for complete thoughts
                'strategy': 'header',
                'preserve_structure': True,
            },
            'json': {
                'max_size': 1000,  # Smaller for config files
                'min_size': 50,
                'skip_if_large': True,  # Skip huge config JSONs
                'max_file_size': 50000,  # 50KB limit
            },
            'yaml': {
                'max_size': 1500,
                'min_size': 100,
                'strategy': 'key_block',
            },
            'text': {
                'max_size': 2000,
                'min_size': 200,
                'strategy': 'paragraph',
            },
            'bash': {
                'max_size': 1500,
                'min_size': 100,
                'strategy': 'function',
            },
        }

        # Smart defaults for unknown languages
        self.default_config = {
            'max_size': 2000,
            'min_size': 150,
            'strategy': 'semantic',
        }

    def get_config_for_language(self, language: str, file_size: int = 0) -> Dict[str, Any]:
        """Get optimal chunking config for a specific language.

        Args:
            language: Key into ``language_configs``; any other value falls
                back to ``default_config``.
            file_size: Size of the file. ``0`` (the default) or a negative
                value disables the size-based adjustments.

        Returns:
            A fresh dict (a copy, safe for the caller to mutate) with the
            language's settings, adjusted as follows:

            * ``file_size < 500``: ``max_size`` is halved but never below
              200, and ``min_size`` is forced to 50.
            * ``file_size > 20000``: ``max_size`` grows by 1000, capped
              at 4000.
        """
        # Copy so the adjustments below never touch the shared table.
        config = self.language_configs.get(language, self.default_config).copy()

        # Smart adjustments based on file size
        if file_size > 0:
            if file_size < 500:  # Very small files
                config['max_size'] = max(config['max_size'] // 2, 200)
                config['min_size'] = 50
            elif file_size > 20000:  # Large files
                config['max_size'] = min(config['max_size'] + 1000, 4000)

        return config

    def should_skip_file(self, language: str, file_size: int) -> bool:
        """Determine if a file should be skipped entirely.

        Args:
            language: Language key of the file.
            file_size: Size of the file.

        Returns:
            ``True`` when the file is a JSON file larger than its configured
            ``max_file_size`` (only if ``skip_if_large`` is set), or when the
            file is smaller than ``TINY_FILE_THRESHOLD``; ``False`` otherwise.
        """
        lang_config = self.language_configs.get(language, {})

        # Skip huge JSON config files
        if language == 'json' and lang_config.get('skip_if_large'):
            max_size = lang_config.get('max_file_size', 50000)
            if file_size > max_size:
                return True

        # Skip tiny files that won't provide good context
        return file_size < self.TINY_FILE_THRESHOLD

    def get_smart_defaults(self, project_stats: Dict[str, Any]) -> Dict[str, Any]:
        """Generate smart defaults based on project language distribution.

        Args:
            project_stats: Mapping that may contain ``'languages'`` (language
                name -> file count) and ``'large_files'`` (count of large
                files). Missing or ``None`` entries are treated as empty / 0.

        Returns:
            A dict with three sections: ``"chunking"`` (sizes and strategy of
            the most common language, plus copies of the configs for every
            known language that has at least one file), ``"streaming"`` and
            ``"files"``.
        """
        languages = project_stats.get('languages') or {}

        # Determine primary language: the one with the highest file count
        # (first one wins on ties); 'python' when no languages are reported.
        primary_lang = max(languages, key=languages.get) if languages else 'python'
        primary_config = self.language_configs.get(primary_lang, self.default_config)

        # Smart streaming threshold based on large files
        large_files = project_stats.get('large_files') or 0
        streaming_threshold = 5120 if large_files > 5 else 1048576  # 5KB vs 1MB

        return {
            "chunking": {
                "max_size": primary_config['max_size'],
                "min_size": primary_config['min_size'],
                "strategy": primary_config.get('strategy', 'semantic'),
                # Hand out copies: returning the internal dicts would let a
                # caller's edits silently change this strategy's own table.
                "language_specific": {
                    lang: dict(config)
                    for lang, config in self.language_configs.items()
                    if languages.get(lang, 0) > 0
                },
            },
            "streaming": {
                "enabled": True,
                "threshold_bytes": streaming_threshold,
                "chunk_size_kb": 64,
            },
            "files": {
                "skip_tiny_files": True,
                "tiny_threshold": self.TINY_FILE_THRESHOLD,
                "smart_json_filtering": True,
            },
        }


# Example usage
def analyze_and_suggest(manifest_data: Dict[str, Any]) -> Dict[str, Any]:
    """Analyze project and suggest optimal configuration.

    Args:
        manifest_data: Mapping whose ``'files'`` entry maps a file key to an
            info dict. Each info dict may carry ``'language'`` (defaults to
            ``'unknown'``) and ``'size'`` (defaults to 0).

    Returns:
        The configuration produced by
        ``SmartChunkingStrategy.get_smart_defaults`` for the collected
        language counts and large-file count.
    """
    files = manifest_data.get('files') or {}

    languages = Counter()
    large_files = 0

    for info in files.values():
        languages[info.get('language', 'unknown')] += 1
        # Files with size above 10000 count as "large" for the streaming
        # heuristic.
        if info.get('size', 0) > 10000:
            large_files += 1

    stats = {
        'languages': dict(languages),
        'large_files': large_files,
        'total_files': len(files),
    }

    strategy = SmartChunkingStrategy()
    return strategy.get_smart_defaults(stats)