fss-mini-rag-github/mini_rag/smart_chunking.py
FSSCoding 930f53a0fb Major code quality improvements and structural organization
- Applied Black formatter and isort across entire codebase for professional consistency
- Moved implementation scripts (rag-mini.py, rag-tui.py) to bin/ directory for cleaner root
- Updated shell scripts to reference new bin/ locations maintaining user compatibility
- Added comprehensive linting configuration (.flake8, pyproject.toml) with dedicated .venv-linting
- Removed development artifacts (commit_message.txt, GET_STARTED.md duplicate) from root
- Consolidated documentation and fixed script references across all guides
- Relocated test_fixes.py to proper tests/ directory
- Enhanced project structure following Python packaging standards

All user commands work identically while improving code organization and beginner accessibility.
2025-08-28 15:29:54 +10:00

143 lines
4.9 KiB
Python

"""
Smart language-aware chunking strategies for FSS-Mini-RAG.
Automatically adapts chunking based on file type and content patterns.
"""
from pathlib import Path
from typing import Any, Dict, List
class SmartChunkingStrategy:
"""Intelligent chunking that adapts to file types and content."""
def __init__(self):
self.language_configs = {
"python": {
"max_size": 3000, # Larger for better function context
"min_size": 200,
"strategy": "function",
"prefer_semantic": True,
},
"javascript": {
"max_size": 2500,
"min_size": 150,
"strategy": "function",
"prefer_semantic": True,
},
"markdown": {
"max_size": 2500,
"min_size": 300, # Larger minimum for complete thoughts
"strategy": "header",
"preserve_structure": True,
},
"json": {
"max_size": 1000, # Smaller for config files
"min_size": 50,
"skip_if_large": True, # Skip huge config JSONs
"max_file_size": 50000, # 50KB limit
},
"yaml": {"max_size": 1500, "min_size": 100, "strategy": "key_block"},
"text": {"max_size": 2000, "min_size": 200, "strategy": "paragraph"},
"bash": {"max_size": 1500, "min_size": 100, "strategy": "function"},
}
# Smart defaults for unknown languages
self.default_config = {
"max_size": 2000,
"min_size": 150,
"strategy": "semantic",
}
def get_config_for_language(self, language: str, file_size: int = 0) -> Dict[str, Any]:
"""Get optimal chunking config for a specific language."""
config = self.language_configs.get(language, self.default_config).copy()
# Smart adjustments based on file size
if file_size > 0:
if file_size < 500: # Very small files
config["max_size"] = max(config["max_size"] // 2, 200)
config["min_size"] = 50
elif file_size > 20000: # Large files
config["max_size"] = min(config["max_size"] + 1000, 4000)
return config
def should_skip_file(self, language: str, file_size: int) -> bool:
"""Determine if a file should be skipped entirely."""
lang_config = self.language_configs.get(language, {})
# Skip huge JSON config files
if language == "json" and lang_config.get("skip_if_large"):
max_size = lang_config.get("max_file_size", 50000)
if file_size > max_size:
return True
# Skip tiny files that won't provide good context
if file_size < 30:
return True
return False
def get_smart_defaults(self, project_stats: Dict[str, Any]) -> Dict[str, Any]:
"""Generate smart defaults based on project language distribution."""
languages = project_stats.get("languages", {})
# sum(languages.values()) # Unused variable removed
# Determine primary language
primary_lang = max(languages.items(), key=lambda x: x[1])[0] if languages else "python"
primary_config = self.language_configs.get(primary_lang, self.default_config)
# Smart streaming threshold based on large files
large_files = project_stats.get("large_files", 0)
streaming_threshold = 5120 if large_files > 5 else 1048576 # 5KB vs 1MB
return {
"chunking": {
"max_size": primary_config["max_size"],
"min_size": primary_config["min_size"],
"strategy": primary_config.get("strategy", "semantic"),
"language_specific": {
lang: config
for lang, config in self.language_configs.items()
if languages.get(lang, 0) > 0
},
},
"streaming": {
"enabled": True,
"threshold_bytes": streaming_threshold,
"chunk_size_kb": 64,
},
"files": {
"skip_tiny_files": True,
"tiny_threshold": 30,
"smart_json_filtering": True,
},
}
# Example usage
def analyze_and_suggest(manifest_data: Dict[str, Any]) -> Dict[str, Any]:
"""Analyze project and suggest optimal configuration."""
from collections import Counter
files = manifest_data.get("files", {})
languages = Counter()
large_files = 0
for info in files.values():
lang = info.get("language", "unknown")
languages[lang] += 1
if info.get("size", 0) > 10000:
large_files += 1
stats = {
"languages": dict(languages),
"large_files": large_files,
"total_files": len(files),
}
strategy = SmartChunkingStrategy()
return strategy.get_smart_defaults(stats)