- Applied Black formatter and isort across entire codebase for professional consistency - Moved implementation scripts (rag-mini.py, rag-tui.py) to bin/ directory for cleaner root - Updated shell scripts to reference new bin/ locations maintaining user compatibility - Added comprehensive linting configuration (.flake8, pyproject.toml) with dedicated .venv-linting - Removed development artifacts (commit_message.txt, GET_STARTED.md duplicate) from root - Consolidated documentation and fixed script references across all guides - Relocated test_fixes.py to proper tests/ directory - Enhanced project structure following Python packaging standards All user commands work identically while improving code organization and beginner accessibility.
143 lines
4.9 KiB
Python
143 lines
4.9 KiB
Python
"""
|
|
Smart language-aware chunking strategies for FSS-Mini-RAG.
|
|
Automatically adapts chunking based on file type and content patterns.
|
|
"""
|
|
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List
|
|
|
|
|
|
class SmartChunkingStrategy:
    """Intelligent chunking that adapts to file types and content.

    Holds a table of per-language chunking settings (``language_configs``)
    and a fallback (``default_config``) used for any language that is not in
    the table.  The public methods derive per-file settings, decide whether
    a file should be skipped, and build project-wide suggested defaults.
    """

    # Files smaller than this are skipped; the same value is reported as
    # "tiny_threshold" by get_smart_defaults so the two never drift apart.
    TINY_FILE_THRESHOLD = 30

    # Fallback size limit for configs that set "skip_if_large" but do not
    # provide their own "max_file_size".
    DEFAULT_MAX_FILE_SIZE = 50000

    def __init__(self):
        self.language_configs = {
            "python": {
                "max_size": 3000,  # Larger for better function context
                "min_size": 200,
                "strategy": "function",
                "prefer_semantic": True,
            },
            "javascript": {
                "max_size": 2500,
                "min_size": 150,
                "strategy": "function",
                "prefer_semantic": True,
            },
            "markdown": {
                "max_size": 2500,
                "min_size": 300,  # Larger minimum for complete thoughts
                "strategy": "header",
                "preserve_structure": True,
            },
            "json": {
                "max_size": 1000,  # Smaller for config files
                "min_size": 50,
                "skip_if_large": True,  # Skip huge config JSONs
                "max_file_size": 50000,  # 50KB limit
            },
            "yaml": {"max_size": 1500, "min_size": 100, "strategy": "key_block"},
            "text": {"max_size": 2000, "min_size": 200, "strategy": "paragraph"},
            "bash": {"max_size": 1500, "min_size": 100, "strategy": "function"},
        }

        # Smart defaults for unknown languages
        self.default_config = {
            "max_size": 2000,
            "min_size": 150,
            "strategy": "semantic",
        }

    def get_config_for_language(self, language: str, file_size: int = 0) -> Dict[str, Any]:
        """Get optimal chunking config for a specific language.

        Args:
            language: Key into ``language_configs``; unknown languages fall
                back to ``default_config``.
            file_size: Size of the file being chunked.  ``0`` (the default)
                disables the size-based adjustments.

        Returns:
            A new dict, so the caller may modify it without affecting the
            stored configuration.
        """
        config = self.language_configs.get(language, self.default_config).copy()

        # Smart adjustments based on file size
        if file_size > 0:
            if file_size < 500:  # Very small files
                # Halve the chunk ceiling, but never go below 200.
                config["max_size"] = max(config["max_size"] // 2, 200)
                config["min_size"] = 50
            elif file_size > 20000:  # Large files
                # Allow bigger chunks, capped at 4000.
                config["max_size"] = min(config["max_size"] + 1000, 4000)

        return config

    def should_skip_file(self, language: str, file_size: int) -> bool:
        """Determine if a file should be skipped entirely.

        A file is skipped when its language config sets ``skip_if_large`` and
        the file exceeds that config's ``max_file_size``, or when the file is
        smaller than ``TINY_FILE_THRESHOLD``.

        Args:
            language: Key into ``language_configs``.
            file_size: Size of the file to test.

        Returns:
            ``True`` if the file should be skipped, ``False`` otherwise.
        """
        lang_config = self.language_configs.get(language, {})

        # Skip oversized files for any language that opts in via
        # "skip_if_large" (only JSON does in the built-in table).
        if lang_config.get("skip_if_large"):
            size_limit = lang_config.get("max_file_size", self.DEFAULT_MAX_FILE_SIZE)
            if file_size > size_limit:
                return True

        # Skip tiny files that won't provide good context
        return file_size < self.TINY_FILE_THRESHOLD

    def get_smart_defaults(self, project_stats: Dict[str, Any]) -> Dict[str, Any]:
        """Generate smart defaults based on project language distribution.

        Args:
            project_stats: Mapping read for two keys: ``"languages"``
                (language name -> count, default ``{}``) and
                ``"large_files"`` (count, default ``0``).

        Returns:
            A dict with ``"chunking"``, ``"streaming"`` and ``"files"``
            sections.  The per-language configs under
            ``chunking["language_specific"]`` are copies, so editing the
            result does not alter this strategy's stored configuration.
        """
        languages = project_stats.get("languages", {})

        # Determine primary language: the one with the highest count (first
        # one wins on ties); an empty distribution falls back to "python".
        primary_lang = max(languages, key=languages.get) if languages else "python"
        primary_config = self.language_configs.get(primary_lang, self.default_config)

        # Smart streaming threshold based on large files
        large_files = project_stats.get("large_files", 0)
        streaming_threshold = 5120 if large_files > 5 else 1048576  # 5KB vs 1MB

        # Only include languages that actually occur in the project, and copy
        # each config so callers cannot mutate self.language_configs through
        # the returned structure.
        language_specific = {
            lang: dict(config)
            for lang, config in self.language_configs.items()
            if languages.get(lang, 0) > 0
        }

        return {
            "chunking": {
                "max_size": primary_config["max_size"],
                "min_size": primary_config["min_size"],
                "strategy": primary_config.get("strategy", "semantic"),
                "language_specific": language_specific,
            },
            "streaming": {
                "enabled": True,
                "threshold_bytes": streaming_threshold,
                "chunk_size_kb": 64,
            },
            "files": {
                "skip_tiny_files": True,
                "tiny_threshold": self.TINY_FILE_THRESHOLD,
                "smart_json_filtering": True,
            },
        }
|
|
|
|
|
|
# Convenience helper: derive project stats from a manifest and suggest a configuration
|
|
|
|
|
|
def analyze_and_suggest(manifest_data: Dict[str, Any]) -> Dict[str, Any]:
    """Analyze project and suggest optimal configuration.

    Reads the ``"files"`` mapping from ``manifest_data`` (default: empty),
    tallies how many entries declare each ``"language"`` (entries without
    one count as ``"unknown"``), counts entries whose ``"size"`` exceeds
    10000, and passes those statistics to
    ``SmartChunkingStrategy.get_smart_defaults``.

    Args:
        manifest_data: Manifest mapping; only its ``"files"`` key is read.

    Returns:
        The suggested configuration produced by ``get_smart_defaults``.
    """
    from collections import Counter

    entries = manifest_data.get("files", {})

    # One tally per file, keyed by its declared language.
    language_counts = Counter(
        entry.get("language", "unknown") for entry in entries.values()
    )

    # A file counts as "large" when its recorded size is above 10000.
    big_file_count = sum(
        1 for entry in entries.values() if entry.get("size", 0) > 10000
    )

    project_stats = {
        "languages": dict(language_counts),
        "large_files": big_file_count,
        "total_files": len(entries),
    }

    return SmartChunkingStrategy().get_smart_defaults(project_stats)
|