Complete rebrand to eliminate any Claude/Anthropic references.

Directory changes:
- claude_rag/ → mini_rag/ (preserving git history)

Content changes:
- Replaced 930+ Claude references across 40+ files
- Updated all imports: from claude_rag → from mini_rag
- Updated all file paths: .claude-rag → .mini-rag
- Updated documentation and comments
- Updated configuration files and examples

Testing changes:
- All tests updated to use mini_rag imports
- Integration tests verify new module structure

This ensures complete independence from Claude/Anthropic branding while maintaining all functionality and git history.
150 lines
5.2 KiB
Python
150 lines
5.2 KiB
Python
"""
|
|
Smart language-aware chunking strategies for FSS-Mini-RAG.
|
|
Automatically adapts chunking based on file type and content patterns.
|
|
"""
|
|
|
|
from typing import Dict, Any, List
|
|
from pathlib import Path
|
|
import json
|
|
|
|
class SmartChunkingStrategy:
    """Intelligent chunking that adapts to file types and content.

    Attributes:
        language_configs: Mapping of language name to its chunking settings
            (``max_size``, ``min_size`` and, for most languages, a
            ``strategy`` plus language-specific flags).
        default_config: Settings used when a language has no entry in
            ``language_configs``.
    """

    def __init__(self):
        """Populate the built-in per-language and fallback configurations."""
        self.language_configs = {
            'python': {
                'max_size': 3000,  # Larger for better function context
                'min_size': 200,
                'strategy': 'function',
                'prefer_semantic': True,
            },
            'javascript': {
                'max_size': 2500,
                'min_size': 150,
                'strategy': 'function',
                'prefer_semantic': True,
            },
            'markdown': {
                'max_size': 2500,
                'min_size': 300,  # Larger minimum for complete thoughts
                'strategy': 'header',
                'preserve_structure': True,
            },
            'json': {
                'max_size': 1000,  # Smaller for config files
                'min_size': 50,
                'skip_if_large': True,  # Skip huge config JSONs
                'max_file_size': 50000,  # 50KB limit
            },
            'yaml': {
                'max_size': 1500,
                'min_size': 100,
                'strategy': 'key_block',
            },
            'text': {
                'max_size': 2000,
                'min_size': 200,
                'strategy': 'paragraph',
            },
            'bash': {
                'max_size': 1500,
                'min_size': 100,
                'strategy': 'function',
            },
        }

        # Smart defaults for unknown languages
        self.default_config = {
            'max_size': 2000,
            'min_size': 150,
            'strategy': 'semantic',
        }

    def get_config_for_language(self, language: str, file_size: int = 0) -> Dict[str, Any]:
        """Get optimal chunking config for a specific language.

        Args:
            language: Language name used to look up ``language_configs``;
                unknown names fall back to ``default_config``.
            file_size: Size of the file. ``0`` (the default) or a negative
                value disables the size-based adjustments.

        Returns:
            A fresh dict of settings; mutating it does not affect the
            configuration stored on this instance.
        """
        # Copy so the size-based tweaks below never touch the stored config.
        config = self.language_configs.get(language, self.default_config).copy()

        # Smart adjustments based on file size
        if file_size > 0:
            if file_size < 500:  # Very small files
                # Halve the chunk ceiling, but never go below 200.
                config['max_size'] = max(config['max_size'] // 2, 200)
                config['min_size'] = 50
            elif file_size > 20000:  # Large files
                # Allow bigger chunks, capped at 4000.
                config['max_size'] = min(config['max_size'] + 1000, 4000)

        return config

    def should_skip_file(self, language: str, file_size: int) -> bool:
        """Determine if a file should be skipped entirely.

        Args:
            language: Language name of the file.
            file_size: Size of the file, compared against the JSON size limit
                and the tiny-file threshold of 30.

        Returns:
            ``True`` for JSON files larger than the configured
            ``max_file_size`` (when ``skip_if_large`` is set) and for any
            file smaller than 30; ``False`` otherwise.
        """
        lang_config = self.language_configs.get(language, {})

        # Skip huge JSON config files
        if language == 'json' and lang_config.get('skip_if_large'):
            max_size = lang_config.get('max_file_size', 50000)
            if file_size > max_size:
                return True

        # Skip tiny files that won't provide good context
        if file_size < 30:
            return True

        return False

    def get_smart_defaults(self, project_stats: Dict[str, Any]) -> Dict[str, Any]:
        """Generate smart defaults based on project language distribution.

        Args:
            project_stats: Mapping that may contain ``'languages'`` (language
                name -> file count) and ``'large_files'`` (a count). Missing
                keys are treated as an empty mapping and ``0`` respectively.

        Returns:
            A dict with ``"chunking"``, ``"streaming"`` and ``"files"``
            sections. The chunking section is seeded from the most common
            language (``'python'`` when no languages are reported) and
            carries copies of the configs for every known language that has
            at least one file.
        """
        # Tolerate an explicit None as well as a missing key.
        languages = project_stats.get('languages') or {}

        # Determine primary language
        primary_lang = max(languages.items(), key=lambda x: x[1])[0] if languages else 'python'
        primary_config = self.language_configs.get(primary_lang, self.default_config)

        # Smart streaming threshold based on large files
        large_files = project_stats.get('large_files', 0)
        streaming_threshold = 5120 if large_files > 5 else 1048576  # 5KB vs 1MB

        return {
            "chunking": {
                "max_size": primary_config['max_size'],
                "min_size": primary_config['min_size'],
                "strategy": primary_config.get('strategy', 'semantic'),
                # Hand out copies: returning the stored dicts would let a
                # caller's edits leak back into self.language_configs.
                "language_specific": {
                    lang: dict(config)
                    for lang, config in self.language_configs.items()
                    if languages.get(lang, 0) > 0
                },
            },
            "streaming": {
                "enabled": True,
                "threshold_bytes": streaming_threshold,
                "chunk_size_kb": 64,
            },
            "files": {
                "skip_tiny_files": True,
                "tiny_threshold": 30,
                "smart_json_filtering": True,
            },
        }
|
|
|
|
# Example usage
|
|
def analyze_and_suggest(manifest_data: Dict[str, Any]) -> Dict[str, Any]:
    """Summarise a project manifest and return suggested settings.

    Args:
        manifest_data: Mapping whose optional ``'files'`` entry maps each
            file to a dict of metadata. Only the ``'language'`` and
            ``'size'`` keys of that metadata are read; both are optional.

    Returns:
        The result of ``SmartChunkingStrategy().get_smart_defaults`` for the
        collected statistics.
    """
    from collections import Counter

    file_entries = manifest_data.get('files', {})

    # Tally files per language; entries without one are counted as 'unknown'.
    language_counts = Counter(
        entry.get('language', 'unknown') for entry in file_entries.values()
    )

    # A file counts as "large" when its recorded size exceeds 10000
    # (presumably bytes - confirm against the manifest writer).
    large_file_count = sum(
        1 for entry in file_entries.values() if entry.get('size', 0) > 10000
    )

    project_stats = {
        'languages': dict(language_counts),
        'large_files': large_file_count,
        'total_files': len(file_entries),
    }

    return SmartChunkingStrategy().get_smart_defaults(project_stats)