# Rebrand note (v1.0-simple-search branch): claude_rag/ was renamed to
# mini_rag/ (preserving git history). All imports (claude_rag -> mini_rag),
# data paths (.claude-rag -> .mini-rag), docs, config examples, and tests
# were updated. This keeps the project independent of Claude/Anthropic
# branding while maintaining functionality. The simple branch contains the
# basic RAG system without LLM features.
"""
Auto-optimizer for FSS-Mini-RAG.

Automatically tunes settings based on usage patterns.
"""

import json
import logging
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List

logger = logging.getLogger(__name__)


class AutoOptimizer:
    """Automatically optimizes RAG settings based on project patterns.

    Reads the index manifest in ``<project>/.mini-rag/manifest.json``,
    summarizes file/chunk statistics, and — when the heuristics are
    confident enough — writes tuned chunking/streaming/file-filter
    settings back to ``<project>/.mini-rag/config.json``.
    """

    def __init__(self, project_path: Path):
        """Bind the optimizer to a project directory.

        Args:
            project_path: Root of the indexed project; the ``.mini-rag``
                directory is expected directly beneath it.
        """
        self.project_path = project_path
        self.rag_dir = project_path / '.mini-rag'
        self.config_path = self.rag_dir / 'config.json'
        self.manifest_path = self.rag_dir / 'manifest.json'

    def analyze_and_optimize(self) -> Dict[str, Any]:
        """Analyze current patterns and auto-optimize settings.

        Returns:
            ``{"error": ...}`` when no index exists yet;
            ``{"status": "optimized", ...}`` when changes were applied;
            ``{"status": "no_changes_needed", ...}`` otherwise.
        """
        if not self.manifest_path.exists():
            return {"error": "No index found - run indexing first"}

        # Load current index data.
        with open(self.manifest_path) as f:
            manifest = json.load(f)

        analysis = self._analyze_patterns(manifest)
        optimizations = self._generate_optimizations(analysis)

        # Only touch the user's config when the heuristics are confident.
        if optimizations['confidence'] > 0.7:
            self._apply_optimizations(optimizations)
            return {
                "status": "optimized",
                "changes": optimizations['changes'],
                "expected_improvement": optimizations['expected_improvement']
            }
        else:
            return {
                "status": "no_changes_needed",
                "analysis": analysis,
                "confidence": optimizations['confidence']
            }

    def _analyze_patterns(self, manifest: Dict[str, Any]) -> Dict[str, Any]:
        """Summarize indexing statistics from the manifest.

        Args:
            manifest: Parsed manifest; ``manifest['files']`` maps file paths
                to per-file info dicts with ``language``, ``size`` (bytes),
                and ``chunks`` keys.

        Returns:
            Aggregate stats: language histogram (most common first), file
            and chunk totals, average chunks-per-KB ratio, average file
            size, and counts of large (>10 KB) / small (<500 B) files.
        """
        files = manifest.get('files', {})

        languages = Counter()
        sizes = []
        chunk_ratios = []

        for info in files.values():
            languages[info.get('language', 'unknown')] += 1

            size = info.get('size', 0)
            chunks = info.get('chunks', 1)

            sizes.append(size)
            # Chunks per KB; max(1, ...) guards files smaller than 1 KB.
            chunk_ratios.append(chunks / max(1, size / 1000))

        avg_chunk_ratio = sum(chunk_ratios) / len(chunk_ratios) if chunk_ratios else 1
        avg_size = sum(sizes) / len(sizes) if sizes else 1000

        return {
            'languages': dict(languages.most_common()),
            'total_files': len(files),
            'total_chunks': sum(info.get('chunks', 1) for info in files.values()),
            'avg_chunk_ratio': avg_chunk_ratio,
            'avg_file_size': avg_size,
            'large_files': sum(1 for s in sizes if s > 10000),
            'small_files': sum(1 for s in sizes if s < 500)
        }

    def _generate_optimizations(self, analysis: Dict[str, Any]) -> Dict[str, Any]:
        """Generate optimization recommendations from the analysis.

        Each heuristic that fires appends a human-readable change
        description (matched by substring in ``_apply_optimizations``) and
        raises the overall confidence and expected improvement.

        Returns:
            Dict with ``changes`` (list of str), ``confidence`` (0..1,
            capped), and ``expected_improvement`` (rough percent estimate).
        """
        changes: List[str] = []
        confidence = 0.5
        expected_improvement = 0
        total_files = analysis['total_files']

        # Optimize chunking based on the dominant language.
        languages = analysis['languages']
        if languages:
            # languages preserves most_common() order, so the first entry
            # is the dominant language.
            dominant_lang, count = next(iter(languages.items()))
            lang_pct = count / total_files

            if lang_pct > 0.3:  # dominant language covers >30% of files
                if dominant_lang == 'python' and analysis['avg_chunk_ratio'] < 1.5:
                    changes.append("Increase Python chunk size to 3000 for better function context")
                    confidence += 0.2
                    expected_improvement += 15
                elif dominant_lang == 'markdown' and analysis['avg_chunk_ratio'] < 1.2:
                    changes.append("Use header-based chunking for Markdown files")
                    confidence += 0.15
                    expected_improvement += 10

        # Optimize for large files.
        if analysis['large_files'] > 5:
            changes.append("Reduce streaming threshold to 5KB for better large file handling")
            confidence += 0.1
            expected_improvement += 8

        # Optimize chunk ratio.
        if analysis['avg_chunk_ratio'] < 1.0:
            changes.append("Reduce chunk size for more granular search results")
            confidence += 0.15
            expected_improvement += 12
        elif analysis['avg_chunk_ratio'] > 3.0:
            changes.append("Increase chunk size to reduce overhead")
            confidence += 0.1
            expected_improvement += 5

        # Skip tiny files.  Guard against an empty index: the original code
        # divided by total_files unconditionally, raising ZeroDivisionError
        # when the manifest listed no files.
        if total_files and analysis['small_files'] / total_files > 0.3:
            changes.append("Skip files smaller than 300 bytes to improve focus")
            confidence += 0.1
            expected_improvement += 3

        return {
            'changes': changes,
            'confidence': min(confidence, 1.0),
            'expected_improvement': expected_improvement
        }

    def _apply_optimizations(self, optimizations: Dict[str, Any]):
        """Apply the recommended optimizations to config.json.

        Maps each change description produced by ``_generate_optimizations``
        onto a concrete config mutation, then persists the config with
        auto-optimization markers.
        """
        # Load existing config or fall back to defaults.
        if self.config_path.exists():
            with open(self.config_path) as f:
                config = json.load(f)
        else:
            config = self._get_default_config()

        changes = optimizations['changes']

        # NOTE: the substring checks below must stay in sync with the
        # change strings emitted by _generate_optimizations().
        for change in changes:
            if "Python chunk size to 3000" in change:
                config.setdefault('chunking', {})['max_size'] = 3000

            elif "header-based chunking" in change:
                config.setdefault('chunking', {})['strategy'] = 'header'

            elif "streaming threshold to 5KB" in change:
                config.setdefault('streaming', {})['threshold_bytes'] = 5120

            elif "Reduce chunk size" in change:
                current_size = config.get('chunking', {}).get('max_size', 2000)
                config.setdefault('chunking', {})['max_size'] = max(1500, current_size - 500)

            elif "Increase chunk size" in change:
                current_size = config.get('chunking', {}).get('max_size', 2000)
                config.setdefault('chunking', {})['max_size'] = min(4000, current_size + 500)

            elif "Skip files smaller" in change:
                config.setdefault('files', {})['min_file_size'] = 300

        # Save optimized config with markers.  The original stored
        # json.dumps(None, default=str) as the timestamp, which always
        # yields the literal string "null" — record a real timestamp.
        config['_auto_optimized'] = True
        config['_optimization_timestamp'] = datetime.now().isoformat()

        with open(self.config_path, 'w') as f:
            json.dump(config, f, indent=2)

        logger.info(f"Applied {len(changes)} optimizations to {self.config_path}")

    def _get_default_config(self) -> Dict[str, Any]:
        """Return the default configuration used when no config.json exists."""
        return {
            "chunking": {
                "max_size": 2000,
                "min_size": 150,
                "strategy": "semantic"
            },
            "streaming": {
                "enabled": True,
                "threshold_bytes": 1048576
            },
            "files": {
                "min_file_size": 50
            }
        }