Fss-Rag-Mini/mini_rag/auto_optimizer.py
BobAi a96ddba3c9 MAJOR: Remove all Claude references and rename to Mini-RAG
Complete rebrand to eliminate any Claude/Anthropic references:

Directory Changes:
- claude_rag/ → mini_rag/ (preserving git history)

Content Changes:
- Replaced 930+ Claude references across 40+ files
- Updated all imports: from claude_rag → from mini_rag
- Updated all file paths: .claude-rag → .mini-rag
- Updated documentation and comments
- Updated configuration files and examples

Testing Changes:
- All tests updated to use mini_rag imports
- Integration tests verify new module structure

This ensures complete independence from Claude/Anthropic
branding while maintaining all functionality and git history.
2025-08-12 19:21:30 +10:00

196 lines
7.3 KiB
Python

"""
Auto-optimizer for FSS-Mini-RAG.
Automatically tunes settings based on usage patterns.
"""
import json
import logging
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List
logger = logging.getLogger(__name__)
class AutoOptimizer:
    """Automatically optimizes RAG settings based on project patterns.

    Reads the index manifest at ``<project>/.mini-rag/manifest.json``,
    derives simple statistics from it and, when the heuristics are
    confident enough, writes adjusted settings to
    ``<project>/.mini-rag/config.json``.
    """

    # Confidence must be strictly greater than this before config is rewritten.
    CONFIDENCE_THRESHOLD = 0.7
    # Size cut-offs (bytes) used to count "large" and "small" files.
    LARGE_FILE_BYTES = 10000
    SMALL_FILE_BYTES = 500

    def __init__(self, project_path: Path):
        """Remember the project root and derive the RAG file locations.

        Args:
            project_path: Project root directory. A plain string is accepted
                and converted to ``Path``.
        """
        project_path = Path(project_path)
        self.project_path = project_path
        self.rag_dir = project_path / '.mini-rag'
        self.config_path = self.rag_dir / 'config.json'
        self.manifest_path = self.rag_dir / 'manifest.json'

    def analyze_and_optimize(self) -> Dict[str, Any]:
        """Analyze current patterns and auto-optimize settings.

        Returns:
            ``{"error": ...}`` when no manifest exists; otherwise a dict whose
            ``"status"`` is ``"optimized"`` (with ``"changes"`` and
            ``"expected_improvement"``) or ``"no_changes_needed"`` (with
            ``"analysis"`` and ``"confidence"``).
        """
        if not self.manifest_path.exists():
            return {"error": "No index found - run indexing first"}

        with open(self.manifest_path, encoding="utf-8") as f:
            manifest = json.load(f)

        analysis = self._analyze_patterns(manifest)
        optimizations = self._generate_optimizations(analysis)

        # Only touch the config when the heuristics are confident enough.
        if optimizations['confidence'] <= self.CONFIDENCE_THRESHOLD:
            return {
                "status": "no_changes_needed",
                "analysis": analysis,
                "confidence": optimizations['confidence']
            }

        self._apply_optimizations(optimizations)
        return {
            "status": "optimized",
            "changes": optimizations['changes'],
            "expected_improvement": optimizations['expected_improvement']
        }

    def _analyze_patterns(self, manifest: Dict[str, Any]) -> Dict[str, Any]:
        """Summarize the manifest's per-file entries.

        Args:
            manifest: Parsed manifest; its ``'files'`` mapping (file path ->
                info dict with optional ``'language'``, ``'size'`` and
                ``'chunks'`` keys) is the only part read.

        Returns:
            Dict with language counts (most common first), file and chunk
            totals, average chunk ratio, average file size, and the number
            of large and small files.
        """
        files = manifest.get('files', {})

        languages = Counter()
        sizes = []
        chunk_counts = []
        chunk_ratios = []

        for info in files.values():
            languages[info.get('language', 'unknown')] += 1
            size = info.get('size', 0)
            chunks = info.get('chunks', 1)
            sizes.append(size)
            chunk_counts.append(chunks)
            # Chunks per 1000 bytes; the divisor is floored at 1 so files
            # under 1000 bytes are treated as exactly one unit.
            chunk_ratios.append(chunks / max(1, size / 1000))

        # Fallback averages keep an empty manifest from dividing by zero.
        avg_chunk_ratio = sum(chunk_ratios) / len(chunk_ratios) if chunk_ratios else 1
        avg_size = sum(sizes) / len(sizes) if sizes else 1000

        return {
            'languages': dict(languages.most_common()),
            'total_files': len(files),
            'total_chunks': sum(chunk_counts),
            'avg_chunk_ratio': avg_chunk_ratio,
            'avg_file_size': avg_size,
            'large_files': sum(1 for s in sizes if s > self.LARGE_FILE_BYTES),
            'small_files': sum(1 for s in sizes if s < self.SMALL_FILE_BYTES)
        }

    def _generate_optimizations(self, analysis: Dict[str, Any]) -> Dict[str, Any]:
        """Generate optimization recommendations.

        Args:
            analysis: Result of ``_analyze_patterns``.

        Returns:
            Dict with ``'changes'`` (list of recommendation strings),
            ``'confidence'`` (starts at 0.5, capped at 1.0) and
            ``'expected_improvement'`` (sum of per-rule scores).
        """
        changes: List[str] = []
        confidence = 0.5
        expected_improvement = 0

        total_files = analysis['total_files']
        avg_chunk_ratio = analysis['avg_chunk_ratio']

        # Optimize chunking based on dominant language.
        languages = analysis['languages']
        if languages and total_files:
            # max() returns the first maximal item, so this matches "first
            # entry" for a count-sorted dict and still works for unsorted ones.
            dominant_lang, count = max(languages.items(), key=lambda item: item[1])
            lang_pct = count / total_files

            if lang_pct > 0.3:  # Dominant language >30%
                if dominant_lang == 'python' and avg_chunk_ratio < 1.5:
                    changes.append("Increase Python chunk size to 3000 for better function context")
                    confidence += 0.2
                    expected_improvement += 15
                elif dominant_lang == 'markdown' and avg_chunk_ratio < 1.2:
                    changes.append("Use header-based chunking for Markdown files")
                    confidence += 0.15
                    expected_improvement += 10

        # Optimize for large files.
        if analysis['large_files'] > 5:
            changes.append("Reduce streaming threshold to 5KB for better large file handling")
            confidence += 0.1
            expected_improvement += 8

        # Optimize chunk ratio.
        if avg_chunk_ratio < 1.0:
            changes.append("Reduce chunk size for more granular search results")
            confidence += 0.15
            expected_improvement += 12
        elif avg_chunk_ratio > 3.0:
            changes.append("Increase chunk size to reduce overhead")
            confidence += 0.1
            expected_improvement += 5

        # Skip tiny files optimization; an empty index has no small files.
        small_file_pct = analysis['small_files'] / total_files if total_files else 0.0
        if small_file_pct > 0.3:
            changes.append("Skip files smaller than 300 bytes to improve focus")
            confidence += 0.1
            expected_improvement += 3

        return {
            'changes': changes,
            'confidence': min(confidence, 1.0),
            'expected_improvement': expected_improvement
        }

    @staticmethod
    def _update_config(config: Dict[str, Any], changes: List[str]) -> Dict[str, Any]:
        """Mutate ``config`` in place according to the recommendation strings.

        Each recommendation is matched by substring; unrecognized strings are
        ignored. Returns the same ``config`` object for convenience.
        """
        for change in changes:
            if "Python chunk size to 3000" in change:
                config.setdefault('chunking', {})['max_size'] = 3000
            elif "header-based chunking" in change:
                config.setdefault('chunking', {})['strategy'] = 'header'
            elif "streaming threshold to 5KB" in change:
                config.setdefault('streaming', {})['threshold_bytes'] = 5120
            elif "Reduce chunk size" in change:
                current_size = config.get('chunking', {}).get('max_size', 2000)
                config.setdefault('chunking', {})['max_size'] = max(1500, current_size - 500)
            elif "Increase chunk size" in change:
                current_size = config.get('chunking', {}).get('max_size', 2000)
                config.setdefault('chunking', {})['max_size'] = min(4000, current_size + 500)
            elif "Skip files smaller" in change:
                config.setdefault('files', {})['min_file_size'] = 300
        return config

    def _apply_optimizations(self, optimizations: Dict[str, Any]) -> None:
        """Apply the recommended optimizations and persist the config.

        Loads ``config.json`` if present (otherwise starts from the default
        config), applies ``optimizations['changes']``, stamps the result and
        writes it back to ``self.config_path``.
        """
        if self.config_path.exists():
            with open(self.config_path, encoding="utf-8") as f:
                config = json.load(f)
        else:
            config = self._get_default_config()

        changes = optimizations['changes']
        self._update_config(config, changes)

        config['_auto_optimized'] = True
        # Record when the optimization ran as an ISO-8601 UTC timestamp.
        config['_optimization_timestamp'] = datetime.now(timezone.utc).isoformat()

        self.config_path.parent.mkdir(parents=True, exist_ok=True)
        with open(self.config_path, 'w', encoding="utf-8") as f:
            json.dump(config, f, indent=2)

        logger.info("Applied %d optimizations to %s", len(changes), self.config_path)

    def _get_default_config(self) -> Dict[str, Any]:
        """Get default configuration."""
        return {
            "chunking": {
                "max_size": 2000,
                "min_size": 150,
                "strategy": "semantic"
            },
            "streaming": {
                "enabled": True,
                "threshold_bytes": 1048576
            },
            "files": {
                "min_file_size": 50
            }
        }