- Applied Black formatter and isort across entire codebase for professional consistency
- Moved implementation scripts (rag-mini.py, rag-tui.py) to bin/ directory for cleaner root
- Updated shell scripts to reference new bin/ locations, maintaining user compatibility
- Added comprehensive linting configuration (.flake8, pyproject.toml) with dedicated .venv-linting
- Removed development artifacts (commit_message.txt, GET_STARTED.md duplicate) from root
- Consolidated documentation and fixed script references across all guides
- Relocated test_fixes.py to proper tests/ directory
- Enhanced project structure following Python packaging standards

All user commands work identically while improving code organization and beginner accessibility.
"""
|
|
Auto-optimizer for FSS-Mini-RAG.
|
|
Automatically tunes settings based on usage patterns.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
from typing import Any, Dict
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class AutoOptimizer:
    """Automatically optimizes RAG settings based on project patterns."""

    def __init__(self, project_path: Path):
        self.project_path = project_path
        self.rag_dir = project_path / ".mini-rag"
        self.config_path = self.rag_dir / "config.json"
        self.manifest_path = self.rag_dir / "manifest.json"

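    # Expected on-disk layout, inferred from the paths above (illustrative):
    #   <project>/.mini-rag/config.json    - tunable settings written by the optimizer
    #   <project>/.mini-rag/manifest.json  - per-file index metadata produced by indexing
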
    def analyze_and_optimize(self) -> Dict[str, Any]:
        """Analyze current patterns and auto-optimize settings."""

        if not self.manifest_path.exists():
            return {"error": "No index found - run indexing first"}

        # Load current data
        with open(self.manifest_path) as f:
            manifest = json.load(f)

        # Analyze patterns
        analysis = self._analyze_patterns(manifest)

        # Generate optimizations
        optimizations = self._generate_optimizations(analysis)

        # Apply optimizations if beneficial
        if optimizations["confidence"] > 0.7:
            self._apply_optimizations(optimizations)
            return {
                "status": "optimized",
                "changes": optimizations["changes"],
                "expected_improvement": optimizations["expected_improvement"],
            }
        else:
            return {
                "status": "no_changes_needed",
                "analysis": analysis,
                "confidence": optimizations["confidence"],
            }

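    # Illustrative return shapes (hypothetical values, not from a real run):
    #   {"status": "optimized", "changes": [...], "expected_improvement": 23}
    #   {"status": "no_changes_needed", "analysis": {...}, "confidence": 0.6}
    #   {"error": "No index found - run indexing first"}
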
    def _analyze_patterns(self, manifest: Dict[str, Any]) -> Dict[str, Any]:
        """Analyze current indexing patterns."""
        files = manifest.get("files", {})

        # Language distribution
        languages = Counter()
        sizes = []
        chunk_ratios = []

        for info in files.values():
            lang = info.get("language", "unknown")
            languages[lang] += 1

            size = info.get("size", 0)
            chunks = info.get("chunks", 1)

            sizes.append(size)
            chunk_ratios.append(chunks / max(1, size / 1000))  # chunks per KB

        avg_chunk_ratio = sum(chunk_ratios) / len(chunk_ratios) if chunk_ratios else 1
        avg_size = sum(sizes) / len(sizes) if sizes else 1000

        return {
            "languages": dict(languages.most_common()),
            "total_files": len(files),
            "total_chunks": sum(info.get("chunks", 1) for info in files.values()),
            "avg_chunk_ratio": avg_chunk_ratio,
            "avg_file_size": avg_size,
            "large_files": sum(1 for s in sizes if s > 10000),
            "small_files": sum(1 for s in sizes if s < 500),
        }

    def _generate_optimizations(self, analysis: Dict[str, Any]) -> Dict[str, Any]:
        """Generate optimization recommendations."""
        changes = []
        confidence = 0.5
        expected_improvement = 0

        # Optimize chunking based on dominant language
        languages = analysis["languages"]
        if languages:
            dominant_lang, count = list(languages.items())[0]
            lang_pct = count / analysis["total_files"]

            if lang_pct > 0.3:  # Dominant language >30%
                if dominant_lang == "python" and analysis["avg_chunk_ratio"] < 1.5:
                    changes.append(
                        "Increase Python chunk size to 3000 for better function context"
                    )
                    confidence += 0.2
                    expected_improvement += 15

                elif dominant_lang == "markdown" and analysis["avg_chunk_ratio"] < 1.2:
                    changes.append("Use header-based chunking for Markdown files")
                    confidence += 0.15
                    expected_improvement += 10

        # Optimize for large files
        if analysis["large_files"] > 5:
            changes.append("Reduce streaming threshold to 5KB for better large file handling")
            confidence += 0.1
            expected_improvement += 8

        # Optimize chunk ratio
        if analysis["avg_chunk_ratio"] < 1.0:
            changes.append("Reduce chunk size for more granular search results")
            confidence += 0.15
            expected_improvement += 12
        elif analysis["avg_chunk_ratio"] > 3.0:
            changes.append("Increase chunk size to reduce overhead")
            confidence += 0.1
            expected_improvement += 5

        # Skip tiny files optimization (guard against an empty index)
        small_file_pct = analysis["small_files"] / max(1, analysis["total_files"])
        if small_file_pct > 0.3:
            changes.append("Skip files smaller than 300 bytes to improve focus")
            confidence += 0.1
            expected_improvement += 3

        return {
            "changes": changes,
            "confidence": min(confidence, 1.0),
            "expected_improvement": expected_improvement,
        }

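    # Worked example of the confidence math (hypothetical repo): a project that
    # is 60% Python with avg_chunk_ratio 1.2 and 8 large files scores
    # 0.5 (base) + 0.2 (Python chunking) + 0.1 (large files) = 0.8, which
    # clears the 0.7 threshold in analyze_and_optimize(), so changes are applied.
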
    def _apply_optimizations(self, optimizations: Dict[str, Any]):
        """Apply the recommended optimizations."""

        # Load existing config or create default
        if self.config_path.exists():
            with open(self.config_path) as f:
                config = json.load(f)
        else:
            config = self._get_default_config()

        changes = optimizations["changes"]

        # Apply changes based on recommendations
        for change in changes:
            if "Python chunk size to 3000" in change:
                config.setdefault("chunking", {})["max_size"] = 3000

            elif "header-based chunking" in change:
                config.setdefault("chunking", {})["strategy"] = "header"

            elif "streaming threshold to 5KB" in change:
                config.setdefault("streaming", {})["threshold_bytes"] = 5120

            elif "Reduce chunk size" in change:
                current_size = config.get("chunking", {}).get("max_size", 2000)
                config.setdefault("chunking", {})["max_size"] = max(1500, current_size - 500)

            elif "Increase chunk size" in change:
                current_size = config.get("chunking", {}).get("max_size", 2000)
                config.setdefault("chunking", {})["max_size"] = min(4000, current_size + 500)

            elif "Skip files smaller" in change:
                config.setdefault("files", {})["min_file_size"] = 300

        # Save optimized config
        config["_auto_optimized"] = True
        config["_optimization_timestamp"] = datetime.now().isoformat()

        with open(self.config_path, "w") as f:
            json.dump(config, f, indent=2)

        logger.info(f"Applied {len(changes)} optimizations to {self.config_path}")

    def _get_default_config(self) -> Dict[str, Any]:
        """Get default configuration."""
        return {
            "chunking": {"max_size": 2000, "min_size": 150, "strategy": "semantic"},
            "streaming": {"enabled": True, "threshold_bytes": 1048576},
            "files": {"min_file_size": 50},
        }
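

# Minimal usage sketch (hypothetical project path; assumes the project has
# already been indexed so that .mini-rag/manifest.json exists):
if __name__ == "__main__":
    optimizer = AutoOptimizer(Path("./my-project"))
    result = optimizer.analyze_and_optimize()
    print(json.dumps(result, indent=2))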