Fss-Rag-Mini/examples/smart_config_suggestions.py
BobAi 4166d0a362 Initial release: FSS-Mini-RAG - Lightweight semantic code search system
🎯 Complete transformation from 5.9GB bloated system to 70MB optimized solution

Key Features:
- Hybrid embedding system (Ollama + ML fallback + hash backup)
- Intelligent chunking with language-aware parsing
- Semantic + BM25 hybrid search with rich context
- Zero-config portable design with graceful degradation
- Beautiful TUI for beginners + powerful CLI for experts
- Comprehensive documentation with 8+ Mermaid diagrams
- Professional animated demo (183KB optimized GIF)

🏗️ Architecture Highlights:
- LanceDB vector storage with streaming indexing
- Smart file tracking (size/mtime) to avoid expensive rehashing
- Progressive chunking: Markdown headers → Python functions → fixed-size
- Quality filtering: 200+ chars, 20+ words, 30% alphanumeric content
- Concurrent batch processing with error recovery

📦 Package Contents:
- Core engine: claude_rag/ (11 modules, 2,847 lines)
- Entry points: rag-mini (unified), rag-tui (beginner interface)
- Documentation: README + 6 guides with visual diagrams
- Assets: 3D icon, optimized demo GIF, recording tools
- Tests: 8 comprehensive integration and validation tests
- Examples: Usage patterns, config templates, dependency analysis

🎥 Demo System:
- Scripted demonstration showing 12 files → 58 chunks indexing
- Semantic search with multi-line result previews
- Complete workflow from TUI startup to CLI mastery
- Professional recording pipeline with asciinema + GIF conversion

🛡️ Security & Quality:
- Complete .gitignore with personal data protection
- Dependency optimization (removed python-dotenv)
- Code quality validation and educational test suite
- Agent-reviewed architecture and documentation

Ready for production use - copy folder, run ./rag-mini, start searching!
2025-08-12 16:38:28 +10:00

130 lines
4.9 KiB
Python

#!/usr/bin/env python3
"""
Smart configuration suggestions for FSS-Mini-RAG based on usage patterns.
Analyzes the indexed data to suggest optimal settings.
"""
import json
from pathlib import Path
from collections import defaultdict, Counter
import sys
def analyze_project_patterns(manifest_path: Path):
    """Print tuning suggestions derived from an index manifest.

    Loads the JSON manifest at ``manifest_path`` and reads the per-file
    ``language``, ``size`` and ``chunks`` entries found under its ``files``
    mapping. Statistics, optimization hints and a recommended config snippet
    are written to stdout. A manifest without any files is reported with
    zeroed statistics and default recommendations instead of failing.

    Args:
        manifest_path: Location of the manifest JSON file.

    Returns:
        None. Every result is printed.

    Raises:
        OSError: If the manifest cannot be opened.
        json.JSONDecodeError: If the manifest is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(manifest_path, encoding="utf-8") as f:
        manifest = json.load(f)
    files = manifest.get('files', {})

    print("🔍 FSS-Mini-RAG Smart Tuning Analysis")
    print("=" * 50)

    # Analyze file types and chunking efficiency in a single pass.
    languages = Counter()
    chunk_efficiency = []  # chunks per KB, one entry per file
    large_files = []       # (path, size, chunks) for files >10KB
    small_files = []       # (path, size, chunks) for files <500B
    total_chunks = 0
    for filepath, info in files.items():
        languages[info.get('language', 'unknown')] += 1

        size = info.get('size', 0)
        chunks = info.get('chunks', 1)
        total_chunks += chunks
        # Clamp the denominator so files under 1KB do not inflate the ratio.
        chunk_efficiency.append(chunks / max(1, size / 1000))

        if size > 10000:
            large_files.append((filepath, size, chunks))
        elif size < 500:
            small_files.append((filepath, size, chunks))

    # Analysis results
    total_files = len(files)
    avg_chunks_per_file = total_chunks / max(1, total_files)
    python_count = languages['python']
    markdown_count = languages['markdown']
    json_count = languages['json']
    many_large_files = len(large_files) > 5
    mostly_small_files = len(small_files) > total_files * 0.3

    print("📊 Current Stats:")
    print(f" Files: {total_files}")
    print(f" Chunks: {total_chunks}")
    print(f" Avg chunks/file: {avg_chunks_per_file:.1f}")

    print("\n🗂️ Language Distribution:")
    # The loop body only runs when at least one file exists, so
    # total_files is never zero in the division below.
    for lang, count in languages.most_common(10):
        pct = 100 * count / total_files
        print(f" {lang}: {count} files ({pct:.1f}%)")

    print("\n💡 Smart Optimization Suggestions:")

    # Suggestion 1: Language-specific chunking
    if python_count > 10:
        print("✨ Python Optimization:")
        print(f" - Use function-level chunking (detected {python_count} Python files)")
        print(" - Increase chunk size to 3000 chars for Python (better context)")
    if markdown_count > 5:
        print("✨ Markdown Optimization:")
        print(f" - Use header-based chunking (detected {markdown_count} MD files)")
        print(" - Keep sections together for better search relevance")
    if json_count > 20:
        print("✨ JSON Optimization:")
        print(f" - Consider object-level chunking (detected {json_count} JSON files)")
        print(" - Might want to exclude large config JSONs")

    # Suggestion 2: File size optimization
    if large_files:
        print("\n📈 Large File Optimization:")
        print(f" Found {len(large_files)} files >10KB:")
        biggest_first = sorted(large_files, key=lambda x: x[1], reverse=True)
        for filepath, size, chunks in biggest_first[:3]:
            kb = size / 1024
            print(f" - {filepath}: {kb:.1f}KB → {chunks} chunks")
        if many_large_files:
            print(" 💡 Consider streaming threshold: 5KB (current: 1MB)")
    if mostly_small_files:
        print("\n📉 Small File Optimization:")
        print(f" {len(small_files)} files <500B might not need chunking")
        print(" 💡 Consider: combine small files or skip tiny ones")

    # Suggestion 3: Search optimization
    print("\n🔍 Search Optimization:")
    # With no files there is nothing to average; skip the hints rather than
    # dividing by zero or reporting a misleading ratio.
    if chunk_efficiency:
        avg_efficiency = sum(chunk_efficiency) / len(chunk_efficiency)
        if avg_efficiency < 0.5:
            print(" 💡 Chunks are large relative to files - consider smaller chunks")
            print(f" 💡 Current: {avg_chunks_per_file:.1f} chunks/file, try 2-3 chunks/file")
        elif avg_efficiency > 2:
            print(" 💡 Many small chunks - consider larger chunk size")
            print(" 💡 Reduce chunk overhead with 2000-4000 char chunks")

    # Suggestion 4: Smart defaults
    default_max_size = 3000 if python_count > markdown_count else 2000
    default_strategy = "function" if python_count > 10 else "semantic"
    skip_small_files = 500 if mostly_small_files else 0
    streaming_threshold_kb = 5 if many_large_files else 1024

    print("\n⚙️ Recommended Config Updates:")
    # The literal below is flush-left on purpose: its whitespace is part of
    # the printed output.
    print(f"""{{
"chunking": {{
"max_size": {default_max_size},
"min_size": 200,
"strategy": "{default_strategy}",
"language_specific": {{
"python": {{ "max_size": 3000, "strategy": "function" }},
"markdown": {{ "max_size": 2500, "strategy": "header" }},
"json": {{ "max_size": 1000, "skip_large": true }}
}}
}},
"files": {{
"skip_small_files": {skip_small_files},
"streaming_threshold_kb": {streaming_threshold_kb}
}}
}}""")
if __name__ == "__main__":
    # Script entry point: expects exactly one argument, the manifest path.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: python smart_config_suggestions.py <path_to_manifest.json>")
        sys.exit(1)

    manifest_path = Path(cli_args[0])
    # Bail out with a non-zero status before attempting to read a missing file.
    if manifest_path.exists():
        analyze_project_patterns(manifest_path)
    else:
        print(f"Manifest not found: {manifest_path}")
        sys.exit(1)