- Applied Black formatter and isort across entire codebase for professional consistency - Moved implementation scripts (rag-mini.py, rag-tui.py) to bin/ directory for cleaner root - Updated shell scripts to reference new bin/ locations maintaining user compatibility - Added comprehensive linting configuration (.flake8, pyproject.toml) with dedicated .venv-linting - Removed development artifacts (commit_message.txt, GET_STARTED.md duplicate) from root - Consolidated documentation and fixed script references across all guides - Relocated test_fixes.py to proper tests/ directory - Enhanced project structure following Python packaging standards All user commands work identically while improving code organization and beginner accessibility.
48 lines
1.2 KiB
Python
48 lines
1.2 KiB
Python
#!/usr/bin/env python3
"""
Show what files are actually indexed in the RAG system.

Run from the root of an indexed project: the current working directory is
used as the project path. Prints the number of indexed files and chunks,
the chunk-type distribution, the ten files with the most chunks, and the
indexed files whose path looks text-to-speech related.
"""

import os
import sys
from collections import Counter
from pathlib import Path

# The script's own directory must be on sys.path *before* the project import
# below; inserting it afterwards (as was done previously) cannot help that
# import resolve.
sys.path.insert(0, str(Path(__file__).parent))

from mini_rag.vector_store import VectorStore  # noqa: E402

if sys.platform == "win32":
    # Ask for UTF-8 output on Windows consoles so non-ASCII paths print
    # instead of raising an encoding error.
    os.environ["PYTHONUTF8"] = "1"
    sys.stdout.reconfigure(encoding="utf-8")

project_path = Path.cwd()
store = VectorStore(project_path)
# NOTE(review): relies on a private VectorStore method; presumably it opens
# the index and populates `store.table` -- confirm against VectorStore.
store._connect()

# Get all indexed files
files = []  # one entry per chunk, so len(files) is the total chunk count
chunks_by_file = Counter()
chunk_types = Counter()

# Each row of the table is read as one chunk carrying `file_path` and
# `chunk_type` columns.
for row in store.table.to_pandas().itertuples():
    files.append(row.file_path)
    chunks_by_file[row.file_path] += 1
    chunk_types[row.chunk_type] += 1

unique_files = sorted(set(files))

print("\n Indexed Files Summary")
print(f"Total files: {len(unique_files)}")
print(f"Total chunks: {len(files)}")
print(f"\nChunk types: {dict(chunk_types)}")

print("\n Files with most chunks:")
for file, count in chunks_by_file.most_common(10):
    print(f" {count:3d} chunks: {file}")

print("\n Text-to-speech files:")
# "text-to-speech" is matched case-sensitively, "speak" case-insensitively.
tts_files = [f for f in unique_files if "text-to-speech" in f or "speak" in f.lower()]
for f in tts_files:
    print(f" - {f} ({chunks_by_file[f]} chunks)")