fss-mini-rag-github/mini_rag/indexer.py

"""
Parallel indexing engine for efficient codebase processing.
Handles file discovery, chunking, embedding, and storage.
"""

import os
import json
import hashlib
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional, Set, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
import numpy as np
import pandas as pd
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeRemainingColumn
from rich.console import Console

# Optional LanceDB import
try:
    import lancedb
    import pyarrow as pa
    LANCEDB_AVAILABLE = True
except ImportError:
    lancedb = None
    pa = None
    LANCEDB_AVAILABLE = False

from .ollama_embeddings import OllamaEmbedder as CodeEmbedder
from .chunker import CodeChunker, CodeChunk
from .path_handler import normalize_path, normalize_relative_path

logger = logging.getLogger(__name__)
console = Console()


class ProjectIndexer:
    """Indexes a project directory for semantic search."""

    def __init__(self,
                 project_path: Path,
                 embedder: Optional[CodeEmbedder] = None,
                 chunker: Optional[CodeChunker] = None,
                 max_workers: int = 4):
        """
        Initialize the indexer.

        Args:
            project_path: Path to the project to index
            embedder: CodeEmbedder instance (creates one if not provided)
            chunker: CodeChunker instance (creates one if not provided)
            max_workers: Number of parallel workers for indexing
        """
        self.project_path = Path(project_path).resolve()
        self.rag_dir = self.project_path / '.mini-rag'
        self.manifest_path = self.rag_dir / 'manifest.json'
        self.config_path = self.rag_dir / 'config.json'

        # Create RAG directory if it doesn't exist
        self.rag_dir.mkdir(exist_ok=True)

        # Initialize components
        self.embedder = embedder or CodeEmbedder()
        self.chunker = chunker or CodeChunker()
        self.max_workers = max_workers

        # Initialize database connection
        self.db = None
        self.table = None

        # File patterns to include/exclude
        self.include_patterns = [
            # Code files
            '*.py', '*.js', '*.jsx', '*.ts', '*.tsx',
            '*.go', '*.java', '*.cpp', '*.c', '*.cs',
            '*.rs', '*.rb', '*.php', '*.swift', '*.kt',
            '*.scala', '*.r', '*.m', '*.h', '*.hpp',
            # Documentation files
            '*.md', '*.markdown', '*.rst', '*.txt',
            '*.adoc', '*.asciidoc',
            # Config files
            '*.json', '*.yaml', '*.yml', '*.toml', '*.ini',
            '*.xml', '*.conf', '*.config',
            # Other text files
            'README', 'LICENSE', 'CHANGELOG', 'AUTHORS',
            'CONTRIBUTING', 'TODO', 'NOTES'
        ]

        self.exclude_patterns = [
            '__pycache__', '.git', 'node_modules', '.venv', 'venv',
            'env', 'dist', 'build', 'target', '.idea', '.vscode',
            '*.pyc', '*.pyo', '*.pyd', '.DS_Store', '*.so', '*.dll',
            '*.dylib', '*.exe', '*.bin', '*.log', '*.lock'
        ]

        # Load existing manifest if it exists
        self.manifest = self._load_manifest()

    def _load_manifest(self) -> Dict[str, Any]:
        """Load existing manifest or create new one."""
        if self.manifest_path.exists():
            try:
                with open(self.manifest_path, 'r') as f:
                    return json.load(f)
            except Exception as e:
                logger.warning(f"Failed to load manifest: {e}")

        return {
            'version': '1.0',
            'indexed_at': None,
            'file_count': 0,
            'chunk_count': 0,
            'files': {}
        }

    def _save_manifest(self):
        """Save manifest to disk."""
        try:
            with open(self.manifest_path, 'w') as f:
                json.dump(self.manifest, f, indent=2)
        except Exception as e:
            logger.error(f"Failed to save manifest: {e}")

    def _load_config(self) -> Dict[str, Any]:
        """Load or create comprehensive configuration."""
        if self.config_path.exists():
            try:
                with open(self.config_path, 'r') as f:
                    config = json.load(f)
                    # Apply any loaded settings
                    self._apply_config(config)
                    return config
            except Exception as e:
                logger.warning(f"Failed to load config: {e}, using defaults")

        # Default configuration - comprehensive and user-friendly
        config = {
            "project": {
                "name": self.project_path.name,
                "description": f"RAG index for {self.project_path.name}",
                "created_at": datetime.now().isoformat()
            },
            "embedding": {
                "provider": "ollama",
                "model": self.embedder.model_name if hasattr(self.embedder, 'model_name') else 'nomic-embed-text:latest',
                "base_url": "http://localhost:11434",
                "batch_size": 4,
                "max_workers": 4
            },
            "chunking": {
                "max_size": self.chunker.max_chunk_size if hasattr(self.chunker, 'max_chunk_size') else 2500,
                "min_size": self.chunker.min_chunk_size if hasattr(self.chunker, 'min_chunk_size') else 100,
                "overlap": 100,
                "strategy": "semantic"
            },
            "streaming": {
                "enabled": True,
                "threshold_mb": 1,
                "chunk_size_kb": 64
            },
            "files": {
                "include_patterns": self.include_patterns,
                "exclude_patterns": self.exclude_patterns,
                "max_file_size_mb": 50,
                "encoding_fallbacks": ["utf-8", "latin-1", "cp1252", "utf-8-sig"]
            },
            "indexing": {
                "parallel_workers": self.max_workers,
                "incremental": True,
                "track_changes": True,
                "skip_binary": True
            },
            "search": {
                "default_top_k": 10,
                "similarity_threshold": 0.7,
                "hybrid_search": True,
                "bm25_weight": 0.3
            },
            "storage": {
                "compress_vectors": False,
                "index_type": "ivf_pq",
                "cleanup_old_chunks": True
            }
        }

        # Save comprehensive config with nice formatting
        self._save_config(config)
        return config

    def _apply_config(self, config: Dict[str, Any]):
        """Apply configuration settings to the indexer."""
        try:
            # Apply embedding settings
            if 'embedding' in config:
                emb_config = config['embedding']
                if hasattr(self.embedder, 'model_name'):
                    self.embedder.model_name = emb_config.get('model', self.embedder.model_name)
                if hasattr(self.embedder, 'base_url'):
                    self.embedder.base_url = emb_config.get('base_url', self.embedder.base_url)

            # Apply chunking settings
            if 'chunking' in config:
                chunk_config = config['chunking']
                if hasattr(self.chunker, 'max_chunk_size'):
                    self.chunker.max_chunk_size = chunk_config.get('max_size', self.chunker.max_chunk_size)
                if hasattr(self.chunker, 'min_chunk_size'):
                    self.chunker.min_chunk_size = chunk_config.get('min_size', self.chunker.min_chunk_size)

            # Apply file patterns
            if 'files' in config:
                file_config = config['files']
                self.include_patterns = file_config.get('include_patterns', self.include_patterns)
                self.exclude_patterns = file_config.get('exclude_patterns', self.exclude_patterns)

            # Apply indexing settings
            if 'indexing' in config:
                idx_config = config['indexing']
                self.max_workers = idx_config.get('parallel_workers', self.max_workers)

        except Exception as e:
            logger.warning(f"Failed to apply some config settings: {e}")

    def _save_config(self, config: Dict[str, Any]):
        """Save configuration with nice formatting and comments."""
        try:
            # Add helpful comments as a separate file
            config_with_comments = {
                "_comment": "RAG System Configuration - Edit this file to customize indexing behavior",
                "_version": "2.0",
                "_docs": "See README.md for detailed configuration options",
                **config
            }

            with open(self.config_path, 'w') as f:
                json.dump(config_with_comments, f, indent=2, sort_keys=True)

            logger.info(f"Configuration saved to {self.config_path}")

        except Exception as e:
            logger.error(f"Failed to save config: {e}")

    def _get_file_hash(self, file_path: Path) -> str:
        """Calculate SHA256 hash of a file."""
        sha256_hash = hashlib.sha256()
        try:
            with open(file_path, "rb") as f:
                for byte_block in iter(lambda: f.read(4096), b""):
                    sha256_hash.update(byte_block)
            return sha256_hash.hexdigest()
        except Exception as e:
            logger.error(f"Failed to hash {file_path}: {e}")
            return ""

    def _should_index_file(self, file_path: Path) -> bool:
        """Check if a file should be indexed based on patterns and content."""
        # Check file size (skip files > 1MB)
        try:
            if file_path.stat().st_size > 1_000_000:
                return False
        except:
            return False

        # Check exclude patterns first
        path_str = str(file_path)
        for pattern in self.exclude_patterns:
            if pattern in path_str:
                return False

        # Check include patterns (extension-based)
        for pattern in self.include_patterns:
            if file_path.match(pattern):
                return True

        # NEW: Content-based inclusion for extensionless files
        if not file_path.suffix:
            return self._should_index_extensionless_file(file_path)

        return False

    def _should_index_extensionless_file(self, file_path: Path) -> bool:
        """Check if an extensionless file should be indexed based on content."""
        try:
            # Read first 1KB to check content
            with open(file_path, 'rb') as f:
                first_chunk = f.read(1024)

            # Check if it's a text file (not binary)
            try:
                text_content = first_chunk.decode('utf-8')
            except UnicodeDecodeError:
                return False  # Binary file, skip

            # Check for code indicators
            code_indicators = [
                '#!/usr/bin/env python', '#!/usr/bin/python', '#!.*python',
                'import ', 'from ', 'def ', 'class ', 'if __name__',
                'function ', 'var ', 'const ', 'let ', 'package main',
                'public class', 'private class', 'public static void'
            ]

            text_lower = text_content.lower()
            for indicator in code_indicators:
                if indicator in text_lower:
                    return True

            # Check for configuration files
            config_indicators = [
                '#!/bin/bash', '#!/bin/sh', '[', 'version =', 'name =',
                'description =', 'author =', '<configuration>', '<?xml'
            ]

            for indicator in config_indicators:
                if indicator in text_lower:
                    return True

            return False

        except Exception:
            return False

    def _needs_reindex(self, file_path: Path) -> bool:
        """Smart check if a file needs to be reindexed - optimized for speed."""
        file_str = normalize_relative_path(file_path, self.project_path)

        # Not in manifest - needs indexing
        if file_str not in self.manifest['files']:
            return True

        file_info = self.manifest['files'][file_str]

        try:
            stat = file_path.stat()

            # Quick checks first (no I/O) - check size and modification time
            stored_size = file_info.get('size', 0)
            stored_mtime = file_info.get('mtime', 0)

            current_size = stat.st_size
            current_mtime = stat.st_mtime

            # If size or mtime changed, definitely needs reindex
            if current_size != stored_size or current_mtime != stored_mtime:
                return True

            # Size and mtime same - check hash only if needed (for paranoia)
            # This catches cases where content changed but mtime didn't (rare but possible)
            current_hash = self._get_file_hash(file_path)
            stored_hash = file_info.get('hash', '')

            return current_hash != stored_hash

        except (OSError, IOError) as e:
            logger.warning(f"Could not check file stats for {file_path}: {e}")
            # If we can't check file stats, assume it needs reindex
            return True

    def _cleanup_removed_files(self):
        """Remove entries for files that no longer exist from manifest and database."""
        if 'files' not in self.manifest:
            return

        removed_files = []
        for file_str in list(self.manifest['files'].keys()):
            file_path = self.project_path / file_str
            if not file_path.exists():
                removed_files.append(file_str)

        if removed_files:
            logger.info(f"Cleaning up {len(removed_files)} removed files from index")

            for file_str in removed_files:
                # Remove from database
                try:
                    if hasattr(self, 'table') and self.table:
                        self.table.delete(f"file_path = '{file_str}'")
                        logger.debug(f"Removed chunks for deleted file: {file_str}")
                except Exception as e:
                    logger.warning(f"Could not remove chunks for {file_str}: {e}")

                # Remove from manifest
                del self.manifest['files'][file_str]

            # Save updated manifest
            self._save_manifest()
            logger.info(f"Cleanup complete - removed {len(removed_files)} files")

    def _get_files_to_index(self) -> List[Path]:
        """Get all files that need to be indexed."""
        files_to_index = []

        # Walk through project directory
        for root, dirs, files in os.walk(self.project_path):
            # Skip excluded directories
            dirs[:] = [d for d in dirs if not any(pattern in d for pattern in self.exclude_patterns)]

            root_path = Path(root)
            for file in files:
                file_path = root_path / file

                if self._should_index_file(file_path) and self._needs_reindex(file_path):
                    files_to_index.append(file_path)

        return files_to_index

    def _process_file(self, file_path: Path, stream_threshold: int = 1024 * 1024) -> Optional[List[Dict[str, Any]]]:
        """Process a single file: read, chunk, embed.

        Args:
            file_path: Path to the file to process
            stream_threshold: Files larger than this (in bytes) use streaming (default: 1MB)
        """
        try:
            # Check file size for streaming decision
            file_size = file_path.stat().st_size

            if file_size > stream_threshold:
                logger.info(f"Streaming large file ({file_size:,} bytes): {file_path}")
                content = self._read_file_streaming(file_path)
            else:
                # Read file content normally for small files
                content = file_path.read_text(encoding='utf-8')

            # Chunk the file
            chunks = self.chunker.chunk_file(file_path, content)

            if not chunks:
                return None

            # Prepare data for embedding
            chunk_texts = [chunk.content for chunk in chunks]

            # Generate embeddings
            embeddings = self.embedder.embed_code(chunk_texts)

            # Prepare records for database
            records = []
            expected_dim = self.embedder.get_embedding_dim()

            for i, chunk in enumerate(chunks):
                # Validate embedding
                embedding = embeddings[i].astype(np.float32)
                if embedding.shape != (expected_dim,):
                    raise ValueError(
                        f"Invalid embedding dimension for {file_path} chunk {i}: "
                        f"expected ({expected_dim},), got {embedding.shape}"
                    )

                record = {
                    'file_path': normalize_relative_path(file_path, self.project_path),
                    'absolute_path': normalize_path(file_path),
                    'chunk_id': f"{file_path.stem}_{i}",
                    'content': chunk.content,
                    'start_line': int(chunk.start_line),
                    'end_line': int(chunk.end_line),
                    'chunk_type': chunk.chunk_type,
                    'name': chunk.name or f"chunk_{i}",
                    'language': chunk.language,
                    'embedding': embedding,  # Keep as numpy array
                    'indexed_at': datetime.now().isoformat(),
                    # Add new metadata fields
                    'file_lines': int(chunk.file_lines) if chunk.file_lines else 0,
                    'chunk_index': int(chunk.chunk_index) if chunk.chunk_index is not None else i,
                    'total_chunks': int(chunk.total_chunks) if chunk.total_chunks else len(chunks),
                    'parent_class': chunk.parent_class or '',
                    'parent_function': chunk.parent_function or '',
                    'prev_chunk_id': chunk.prev_chunk_id or '',
                    'next_chunk_id': chunk.next_chunk_id or '',
                }
                records.append(record)

            # Update manifest with enhanced tracking
            file_str = normalize_relative_path(file_path, self.project_path)
            stat = file_path.stat()
            self.manifest['files'][file_str] = {
                'hash': self._get_file_hash(file_path),
                'size': stat.st_size,
                'mtime': stat.st_mtime,
                'chunks': len(chunks),
                'indexed_at': datetime.now().isoformat(),
                'language': chunks[0].language if chunks else 'unknown',
                'encoding': 'utf-8'  # Track encoding used
            }

            return records

        except Exception as e:
            logger.error(f"Failed to process {file_path}: {e}")
            return None

    def _read_file_streaming(self, file_path: Path, chunk_size: int = 64 * 1024) -> str:
        """
        Read large files in chunks to avoid loading entirely into memory.

        Args:
            file_path: Path to the file to read
            chunk_size: Size of each read chunk in bytes (default: 64KB)

        Returns:
            Complete file content as string
        """
        content_parts = []

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                while True:
                    chunk = f.read(chunk_size)
                    if not chunk:
                        break
                    content_parts.append(chunk)

            logger.debug(f"Streamed {len(content_parts)} chunks from {file_path}")
            return ''.join(content_parts)

        except UnicodeDecodeError:
            # Try with different encodings for problematic files
            for encoding in ['latin-1', 'cp1252', 'utf-8-sig']:
                try:
                    with open(file_path, 'r', encoding=encoding) as f:
                        content_parts = []
                        while True:
                            chunk = f.read(chunk_size)
                            if not chunk:
                                break
                            content_parts.append(chunk)

                    logger.debug(f"Streamed {len(content_parts)} chunks from {file_path} using {encoding}")
                    return ''.join(content_parts)
                except UnicodeDecodeError:
                    continue

            # If all encodings fail, return empty string
            logger.warning(f"Could not decode {file_path} with any encoding")
            return ""

    def _init_database(self):
        """Initialize LanceDB connection and table."""
        if not LANCEDB_AVAILABLE:
            logger.error("LanceDB is not available. Please install LanceDB for full indexing functionality.")
            logger.info("For Ollama-only mode, consider using hash-based embeddings instead.")
            raise ImportError("LanceDB dependency is required for indexing. Install with: pip install lancedb pyarrow")

        try:
            self.db = lancedb.connect(self.rag_dir)

            # Define schema with fixed-size vector
            embedding_dim = self.embedder.get_embedding_dim()
            schema = pa.schema([
                pa.field("file_path", pa.string()),
                pa.field("absolute_path", pa.string()),
                pa.field("chunk_id", pa.string()),
                pa.field("content", pa.string()),
                pa.field("start_line", pa.int32()),
                pa.field("end_line", pa.int32()),
                pa.field("chunk_type", pa.string()),
                pa.field("name", pa.string()),
                pa.field("language", pa.string()),
                pa.field("embedding", pa.list_(pa.float32(), embedding_dim)),  # Fixed-size list
                pa.field("indexed_at", pa.string()),
                # New metadata fields
                pa.field("file_lines", pa.int32()),
                pa.field("chunk_index", pa.int32()),
                pa.field("total_chunks", pa.int32()),
                pa.field("parent_class", pa.string(), nullable=True),
                pa.field("parent_function", pa.string(), nullable=True),
                pa.field("prev_chunk_id", pa.string(), nullable=True),
                pa.field("next_chunk_id", pa.string(), nullable=True),
            ])

            # Create or open table
            if "code_vectors" in self.db.table_names():
                try:
                    # Try to open existing table
                    self.table = self.db.open_table("code_vectors")

                    # Check if schema matches by trying to get the schema
                    existing_schema = self.table.schema

                    # Check if all required fields exist
                    required_fields = {field.name for field in schema}
                    existing_fields = {field.name for field in existing_schema}

                    if not required_fields.issubset(existing_fields):
                        # Schema mismatch - drop and recreate table
                        logger.warning("Schema mismatch detected. Dropping and recreating table.")
                        self.db.drop_table("code_vectors")
                        self.table = self.db.create_table("code_vectors", schema=schema)
                        logger.info("Recreated code_vectors table with updated schema")
                    else:
                        logger.info("Opened existing code_vectors table")
                except Exception as e:
                    logger.warning(f"Failed to open existing table: {e}. Recreating...")
                    if "code_vectors" in self.db.table_names():
                        self.db.drop_table("code_vectors")
                    self.table = self.db.create_table("code_vectors", schema=schema)
                    logger.info("Recreated code_vectors table")
            else:
                # Create empty table with schema
                self.table = self.db.create_table("code_vectors", schema=schema)
                logger.info(f"Created new code_vectors table with embedding dimension {embedding_dim}")

        except Exception as e:
            logger.error(f"Failed to initialize database: {e}")
            raise

    def index_project(self, force_reindex: bool = False) -> Dict[str, Any]:
        """
        Index the entire project.

        Args:
            force_reindex: If True, reindex all files regardless of changes

        Returns:
            Dictionary with indexing statistics
        """
        start_time = datetime.now()

        # Initialize database
        self._init_database()

        # Clean up removed files (essential for portability)
        if not force_reindex:
            self._cleanup_removed_files()

        # Clear manifest if force reindex
        if force_reindex:
            self.manifest = {
                'version': '1.0',
                'indexed_at': None,
                'file_count': 0,
                'chunk_count': 0,
                'files': {}
            }
            # Clear existing table
            if "code_vectors" in self.db.table_names():
                self.db.drop_table("code_vectors")
                self.table = None
                # Reinitialize the database to recreate the table
                self._init_database()

        # Get files to index
        files_to_index = self._get_files_to_index()

        if not files_to_index:
            console.print("[green][/green] All files are up to date!")
            return {
                'files_indexed': 0,
                'chunks_created': 0,
                'time_taken': 0,
            }

        console.print(f"[cyan]Found {len(files_to_index)} files to index[/cyan]")

        # Process files in parallel
        all_records = []
        failed_files = []

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
            TimeRemainingColumn(),
            console=console,
        ) as progress:

            task = progress.add_task(
                "[cyan]Indexing files...",
                total=len(files_to_index)
            )

            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                # Submit all files for processing
                future_to_file = {
                    executor.submit(self._process_file, file_path): file_path
                    for file_path in files_to_index
                }

                # Process completed files
                for future in as_completed(future_to_file):
                    file_path = future_to_file[future]

                    try:
                        records = future.result()
                        if records:
                            all_records.extend(records)
                    except Exception as e:
                        logger.error(f"Failed to process {file_path}: {e}")
                        failed_files.append(file_path)

                    progress.advance(task)

        # Batch insert all records
        if all_records:
            try:
                df = pd.DataFrame(all_records)
                # Ensure correct data types
                df["start_line"] = df["start_line"].astype("int32")
                df["end_line"] = df["end_line"].astype("int32")
                df["file_lines"] = df["file_lines"].astype("int32")
                df["chunk_index"] = df["chunk_index"].astype("int32")
                df["total_chunks"] = df["total_chunks"].astype("int32")

                # Table should already be created in _init_database
                if self.table is None:
                    raise RuntimeError("Table not initialized properly")

                self.table.add(df)

                console.print(f"[green][/green] Added {len(all_records)} chunks to database")
            except Exception as e:
                logger.error(f"Failed to insert records: {e}")
                raise

        # Update manifest
        self.manifest['indexed_at'] = datetime.now().isoformat()
        self.manifest['file_count'] = len(self.manifest['files'])
        self.manifest['chunk_count'] = sum(
            f['chunks'] for f in self.manifest['files'].values()
        )
        self._save_manifest()

        # Calculate statistics
        end_time = datetime.now()
        time_taken = (end_time - start_time).total_seconds()

        stats = {
            'files_indexed': len(files_to_index) - len(failed_files),
            'files_failed': len(failed_files),
            'chunks_created': len(all_records),
            'time_taken': time_taken,
            'files_per_second': len(files_to_index) / time_taken if time_taken > 0 else 0,
        }

        # Print summary
        console.print("\n[bold green]Indexing Complete![/bold green]")
        console.print(f"Files indexed: {stats['files_indexed']}")
        console.print(f"Chunks created: {stats['chunks_created']}")
        console.print(f"Time taken: {stats['time_taken']:.2f} seconds")
        console.print(f"Speed: {stats['files_per_second']:.1f} files/second")

        if failed_files:
            console.print(f"\n[yellow]Warning:[/yellow] {len(failed_files)} files failed to index")

        return stats

    def update_file(self, file_path: Path) -> bool:
        """
        Update index for a single file with proper vector multiply in/out.

        Args:
            file_path: Path to the file to update

        Returns:
            True if successful, False otherwise
        """
        try:
            # Make sure database is initialized
            if self.table is None:
                self._init_database()

            # Get normalized file path for consistent lookup
            file_str = normalize_relative_path(file_path, self.project_path)

            # Process the file to get new chunks
            records = self._process_file(file_path)

            if records:
                # Create DataFrame with proper types
                df = pd.DataFrame(records)
                df["start_line"] = df["start_line"].astype("int32")
                df["end_line"] = df["end_line"].astype("int32")
                df["file_lines"] = df["file_lines"].astype("int32")
                df["chunk_index"] = df["chunk_index"].astype("int32")
                df["total_chunks"] = df["total_chunks"].astype("int32")

                # Use vector store's update method (multiply out old, multiply in new)
                if hasattr(self, '_vector_store') and self._vector_store:
                    success = self._vector_store.update_file_vectors(file_str, df)
                else:
                    # Fallback: delete by file path and add new data
                    try:
                        self.table.delete(f"file = '{file_str}'")
                    except Exception as e:
                        logger.debug(f"Could not delete existing chunks (might not exist): {e}")
                    self.table.add(df)
                    success = True

                if success:
                    # Update manifest with enhanced file tracking
                    file_hash = self._get_file_hash(file_path)
                    stat = file_path.stat()
                    if 'files' not in self.manifest:
                        self.manifest['files'] = {}
                    self.manifest['files'][file_str] = {
                        'hash': file_hash,
                        'size': stat.st_size,
                        'mtime': stat.st_mtime,
                        'chunks': len(records),
                        'last_updated': datetime.now().isoformat(),
                        'language': records[0].get('language', 'unknown') if records else 'unknown',
                        'encoding': 'utf-8'
                    }
                    self._save_manifest()
                    logger.debug(f"Successfully updated {len(records)} chunks for {file_str}")
                    return True
            else:
                # File exists but has no processable content - remove existing chunks
                if hasattr(self, '_vector_store') and self._vector_store:
                    self._vector_store.delete_by_file(file_str)
                else:
                    try:
                        self.table.delete(f"file = '{file_str}'")
                    except Exception:
                        pass
                logger.debug(f"Removed chunks for empty/unprocessable file: {file_str}")
                return True

            return False

        except Exception as e:
            logger.error(f"Failed to update {file_path}: {e}")
            return False

    def delete_file(self, file_path: Path) -> bool:
        """
        Delete all chunks for a file from the index.

        Args:
            file_path: Path to the file to delete from index

        Returns:
            True if successful, False otherwise
        """
        try:
            if self.table is None:
                self._init_database()

            file_str = normalize_relative_path(file_path, self.project_path)

            # Delete from vector store
            if hasattr(self, '_vector_store') and self._vector_store:
                success = self._vector_store.delete_by_file(file_str)
            else:
                try:
                    self.table.delete(f"file = '{file_str}'")
                    success = True
                except Exception as e:
                    logger.error(f"Failed to delete {file_str}: {e}")
                    success = False

            # Update manifest
            if success and 'files' in self.manifest and file_str in self.manifest['files']:
                del self.manifest['files'][file_str]
                self._save_manifest()
                logger.debug(f"Deleted chunks for file: {file_str}")

            return success

        except Exception as e:
            logger.error(f"Failed to delete {file_path}: {e}")
            return False

    def get_statistics(self) -> Dict[str, Any]:
        """Get indexing statistics."""
        stats = {
            'project_path': str(self.project_path),
            'indexed_at': self.manifest.get('indexed_at', 'Never'),
            'file_count': self.manifest.get('file_count', 0),
            'chunk_count': self.manifest.get('chunk_count', 0),
            'index_size_mb': 0,
        }

        # Calculate index size
        try:
            db_path = self.rag_dir / 'code_vectors.lance'
            if db_path.exists():
                size_bytes = sum(f.stat().st_size for f in db_path.rglob('*') if f.is_file())
                stats['index_size_mb'] = size_bytes / (1024 * 1024)
        except:
            pass

        return stats