# Fss-Rag-Mini/mini_rag/smart_chunking.py

"""
Smart language-aware chunking strategies for FSS-Mini-RAG.
Automatically adapts chunking based on file type and content patterns.
"""

from typing import Dict, Any, List
from pathlib import Path
import json

class SmartChunkingStrategy:
    """Intelligent chunking that adapts to file types and content.

    Holds a table of per-language chunking settings (``language_configs``)
    plus a fallback (``default_config``) and offers helpers to look up a
    size-adjusted config, decide whether a file should be skipped, and build
    a project-wide default configuration.
    """

    # Files smaller than this are skipped; the same value is reported as
    # "tiny_threshold" by get_smart_defaults so the two cannot drift apart.
    _TINY_FILE_THRESHOLD = 30

    def __init__(self):
        """Populate the per-language config table and the fallback config."""
        self.language_configs = {
            'python': {
                'max_size': 3000,  # Larger for better function context
                'min_size': 200,
                'strategy': 'function',
                'prefer_semantic': True
            },
            'javascript': {
                'max_size': 2500,
                'min_size': 150,
                'strategy': 'function',
                'prefer_semantic': True
            },
            'markdown': {
                'max_size': 2500,
                'min_size': 300,  # Larger minimum for complete thoughts
                'strategy': 'header',
                'preserve_structure': True
            },
            'json': {
                'max_size': 1000,  # Smaller for config files
                'min_size': 50,
                'skip_if_large': True,  # Skip huge config JSONs
                'max_file_size': 50000  # 50KB limit
            },
            'yaml': {
                'max_size': 1500,
                'min_size': 100,
                'strategy': 'key_block'
            },
            'text': {
                'max_size': 2000,
                'min_size': 200,
                'strategy': 'paragraph'
            },
            'bash': {
                'max_size': 1500,
                'min_size': 100,
                'strategy': 'function'
            }
        }

        # Smart defaults for unknown languages
        self.default_config = {
            'max_size': 2000,
            'min_size': 150,
            'strategy': 'semantic'
        }

    def get_config_for_language(self, language: str, file_size: int = 0) -> Dict[str, Any]:
        """Get optimal chunking config for a specific language.

        Args:
            language: Key into ``language_configs``; unknown languages fall
                back to ``default_config``.
            file_size: Size of the file; ``0`` (the default) or a negative
                value disables the size-based adjustments.

        Returns:
            A fresh dict the caller may modify freely; the stored configs are
            never touched.
        """
        # Copy so the adjustments below never leak into the shared table.
        config = self.language_configs.get(language, self.default_config).copy()

        # Smart adjustments based on file size
        if file_size > 0:
            if file_size < 500:  # Very small files
                # Halve the chunk ceiling but never drop below 200.
                config['max_size'] = max(config['max_size'] // 2, 200)
                config['min_size'] = 50
            elif file_size > 20000:  # Large files
                # Allow bigger chunks, capped at 4000.
                config['max_size'] = min(config['max_size'] + 1000, 4000)

        return config

    def should_skip_file(self, language: str, file_size: int) -> bool:
        """Determine if a file should be skipped entirely.

        Args:
            language: Language key of the file.
            file_size: Size of the file.

        Returns:
            ``True`` for JSON files larger than the configured
            ``max_file_size`` (when ``skip_if_large`` is set) and for any file
            smaller than the tiny-file threshold; ``False`` otherwise.
        """
        lang_config = self.language_configs.get(language, {})

        # Skip huge JSON config files
        if language == 'json' and lang_config.get('skip_if_large'):
            max_size = lang_config.get('max_file_size', 50000)
            if file_size > max_size:
                return True

        # Skip tiny files that won't provide good context
        return file_size < self._TINY_FILE_THRESHOLD

    def get_smart_defaults(self, project_stats: Dict[str, Any]) -> Dict[str, Any]:
        """Generate smart defaults based on project language distribution.

        Args:
            project_stats: Mapping read for two keys: ``'languages'``
                (language name -> file count) and ``'large_files'`` (count of
                large files). Both are optional; a missing or ``None``
                ``'languages'`` entry is treated as empty.

        Returns:
            A dict with ``"chunking"``, ``"streaming"`` and ``"files"``
            sections. The top-level chunking values come from the most common
            language (``'python'`` when no languages are given), and
            ``"language_specific"`` holds independent copies of the configs
            for every known language with a positive file count.
        """
        # `or {}` also covers an explicit None value under 'languages'.
        languages = project_stats.get('languages') or {}

        # Determine primary language
        primary_lang = max(languages.items(), key=lambda x: x[1])[0] if languages else 'python'
        primary_config = self.language_configs.get(primary_lang, self.default_config)

        # Smart streaming threshold based on large files
        large_files = project_stats.get('large_files', 0)
        streaming_threshold = 5120 if large_files > 5 else 1048576  # 5KB vs 1MB

        return {
            "chunking": {
                "max_size": primary_config['max_size'],
                "min_size": primary_config['min_size'],
                "strategy": primary_config.get('strategy', 'semantic'),
                # Hand out copies: returning the stored dicts would let a
                # caller editing the suggestion corrupt this instance's table.
                "language_specific": {
                    lang: dict(config)
                    for lang, config in self.language_configs.items()
                    if languages.get(lang, 0) > 0
                }
            },
            "streaming": {
                "enabled": True,
                "threshold_bytes": streaming_threshold,
                "chunk_size_kb": 64
            },
            "files": {
                "skip_tiny_files": True,
                "tiny_threshold": self._TINY_FILE_THRESHOLD,
                "smart_json_filtering": True
            }
        }

# Example usage
def analyze_and_suggest(manifest_data: Dict[str, Any]) -> Dict[str, Any]:
    """Summarise a project manifest and return a suggested configuration.

    Args:
        manifest_data: Mapping whose optional ``'files'`` entry maps each file
            to an info dict; only the ``'language'`` and ``'size'`` keys of
            each info dict are read.

    Returns:
        The result of ``SmartChunkingStrategy.get_smart_defaults`` for the
        statistics gathered from the manifest.
    """
    from collections import Counter

    entries = manifest_data.get('files', {})

    # Tally files per language; entries without a 'language' key count as 'unknown'.
    language_counts = Counter(
        entry.get('language', 'unknown') for entry in entries.values()
    )
    # An entry counts as large when its recorded 'size' exceeds 10000.
    big_file_count = sum(
        1 for entry in entries.values() if entry.get('size', 0) > 10000
    )

    project_stats = {
        'languages': dict(language_counts),
        'large_files': big_file_count,
        'total_files': len(entries),
    }
    return SmartChunkingStrategy().get_smart_defaults(project_stats)