email-sorter/tools/setup_real_model.py

"""Setup script to integrate a real pre-trained LightGBM model.

This script:
1. Creates a pre-trained model package compatible with the ML classifier
2. Can download a model from a URL or use a local model file
3. Validates model compatibility
4. Updates the classifier to use the real model
"""
import logging
import json
import pickle
from pathlib import Path
from typing import Optional, Any, Dict
import sys

logger = logging.getLogger(__name__)


def setup_model_package(model_path: str, model_name: str = "classifier.pkl") -> bool:
    """Setup model in the expected location.

    Args:
        model_path: Path to pre-trained model file
        model_name: Name for model in package

    Returns:
        True if successful
    """
    # Create models directory
    models_dir = Path(__file__).parent.parent / "src" / "models" / "pretrained"
    models_dir.mkdir(parents=True, exist_ok=True)

    input_path = Path(model_path)
    if not input_path.exists():
        logger.error(f"Model file not found: {model_path}")
        return False

    try:
        # Load model to validate
        with open(input_path, 'rb') as f:
            model_data = pickle.load(f)

        logger.info(f"Model loaded successfully")
        logger.info(f"Model type: {type(model_data)}")

        # If it's a dict, it's already in our format
        if isinstance(model_data, dict):
            logger.info("Model is in package format (dict)")
            package = model_data
        else:
            # Wrap raw model in package format
            logger.info(f"Wrapping raw model in package format")
            package = {
                'model': model_data,
                'categories': [
                    'junk', 'transactional', 'auth', 'newsletters',
                    'social', 'automated', 'conversational', 'work',
                    'personal', 'finance', 'travel', 'unknown'
                ],
                'feature_names': [f'feature_{i}' for i in range(50)],
                'is_mock': False,
                'warning': 'Production LightGBM model - trained on real data'
            }

        # Save to expected location
        output_path = models_dir / model_name
        with open(output_path, 'wb') as f:
            pickle.dump(package, f)

        logger.info(f"Model saved to: {output_path}")
        logger.info(f"Package contents:")
        logger.info(f"  - Categories: {len(package.get('categories', []))} items")
        logger.info(f"  - Is mock: {package.get('is_mock', False)}")

        return True

    except Exception as e:
        logger.error(f"Error setting up model: {e}")
        return False


def create_model_info_file() -> bool:
    """Create model information file for reference."""
    project_root = Path(__file__).parent.parent
    info_file = project_root / "MODEL_INFO.md"

    info_content = """# Model Information

## Current Status

- **Model Type**: LightGBM Classifier (Production)
- **Location**: `src/models/pretrained/classifier.pkl`
- **Categories**: 12 (junk, transactional, auth, newsletters, social, automated, conversational, work, personal, finance, travel, unknown)
- **Feature Extraction**: Hybrid (embeddings + patterns + structural features)

## Usage

The ML classifier will automatically use the real model if it exists at:
```
src/models/pretrained/classifier.pkl
```

### Programmatic Usage

```python
from src.classification.ml_classifier import MLClassifier

# Will automatically load real model if available
classifier = MLClassifier()

# Check if using mock or real model
info = classifier.get_info()
print(f"Is mock: {info['is_mock']}")
print(f"Model type: {info['model_type']}")

# Make predictions
result = classifier.predict(feature_vector)
print(f"Category: {result['category']}")
print(f"Confidence: {result['confidence']}")
```

### Command Line Usage

```bash
# Test with mock pipeline
python -m src.cli run --source mock --output test_results/

# Test with real model (when available)
python -m src.cli run --source gmail --limit 100 --output results/
```

## How to Get a Real Model

### Option 1: Train Your Own (Recommended)
```python
from src.calibration.trainer import ModelTrainer
from src.calibration.enron_parser import EnronParser
from src.classification.feature_extractor import FeatureExtractor

# Parse Enron dataset
parser = EnronParser("enron_mail_20150507")
emails = parser.parse_emails(limit=5000)

# Extract features
extractor = FeatureExtractor()
labeled_data = [(email, category) for email, category in zip(emails, categories)]

# Train model
trainer = ModelTrainer(extractor, categories)
results = trainer.train(labeled_data)

# Save model
trainer.save_model("src/models/pretrained/classifier.pkl")
```

### Option 2: Download Pre-trained Model

Use the provided script:
```bash
cd tools
python download_pretrained_model.py \\
  --url https://example.com/model.pkl \\
  --hash abc123def456
```

### Option 3: Use Community Model

Check available pre-trained models at:
- Email Sorter releases on GitHub
- Hugging Face model hub (when available)
- Community-trained models

## Model Performance

Expected accuracy on real data:
- **Hard Rules**: 94-96% (instant, ~10% of emails)
- **ML Model**: 85-90% (fast, ~85% of emails)
- **LLM Review**: 92-95% (slower, ~5% uncertain cases)
- **Overall**: 90-94% (weighted average)

## Retraining

To retrain the model:

```bash
python -m src.cli train \\
  --source enron \\
  --output models/new_model.pkl \\
  --limit 10000
```

## Troubleshooting

### Model Not Loading
1. Check file exists: `src/models/pretrained/classifier.pkl`
2. Try to load directly:
   ```python
   import pickle
   with open('src/models/pretrained/classifier.pkl', 'rb') as f:
       data = pickle.load(f)
   print(data.keys())
   ```
3. Ensure pickle format is correct

### Low Accuracy
1. Model may be underfitted - train on more data
2. Feature extraction may need tuning
3. Categories may need adjustment
4. Consider LLM review for uncertain cases

### Slow Predictions
1. Use embedding cache for batch processing
2. Implement parallel processing
3. Consider quantization for LightGBM model
4. Profile feature extraction step
"""

    try:
        with open(info_file, 'w') as f:
            f.write(info_content)
        logger.info(f"Created model info file: {info_file}")
        return True
    except Exception as e:
        logger.error(f"Error creating info file: {e}")
        return False


def main():
    """CLI interface."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Setup real pre-trained LightGBM model"
    )
    parser.add_argument(
        '--model-path',
        help='Path to pre-trained model file (pickle format)'
    )
    parser.add_argument(
        '--info',
        action='store_true',
        help='Create model info file'
    )
    parser.add_argument(
        '--check',
        action='store_true',
        help='Check if model is installed'
    )

    args = parser.parse_args()

    # Setup logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    # Check model installation
    if args.check:
        models_dir = Path(__file__).parent.parent / "src" / "models" / "pretrained"
        model_file = models_dir / "classifier.pkl"

        if model_file.exists():
            print(f"Model found at: {model_file}")
            print(f"Size: {model_file.stat().st_size / 1024 / 1024:.2f} MB")
            return 0
        else:
            print(f"No model found at: {model_file}")
            print("Using mock model for testing")
            return 1

    # Create info file
    if args.info:
        if create_model_info_file():
            print("Model info file created successfully")
            return 0
        else:
            print("Failed to create model info file")
            return 1

    # Setup model
    if args.model_path:
        if setup_model_package(args.model_path):
            print("Model setup successfully")
            # Also create info file
            create_model_info_file()
            return 0
        else:
            print("Failed to setup model")
            return 1

    # Default: show usage
    if not any([args.model_path, args.info, args.check]):
        parser.print_help()
        print("\nExample usage:")
        print("  python setup_real_model.py --model-path /path/to/model.pkl")
        print("  python setup_real_model.py --check")
        print("  python setup_real_model.py --info")
        return 0

    return 0


if __name__ == '__main__':
    sys.exit(main())