# email-sorter/src/calibration/llm_analyzer.py

"""LLM-based calibration analysis."""
import logging
import json
import re
from typing import List, Dict, Any, Optional, Tuple

from src.email_providers.base import Email
from src.llm.base import BaseLLMProvider

logger = logging.getLogger(__name__)


class CalibrationAnalyzer:
    """
    Use LLM to discover natural categories in email sample.

    This runs ONCE during calibration to understand what categories
    exist naturally in this inbox.

    Emails are sent to the LLM in batches. Each prompt lists every email
    together with an identifier, and the labels that come back are mapped
    onto those identifiers so callers receive ``(email_id, category)`` pairs
    that refer to emails actually present in the sample.
    """

    # Number of emails sent to the LLM per request unless the caller overrides it.
    DEFAULT_BATCH_SIZE = 20

    # Maximum number of body-snippet characters shown to the LLM per email.
    PREVIEW_CHARS = 100

    # Positional references an LLM may return instead of a real id,
    # e.g. "1", "Email 2", "email_3_id".
    _POSITION_REF = re.compile(
        r'(?:email[\s_#-]*)?(\d+)(?:[\s_-]*id)?',
        re.IGNORECASE
    )

    def __init__(
        self,
        llm_provider: BaseLLMProvider,
        config: Dict[str, Any]
    ):
        """
        Initialize calibration analyzer.

        Args:
            llm_provider: Provider used for completions; its availability is
                checked once here and cached in ``llm_available``.
            config: Configuration mapping, stored as ``self.config``.
        """
        self.llm_provider = llm_provider
        self.config = config
        self.llm_available = llm_provider.is_available()

        if not self.llm_available:
            logger.warning("LLM not available for calibration analysis")

    def discover_categories(
        self,
        sample_emails: List[Email],
        batch_size: Optional[int] = None
    ) -> Tuple[Dict[str, Any], List[Tuple[str, str]]]:
        """
        Discover natural categories in email sample.

        Args:
            sample_emails: Stratified sample of emails
            batch_size: Emails per LLM request; defaults to
                ``DEFAULT_BATCH_SIZE`` when omitted.

        Returns:
            (category_map, email_labels) where:
            - category_map: discovered categories with descriptions
            - email_labels: list of (email_id, assigned_category)

            When the LLM is unavailable the default category map is returned
            with an empty label list.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size is None:
            batch_size = self.DEFAULT_BATCH_SIZE
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        if not self.llm_available:
            logger.warning("LLM unavailable, using default categories")
            return self._default_categories(), []

        logger.info(f"Starting LLM category discovery on {len(sample_emails)} emails")

        discovered_categories: Dict[str, Any] = {}
        email_labels: List[Tuple[str, str]] = []

        for batch_idx in range(0, len(sample_emails), batch_size):
            batch = sample_emails[batch_idx:batch_idx + batch_size]

            # Best effort: a failing batch is logged and skipped so the
            # remaining batches can still contribute categories.
            try:
                batch_results = self._analyze_batch(batch)

                # First description seen for a category wins.
                for category, desc in batch_results.get('categories', {}).items():
                    discovered_categories.setdefault(category, desc)

                for email_id, category in batch_results.get('labels', []):
                    email_labels.append((email_id, category))

            except Exception as e:
                logger.error(f"Error analyzing batch: {e}")

        logger.info(f"Discovery complete: {len(discovered_categories)} categories found")

        return discovered_categories, email_labels

    def _analyze_batch(self, batch: List[Email]) -> Dict[str, Any]:
        """
        Analyze single batch of emails.

        Builds a prompt listing each email (with its identifier), asks the
        LLM for categories and per-email labels, and maps the returned labels
        back onto the identifiers of the emails in ``batch``.

        Returns:
            Dict with ``'categories'`` (name -> description) and ``'labels'``
            (list of ``(email_id, category)`` tuples). Both are empty when
            the LLM call or parsing fails.
        """
        email_ids = [
            self._email_id(email, position)
            for position, email in enumerate(batch, start=1)
        ]

        entries = []
        for position, (email, email_id) in enumerate(zip(batch, email_ids), start=1):
            # A missing snippet must not abort the whole batch.
            preview = (email.body_snippet or "")[:self.PREVIEW_CHARS]
            entries.append(
                f"Email {position}:\n"
                f"  ID: {email_id}\n"
                f"  From: {email.sender}\n"
                f"  Subject: {email.subject}\n"
                f"  Preview: {preview}...\n"
            )
        email_summary = "\n".join(entries)

        prompt = f"""Analyze these emails and identify natural categories they belong to.
For each email, assign ONE category. Create new categories as needed based on the emails.

EMAILS:
{email_summary}

Respond with JSON only, using each email's ID value exactly as shown above:
{{
  "categories": {{"category_name": "brief description", ...}},
  "labels": [["<ID of email 1>", "category_name"], ["<ID of email 2>", "category_name"], ...]
}}
"""

        try:
            response = self.llm_provider.complete(
                prompt,
                temperature=0.1,
                max_tokens=1000
            )

            parsed = self._parse_response(response)
            return {
                'categories': parsed.get('categories', {}),
                'labels': self._resolve_labels(parsed.get('labels', []), email_ids),
            }

        except Exception as e:
            logger.error(f"LLM analysis failed: {e}")
            return {'categories': {}, 'labels': []}

    @staticmethod
    def _email_id(email: Any, position: int) -> Any:
        """
        Return the identifier shown to the LLM for ``email``.

        NOTE(review): assumes Email exposes its identifier as ``id`` - TODO
        confirm against src.email_providers.base. When the attribute is
        missing or None, a positional placeholder is used instead.
        """
        email_id = getattr(email, 'id', None)
        if email_id is None:
            return f"email_{position}"
        return email_id

    def _resolve_labels(
        self,
        labels: List[Any],
        email_ids: List[Any]
    ) -> List[Tuple[Any, str]]:
        """
        Map LLM label references onto the identifiers of the current batch.

        A reference is accepted when it equals one of ``email_ids`` (compared
        as strings) or when it is a 1-based positional reference such as
        ``"2"`` or ``"Email 2"`` within the batch. Anything else is dropped,
        so invented identifiers never reach the caller.
        """
        known = {str(email_id): email_id for email_id in email_ids}
        resolved = []

        for ref, category in labels:
            key = str(ref).strip()

            if key in known:
                resolved.append((known[key], category))
                continue

            match = self._POSITION_REF.fullmatch(key)
            position = int(match.group(1)) if match else 0
            if 1 <= position <= len(email_ids):
                resolved.append((email_ids[position - 1], category))
            else:
                logger.debug(f"Dropping label for unknown email reference: {ref!r}")

        return resolved

    def _parse_response(self, response: str) -> Dict[str, Any]:
        """
        Parse LLM JSON response.

        Scans ``response`` for the first JSON object that carries a
        ``categories`` or ``labels`` key, tolerating surrounding prose or
        code fences, and normalises it.

        Returns:
            Dict with exactly two keys: ``'categories'`` (a dict, empty when
            missing or not an object) and ``'labels'`` (a list of
            ``[reference, category]`` pairs; malformed entries are skipped).
            Both are empty when no usable JSON object is found.
        """
        result: Dict[str, Any] = {'categories': {}, 'labels': []}

        if not isinstance(response, str):
            return result

        decoder = json.JSONDecoder()
        payload = None
        start = response.find('{')
        while start != -1:
            # raw_decode stops at the end of the object, so trailing text
            # (even text containing braces) does not break parsing.
            try:
                candidate, _ = decoder.raw_decode(response, start)
            except json.JSONDecodeError as e:
                logger.debug(f"JSON parse error: {e}")
            else:
                if isinstance(candidate, dict) and (
                    'categories' in candidate or 'labels' in candidate
                ):
                    payload = candidate
                    break
            start = response.find('{', start + 1)

        if payload is None:
            return result

        raw_categories = payload.get('categories')
        if isinstance(raw_categories, dict):
            result['categories'] = dict(raw_categories)

        raw_labels = payload.get('labels')
        if isinstance(raw_labels, dict):
            # Some models answer with {"id": "category"} instead of pairs.
            label_entries = list(raw_labels.items())
        elif isinstance(raw_labels, list):
            label_entries = raw_labels
        else:
            label_entries = []

        for entry in label_entries:
            if not isinstance(entry, (list, tuple)) or len(entry) != 2:
                continue
            ref, category = entry
            ref_ok = isinstance(ref, (str, int)) and not isinstance(ref, bool)
            if ref_ok and isinstance(category, str) and category.strip():
                result['labels'].append([ref, category])

        return result

    def _default_categories(self) -> Dict[str, Any]:
        """Return default categories, used when the LLM is unavailable."""
        return {
            'junk': 'Spam and unwanted emails',
            'transactional': 'Receipts and confirmations',
            'auth': 'Authentication and security',
            'newsletters': 'Newsletters and subscriptions',
            'work': 'Work correspondence',
            'personal': 'Personal emails',
            'finance': 'Financial documents',
            'unknown': 'Unclassified'
        }