"""LLM-based calibration analysis."""
import json
import logging
import re
from typing import Any, Dict, List, Optional, Tuple

from src.email_providers.base import Email
from src.llm.base import BaseLLMProvider

logger = logging.getLogger(__name__)


class CalibrationAnalyzer:
    """
    Use LLM to discover natural categories in email sample.

    This runs ONCE during calibration to understand what categories
    exist naturally in this inbox.
    """

    # Number of emails summarized into a single LLM prompt by default.
    DEFAULT_BATCH_SIZE = 20

    def __init__(
        self,
        llm_provider: BaseLLMProvider,
        config: Dict[str, Any]
    ):
        """Initialize calibration analyzer.

        Args:
            llm_provider: Provider used for completions; its ``is_available()``
                result is cached on the instance at construction time.
            config: Configuration mapping, stored as ``self.config``.
        """
        self.llm_provider = llm_provider
        self.config = config
        self.llm_available = llm_provider.is_available()

        if not self.llm_available:
            logger.warning("LLM not available for calibration analysis")

    def discover_categories(
        self,
        sample_emails: List[Email],
        batch_size: int = DEFAULT_BATCH_SIZE
    ) -> Tuple[Dict[str, Any], List[Tuple[str, str]]]:
        """
        Discover natural categories in email sample.

        Args:
            sample_emails: Stratified sample of emails
            batch_size: Number of emails sent to the LLM per request
                (defaults to 20, the previously hard-coded value)

        Returns:
            (category_map, email_labels) where:
            - category_map: discovered categories with descriptions
            - email_labels: list of (email_id, assigned_category)

            When the LLM is unavailable, the default category map and an
            empty label list are returned.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if not self.llm_available:
            logger.warning("LLM unavailable, using default categories")
            return self._default_categories(), []

        if batch_size < 1:
            # range() would raise on a zero step and silently yield nothing
            # on a negative one; fail loudly instead.
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        logger.info(
            "Starting LLM category discovery on %d emails", len(sample_emails)
        )

        discovered_categories: Dict[str, Any] = {}
        email_labels: List[Tuple[str, str]] = []

        for batch_start in range(0, len(sample_emails), batch_size):
            batch = sample_emails[batch_start:batch_start + batch_size]

            try:
                batch_results = self._analyze_batch(batch)
                # Stage the batch first so a malformed entry cannot leave a
                # half-merged batch in the accumulated results.
                new_categories = dict(batch_results.get('categories', {}))
                new_labels = [
                    (email_id, category)
                    for email_id, category in batch_results.get('labels', [])
                ]
            except Exception as e:
                # Best effort: one bad batch must not abort the whole run.
                logger.error("Error analyzing batch: %s", e)
                continue

            # First description seen for a category wins.
            for category, desc in new_categories.items():
                discovered_categories.setdefault(category, desc)
            email_labels.extend(new_labels)

        logger.info(
            "Discovery complete: %d categories found", len(discovered_categories)
        )

        return discovered_categories, email_labels

    def _analyze_batch(self, batch: List[Email]) -> Dict[str, Any]:
        """Analyze single batch of emails.

        Builds a text summary (sender, subject, first 100 characters of the
        body snippet) for every email, asks the LLM to categorize them and
        returns the parsed response.

        Returns:
            Dict with ``'categories'`` and ``'labels'`` keys; both are empty
            when the batch is empty or the LLM call fails.
        """
        if not batch:
            # Nothing to categorize; skip the LLM round trip.
            return {'categories': {}, 'labels': []}

        # NOTE(review): the prompt asks for email ids in "labels" but the
        # summary only exposes positional numbers ("Email 1", ...), so the
        # model cannot know real ids. Confirm which Email attribute holds the
        # id and include it in the summary.
        email_summary = "\n".join(
            f"Email {position}:\n"
            f" From: {email.sender}\n"
            f" Subject: {email.subject}\n"
            # A missing snippet is treated as empty instead of raising.
            f" Preview: {(email.body_snippet or '')[:100]}...\n"
            for position, email in enumerate(batch, start=1)
        )

        prompt = f"""Analyze these emails and identify natural categories they belong to.

For each email, assign ONE category. Create new categories as needed based on the emails.

EMAILS:
{email_summary}

Respond with JSON only:
{{
 "categories": {{"category_name": "brief description", ...}},
 "labels": [["email_1_id", "category_name"], ["email_2_id", "category_name"], ...]
}}
"""

        try:
            response = self.llm_provider.complete(
                prompt,
                temperature=0.1,
                max_tokens=1000
            )
            return self._parse_response(response)
        except Exception as e:
            logger.error("LLM analysis failed: %s", e)
            return {'categories': {}, 'labels': []}

    def _parse_response(self, response: str) -> Dict[str, Any]:
        """Parse LLM JSON response.

        Scans the text for the first JSON object that carries a
        ``'categories'`` or ``'labels'`` key, so prose (even prose containing
        braces) before or after the JSON does not break parsing.

        Returns:
            Dict with ``'categories'`` (a dict) and ``'labels'`` (a list of
            two-item lists). Both are empty when nothing usable is found.
        """
        if not isinstance(response, str):
            logger.debug(
                "Unexpected LLM response type: %s", type(response).__name__
            )
            return {'categories': {}, 'labels': []}

        decoder = json.JSONDecoder()
        start = response.find('{')
        while start != -1:
            try:
                # raw_decode stops at the end of the first complete JSON
                # value, ignoring whatever text follows it.
                parsed, _ = decoder.raw_decode(response, start)
            except json.JSONDecodeError as e:
                logger.debug("JSON parse error: %s", e)
            else:
                if isinstance(parsed, dict) and (
                    'categories' in parsed or 'labels' in parsed
                ):
                    return self._normalize_result(parsed)
            start = response.find('{', start + 1)

        return {'categories': {}, 'labels': []}

    @staticmethod
    def _normalize_result(parsed: Dict[str, Any]) -> Dict[str, Any]:
        """Coerce a decoded response into the expected result shape.

        A non-dict ``'categories'`` value becomes ``{}``; ``'labels'`` entries
        that are not two-item sequences are dropped.
        """
        raw_categories = parsed.get('categories')
        raw_labels = parsed.get('labels')

        categories = dict(raw_categories) if isinstance(raw_categories, dict) else {}
        labels = [
            [entry[0], entry[1]]
            for entry in (raw_labels if isinstance(raw_labels, list) else [])
            if isinstance(entry, (list, tuple)) and len(entry) == 2
        ]
        return {'categories': categories, 'labels': labels}

    def _default_categories(self) -> Dict[str, Any]:
        """Return default categories.

        A new dict mapping category name to description is built on every
        call, so callers may mutate the result freely.
        """
        return {
            'junk': 'Spam and unwanted emails',
            'transactional': 'Receipts and confirmations',
            'auth': 'Authentication and security',
            'newsletters': 'Newsletters and subscriptions',
            'work': 'Work correspondence',
            'personal': 'Personal emails',
            'finance': 'Financial documents',
            'unknown': 'Unclassified'
        }