"""LLM-based email classifier.""" import logging import json import re from typing import Dict, List, Any, Optional from src.llm.base import BaseLLMProvider logger = logging.getLogger(__name__) class LLMClassifier: """ Email classifier using LLM for uncertain cases. Usage: - Only called for emails with low ML confidence - Batches emails for efficiency - Gracefully degrades if LLM unavailable """ def __init__( self, provider: BaseLLMProvider, categories: Dict[str, Dict], config: Dict[str, Any] ): """Initialize LLM classifier.""" self.provider = provider self.categories = categories self.config = config self.llm_available = provider.is_available() if not self.llm_available: logger.warning("LLM provider not available, LLM classification will be disabled") self.classification_prompt = self._load_prompt_template() def _load_prompt_template(self) -> str: """Load or create classification prompt.""" # Try to load from file try: with open('prompts/classification.txt', 'r') as f: return f.read() except FileNotFoundError: pass # Default prompt - optimized for caching (static instructions first) return """You are an expert email classifier. Analyze the email and classify it. INSTRUCTIONS: - Review the email content and available categories below - Select the single most appropriate category - Provide confidence score (0.0 to 1.0) - Give brief reasoning for your classification OUTPUT FORMAT: Respond with ONLY valid JSON (no markdown, no extra text): {{ "category": "category_name", "confidence": 0.95, "reasoning": "brief reason" }} CATEGORIES: {categories} EMAIL TO CLASSIFY: Subject: {subject} From: {sender} Has Attachments: {has_attachments} Body (first 300 chars): {body_snippet} ML Prediction: {ml_prediction} (confidence: {ml_confidence:.2f}) """ def classify(self, email: Dict[str, Any]) -> Dict[str, Any]: """ Classify email using LLM. 
    def classify(self, email: Dict[str, Any]) -> Dict[str, Any]:
        """
        Classify email using LLM.

        Args:
            email: Email data with subject, sender, body_snippet, ml_prediction

        Returns:
            Classification result with category, confidence, reasoning
        """
        if not self.llm_available:
            logger.warning("LLM not available, returning ML prediction")
            return {
                'category': email.get('ml_prediction', {}).get('category', 'unknown'),
                'confidence': 0.5,
                'reasoning': 'LLM not available, using ML prediction',
                'method': 'ml_fallback'
            }

        try:
            # Build prompt
            categories_str = "\n".join([
                f"- {name}: {info.get('description', 'N/A')}"
                for name, info in self.categories.items()
            ])

            ml_pred = email.get('ml_prediction', {})
            prompt = self.classification_prompt.format(
                categories=categories_str,
                subject=email.get('subject', 'N/A')[:100],
                sender=email.get('sender', 'N/A')[:50],
                has_attachments=email.get('has_attachments', False),
                body_snippet=email.get('body_snippet', '')[:300],
                ml_prediction=ml_pred.get('category', 'unknown'),
                ml_confidence=ml_pred.get('confidence', 0.0)
            )

            logger.debug(f"LLM classifying: {email.get('subject', 'No subject')[:50]}")

            # Get LLM response
            response = self.provider.complete(
                prompt,
                temperature=self.config.get('llm', {}).get('temperature', 0.1),
                max_tokens=self.config.get('llm', {}).get('max_tokens', 500)
            )

            # Parse response
            result = self._parse_response(response)
            result['method'] = 'llm'
            return result

        except Exception as e:
            logger.error(f"LLM classification failed: {e}")
            return {
                'category': 'unknown',
                'confidence': 0.5,
                'reasoning': f'LLM error: {str(e)[:100]}',
                'method': 'llm_error',
                'error': True
            }

    def classify_batch(self, emails: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Classify a batch of emails (individually for now; can be optimized later)."""
        results = []
        for email in emails:
            result = self.classify(email)
            results.append(result)
        return results

    def _parse_response(self, response: str) -> Dict[str, Any]:
        """Parse LLM JSON response."""
        try:
            # Try to extract JSON block
            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            if json_match:
                json_str = json_match.group()
                parsed = json.loads(json_str)
                return {
                    'category': parsed.get('category', 'unknown'),
                    'confidence': float(parsed.get('confidence', 0.5)),
                    'reasoning': parsed.get('reasoning', '')
                }
        except json.JSONDecodeError as e:
            logger.debug(f"JSON parsing error: {e}")
        except Exception as e:
            logger.debug(f"Response parsing error: {e}")

        # Fallback: structured output could not be parsed; keep a snippet of the
        # raw response as the reasoning so the failure is visible downstream.
        logger.warning("Failed to parse LLM response, using fallback result")
        logger.debug(f"Response was: {response[:200]}")
        return {
            'category': 'unknown',
            'confidence': 0.5,
            'reasoning': response[:100]
        }

    def get_status(self) -> Dict[str, Any]:
        """Get classifier status."""
        return {
            'llm_available': self.llm_available,
            'provider': self.provider.name if self.provider else 'none',
            'categories': len(self.categories),
            'status': 'ready' if self.llm_available else 'degraded'
        }
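

# ---------------------------------------------------------------------------
# Illustrative usage: a minimal sketch of how LLMClassifier is wired together,
# not part of the production pipeline. `_StubProvider` is a hypothetical
# duck-typed stand-in for a concrete BaseLLMProvider implementation (it only
# exposes the members this module actually calls: name, is_available(),
# complete()); the category names, config keys, and email fields below are
# example data, not a fixed schema.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    class _StubProvider:
        """Stand-in provider returning a fixed, well-formed JSON response."""

        name = "stub"

        def is_available(self) -> bool:
            return True

        def complete(self, prompt: str, temperature: float = 0.1, max_tokens: int = 500) -> str:
            # Echo a canned classification so the parsing path can be exercised.
            return '{"category": "invoices", "confidence": 0.9, "reasoning": "mentions an attached invoice"}'

    classifier = LLMClassifier(
        provider=_StubProvider(),
        categories={
            "invoices": {"description": "Bills, receipts, and payment requests"},
            "newsletters": {"description": "Recurring marketing or digest emails"},
        },
        config={"llm": {"temperature": 0.1, "max_tokens": 500}},
    )

    result = classifier.classify({
        "subject": "Invoice #1042 attached",
        "sender": "billing@example.com",
        "has_attachments": True,
        "body_snippet": "Please find attached the invoice for March.",
        "ml_prediction": {"category": "invoices", "confidence": 0.55},
    })
    print(result)
    print(classifier.get_status())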