PHASE 9: Processing Pipeline & Queue Management (bulk_processor.py)
- BulkProcessor class for batch processing with checkpointing
- ProcessingCheckpoint: Save/resume state for resumable processing
- Handles batches with periodic checkpoints every N emails
- Tracks completed, queued_for_llm, and failed emails
- Progress callbacks for UI integration

PHASE 10: Calibration System (sampler.py, llm_analyzer.py)
- EmailSampler: Stratified and random sampling
- Stratifies by sender domain type for representativeness
- CalibrationAnalyzer: Use LLM to discover natural categories
- Batched analysis to control LLM load
- Maps discovered categories to universal schema

PHASE 11: Export & Reporting (exporter.py)
- ResultsExporter: Export to JSON, CSV, organized by category
- ReportGenerator: Generate human-readable text reports
- Category statistics and method breakdown
- Accuracy metrics and processing time tracking

PHASE 13: Enron Dataset Parser (enron_parser.py)
- Parses Enron maildir format into Email objects
- Handles multipart emails and attachments
- Date parsing with fallback for malformed dates
- Ready to train mock model on real data

PHASE 14: Main Orchestration (orchestration.py)
- EmailSorterOrchestrator: Coordinates entire pipeline
- 4-phase workflow: Calibration → Bulk → LLM → Export
- Lazy initialization of components
- Progress tracking and timing
- Full pipeline runner with resume support

Components Now Available:
✅ Sampling (stratified and random)
✅ Calibration (LLM-driven category discovery)
✅ Bulk processing (with checkpointing)
✅ LLM review (batched)
✅ Export (JSON, CSV, by category)
✅ Reporting (text summaries)
✅ Enron parsing (ready for training)
✅ Full orchestration (4 phases)

What's Left (Phases 15-16):
- E2E pipeline tests
- Integration test with Enron data
- Setup.py and wheel packaging
- Deployment documentation

Generated with Claude Code
Co-Authored-By: Claude <noreply@anthropic.com>
142 lines
4.5 KiB
Python
142 lines
4.5 KiB
Python
"""LLM-based calibration analysis."""
|
|
import logging
|
|
import json
|
|
import re
|
|
from typing import List, Dict, Any, Optional, Tuple
|
|
|
|
from src.email_providers.base import Email
|
|
from src.llm.base import BaseLLMProvider
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class CalibrationAnalyzer:
    """
    Use LLM to discover natural categories in email sample.

    This runs ONCE during calibration to understand what categories
    exist naturally in this inbox.

    Emails are sent to the LLM in batches; each batch returns a mapping of
    category names to descriptions plus a list of (email_id, category)
    labels, which are merged across batches.
    """

    def __init__(
        self,
        llm_provider: BaseLLMProvider,
        config: Dict[str, Any]
    ):
        """Initialize calibration analyzer.

        Args:
            llm_provider: Provider used for completions. Its availability is
                probed once here and cached in ``llm_available``.
            config: Configuration mapping, stored unchanged on the instance.
        """
        self.llm_provider = llm_provider
        self.config = config
        self.llm_available = llm_provider.is_available()

        if not self.llm_available:
            logger.warning("LLM not available for calibration analysis")

    def discover_categories(
        self,
        sample_emails: List[Email],
        batch_size: int = 20,
    ) -> Tuple[Dict[str, Any], List[Tuple[str, str]]]:
        """
        Discover natural categories in email sample.

        Args:
            sample_emails: Stratified sample of emails
            batch_size: Number of emails sent to the LLM per request

        Returns:
            (category_map, email_labels) where:
            - category_map: discovered categories with descriptions
            - email_labels: list of (email_id, assigned_category)

            When the LLM is unavailable the default category map is returned
            together with an empty label list.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        if not self.llm_available:
            logger.warning("LLM unavailable, using default categories")
            return self._default_categories(), []

        logger.info(
            "Starting LLM category discovery on %d emails", len(sample_emails)
        )

        discovered_categories: Dict[str, Any] = {}
        email_labels: List[Tuple[str, str]] = []

        for start in range(0, len(sample_emails), batch_size):
            batch = sample_emails[start:start + batch_size]

            # Best effort: one failing batch must not abort the whole
            # calibration run, so log it and move on to the next batch.
            try:
                batch_results = self._analyze_batch(batch)
            except Exception as e:
                logger.error("Error analyzing batch: %s", e)
                continue

            # First description seen for a category wins.
            for category, desc in batch_results.get('categories', {}).items():
                discovered_categories.setdefault(category, desc)

            email_labels.extend(
                (email_id, category)
                for email_id, category in batch_results.get('labels', [])
            )

        logger.info(
            "Discovery complete: %d categories found", len(discovered_categories)
        )

        return discovered_categories, email_labels

    @staticmethod
    def _summarize_email(position: int, email: Email) -> str:
        """Render one email as the text block shown to the LLM."""
        # NOTE(review): assumes Email exposes its identifier as `.id` --
        # confirm against src.email_providers.base. When it is missing, the
        # 1-based position inside the batch is used instead, which is only
        # unique within that batch.
        email_id = getattr(email, "id", None)
        reference = position if email_id is None else email_id
        # A missing snippet must not fail the whole batch.
        preview = (email.body_snippet or "")[:100]
        return (
            f"Email {position}:\n"
            f"  ID: {reference}\n"
            f"  From: {email.sender}\n"
            f"  Subject: {email.subject}\n"
            f"  Preview: {preview}...\n"
        )

    def _analyze_batch(self, batch: List[Email]) -> Dict[str, Any]:
        """Analyze single batch of emails.

        Returns a dict with 'categories' and 'labels'; both are empty when
        the LLM call or the response parsing fails.
        """
        email_summary = "\n".join(
            self._summarize_email(position, email)
            for position, email in enumerate(batch, start=1)
        )

        # The model can only echo identifiers it has actually been shown, so
        # every email carries an ID line and the prompt asks for that value.
        prompt = (
            "Analyze these emails and identify natural categories they belong to.\n"
            "For each email, assign ONE category. "
            "Create new categories as needed based on the emails.\n"
            'In "labels", identify each email by the exact value shown on its ID line.\n'
            "\n"
            "EMAILS:\n"
            f"{email_summary}\n"
            "\n"
            "Respond with JSON only:\n"
            "{\n"
            '  "categories": {"category_name": "brief description", ...},\n'
            '  "labels": [["<email ID>", "category_name"], '
            '["<email ID>", "category_name"], ...]\n'
            "}\n"
        )

        try:
            response = self.llm_provider.complete(
                prompt,
                temperature=0.1,
                max_tokens=1000
            )

            return self._parse_response(response)

        except Exception as e:
            logger.error("LLM analysis failed: %s", e)
            return {'categories': {}, 'labels': []}

    def _parse_response(self, response: str) -> Dict[str, Any]:
        """Parse LLM JSON response.

        Scans the response for the first JSON object that carries a
        'categories' or 'labels' key, so prose before or after the JSON
        (including stray braces) does not break parsing. The result always
        has a dict under 'categories' and a list of two-item entries under
        'labels'; anything unparseable yields the empty result.
        """
        if not isinstance(response, str):
            return {'categories': {}, 'labels': []}

        decoder = json.JSONDecoder()
        last_error = None
        start = response.find('{')

        while start != -1:
            try:
                candidate, _ = decoder.raw_decode(response, start)
            except json.JSONDecodeError as e:
                last_error = e
            else:
                if isinstance(candidate, dict) and (
                    'categories' in candidate or 'labels' in candidate
                ):
                    return self._normalize_result(candidate)
            start = response.find('{', start + 1)

        if last_error is not None:
            logger.debug("JSON parse error: %s", last_error)

        return {'categories': {}, 'labels': []}

    @staticmethod
    def _normalize_result(parsed: Dict[str, Any]) -> Dict[str, Any]:
        """Coerce a decoded payload into the shape the merge loop expects."""
        categories = parsed.get('categories')
        if not isinstance(categories, dict):
            categories = {}

        labels = parsed.get('labels')
        if not isinstance(labels, list):
            labels = []

        # Keep only well-formed [email_id, category] entries; anything else
        # would break tuple unpacking in the caller.
        pairs = [
            [entry[0], entry[1]]
            for entry in labels
            if isinstance(entry, (list, tuple)) and len(entry) == 2
        ]

        return {**parsed, 'categories': categories, 'labels': pairs}

    def _default_categories(self) -> Dict[str, Any]:
        """Return default categories."""
        return {
            'junk': 'Spam and unwanted emails',
            'transactional': 'Receipts and confirmations',
            'auth': 'Authentication and security',
            'newsletters': 'Newsletters and subscriptions',
            'work': 'Work correspondence',
            'personal': 'Personal emails',
            'finance': 'Financial documents',
            'unknown': 'Unclassified'
        }
|