Add category caching system and analytical data to prompts

Category Cache System (src/calibration/category_cache.py):
- Persistent storage of discovered categories across mailbox runs
- Semantic matching to snap new categories to existing ones
- Usage tracking for category popularity
- Configurable similarity threshold and new category limits
- JSON-based cache with metadata (created, last_seen, email counts)
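For reference, the on-disk layout written by CategoryCache.save() (field names taken from the code in this commit; the single category entry is purely illustrative):

{
  "version": "1.0",
  "updated": "2025-10-23T14:25:41",
  "categories": {
    "Newsletters": {
      "description": "Bulk mailing-list and marketing email",
      "created": "2025-10-20T09:12:00",
      "last_seen": "2025-10-23T14:25:41",
      "total_emails": 128
    }
  }
}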

Discovery Improvements (src/calibration/llm_analyzer.py):
- Calculate batch statistics: sender domains, recipient counts,
  attachments, subject lengths, common keywords
- Add statistics to LLM discovery prompt for better decisions
- Integrate CategoryCache into CalibrationAnalyzer
- 3-step workflow: Discover → Consolidate → Snap to Cache
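Sketched end to end, the workflow looks roughly like the following; the two LLM-backed steps are stand-in stubs for illustration only, and just the CategoryCache calls mirror the code added in this commit:

from src.calibration.category_cache import CategoryCache

def llm_discover(emails):
    # Stub for step 1: batched LLM discovery, with batch statistics in the prompt.
    return {"Newsletters": "Bulk mailing-list and marketing email"}, [("msg-1", "Newsletters")]

def llm_consolidate(categories):
    # Stub for step 2: the LLM merges overlapping categories, nudged by cached hints.
    return categories

cache = CategoryCache()
categories, labels = llm_discover([])
categories = llm_consolidate(categories)

# Step 3: snap to the persistent cache, relabel emails, record usage.
final_cats, mapping = cache.snap_to_cache(categories, similarity_threshold=0.7,
                                          allow_new=True, max_new=3)
labels = [(eid, mapping.get(cat, cat)) for eid, cat in labels]
counts = {}
for _, cat in labels:
    counts[cat] = counts.get(cat, 0) + 1
cache.update_cache(final_cats, counts)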

Consolidation Improvements:
- Add cached categories as hints in consolidation prompt
- LLM prefers snapping to established categories
- Maintains cross-mailbox consistency while allowing new categories
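The hint block prepended to the consolidation prompt has this shape (wording taken from the diff below; the count and the two entries are placeholders):

CACHED CATEGORIES (N total, showing top 15):
These are established categories from previous mailboxes. PREFER consolidating to these
when semantically appropriate to maintain cross-mailbox consistency.
  - Newsletters: Bulk mailing-list and marketing email
  - Receipts: Order confirmations and payment receipts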

Configuration Parameters:
- use_category_cache: Enable/disable caching (default: true)
- cache_similarity_threshold: Min similarity for snap (default: 0.7)
- cache_allow_new: Allow new categories (default: true)
- cache_max_new: Max new categories per run (default: 3)
- category_cache_path: Custom cache location
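These keys are read with config.get() inside CalibrationAnalyzer; shown here as a plain Python dict since the commit does not show where the configuration is loaded from:

calibration_config = {
    "use_category_cache": True,
    "cache_similarity_threshold": 0.7,   # minimum similarity before snapping to a cached category
    "cache_allow_new": True,
    "cache_max_new": 3,                  # cap on brand-new categories per run
    "category_cache_path": "src/models/category_cache.json",
}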

Result: Consistent category sets across different mailboxes
with intelligent discovery of new categories when appropriate.
FSSCoding 2025-10-23 14:25:41 +11:00
parent 183b12c9b4
commit 874caf38bc
2 changed files with 325 additions and 0 deletions

src/calibration/category_cache.py

@@ -0,0 +1,231 @@
"""
Category cache system for consistent categorization across mailboxes.
Stores discovered categories and provides semantic matching to snap
new discoveries to existing categories for cross-mailbox consistency.
"""
import json
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from datetime import datetime
logger = logging.getLogger(__name__)
class CategoryCache:
"""
Manages cached categories for consistent email classification.
Features:
- Persistent storage of discovered categories
- Semantic matching to snap new categories to cached ones
- Usage tracking for category popularity
- Support for mailbox-specific overrides
"""
def __init__(self, cache_path: str = "src/models/category_cache.json"):
self.cache_path = Path(cache_path)
self.cache: Dict[str, dict] = {}
self.load()
def load(self) -> None:
"""Load category cache from disk."""
if self.cache_path.exists():
try:
with open(self.cache_path, 'r') as f:
data = json.load(f)
self.cache = data.get('categories', {})
logger.info(f"Loaded {len(self.cache)} cached categories from {self.cache_path}")
except Exception as e:
logger.error(f"Failed to load category cache: {e}")
self.cache = {}
else:
logger.info("No category cache found, starting fresh")
self.cache = {}
def save(self) -> None:
"""Save category cache to disk."""
try:
self.cache_path.parent.mkdir(parents=True, exist_ok=True)
data = {
'version': '1.0',
'updated': datetime.now().isoformat(),
'categories': self.cache
}
with open(self.cache_path, 'w') as f:
json.dump(data, f, indent=2)
logger.info(f"Saved {len(self.cache)} categories to cache")
except Exception as e:
logger.error(f"Failed to save category cache: {e}")
def get_cached_categories(self) -> Dict[str, str]:
"""Get all cached categories as {name: description}."""
return {name: info['description'] for name, info in self.cache.items()}
def snap_to_cache(
self,
discovered: Dict[str, str],
similarity_threshold: float = 0.7,
allow_new: bool = True,
max_new: int = 3
) -> Tuple[Dict[str, str], Dict[str, str]]:
"""
Snap discovered categories to cached ones using semantic similarity.
Args:
discovered: Newly discovered categories {name: description}
similarity_threshold: Minimum similarity to match (0-1)
allow_new: Whether to allow new categories not in cache
max_new: Maximum new categories to add per run
Returns:
(snapped_categories, mapping) where:
- snapped_categories: Final category set (from cache + new)
- mapping: {discovered_name: final_name} for all discovered
"""
if not self.cache:
# No cache yet, return discovered as-is
logger.info("Empty cache, using all discovered categories")
return discovered, {name: name for name in discovered}
snapped = {}
mapping = {}
new_categories = []
cached_cats = self.get_cached_categories()
for disc_name, disc_desc in discovered.items():
# Try to find best match in cache
best_match, best_score = self._find_best_match(
disc_name, disc_desc, cached_cats
)
if best_score >= similarity_threshold:
# Snap to cached category
mapping[disc_name] = best_match
if best_match not in snapped:
snapped[best_match] = cached_cats[best_match]
logger.debug(f"Snapped '{disc_name}''{best_match}' (similarity: {best_score:.2f})")
else:
# No good match found
if allow_new and len(new_categories) < max_new:
# Allow as new category
new_categories.append((disc_name, disc_desc))
mapping[disc_name] = disc_name
snapped[disc_name] = disc_desc
logger.info(f"New category: '{disc_name}' (no cache match, score: {best_score:.2f})")
else:
# Force snap to best match even if below threshold
if best_match:
mapping[disc_name] = best_match
if best_match not in snapped:
snapped[best_match] = cached_cats[best_match]
logger.warning(f"Forced snap '{disc_name}''{best_match}' (low similarity: {best_score:.2f})")
else:
# Fallback to first cached category
fallback = list(cached_cats.keys())[0]
mapping[disc_name] = fallback
if fallback not in snapped:
snapped[fallback] = cached_cats[fallback]
logger.warning(f"Fallback: '{disc_name}''{fallback}'")
logger.info(f"Snapping result: {len(snapped)} final categories ({len(new_categories)} new)")
return snapped, mapping
def _find_best_match(
self,
name: str,
description: str,
cached: Dict[str, str]
) -> Tuple[Optional[str], float]:
"""
Find best matching cached category using simple similarity.
Uses exact name match, keyword overlap, and description similarity.
Returns (best_category_name, similarity_score).
"""
if not cached:
return None, 0.0
name_lower = name.lower()
desc_words = set(description.lower().split())
best_match = None
best_score = 0.0
for cached_name, cached_desc in cached.items():
score = 0.0
# Exact name match
if name_lower == cached_name.lower():
score = 1.0
# Partial name match
elif name_lower in cached_name.lower() or cached_name.lower() in name_lower:
score = 0.8
# Keyword overlap
else:
cached_words = set(cached_desc.lower().split())
common_words = desc_words & cached_words
if desc_words:
overlap = len(common_words) / len(desc_words)
score = overlap * 0.6 # Max 0.6 from keyword overlap
if score > best_score:
best_score = score
best_match = cached_name
return best_match, best_score
def update_cache(
self,
categories: Dict[str, str],
usage_count: Optional[Dict[str, int]] = None
) -> None:
"""
Update cache with new/refined categories.
Args:
categories: Categories to add/update {name: description}
usage_count: Optional email counts per category
"""
for name, desc in categories.items():
if name in self.cache:
# Update existing
self.cache[name]['description'] = desc
self.cache[name]['last_seen'] = datetime.now().isoformat()
if usage_count and name in usage_count:
self.cache[name]['total_emails'] = self.cache[name].get('total_emails', 0) + usage_count[name]
else:
# Add new
self.cache[name] = {
'description': desc,
'created': datetime.now().isoformat(),
'last_seen': datetime.now().isoformat(),
'total_emails': usage_count.get(name, 0) if usage_count else 0
}
self.save()
logger.info(f"Updated cache with {len(categories)} categories")
def get_stats(self) -> Dict:
"""Get cache statistics."""
if not self.cache:
return {'total_categories': 0}
total_emails = sum(info.get('total_emails', 0) for info in self.cache.values())
sorted_by_usage = sorted(
self.cache.items(),
key=lambda x: x[1].get('total_emails', 0),
reverse=True
)
return {
'total_categories': len(self.cache),
'total_emails_classified': total_emails,
'top_categories': [
(name, info.get('total_emails', 0))
for name, info in sorted_by_usage[:10]
]
}
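A short, hypothetical usage example of the class above, showing one discovered name snapping onto an established category and another being kept as new:

cache = CategoryCache("/tmp/category_cache.json")   # throwaway path for the example
cache.update_cache({"Work": "Internal project and team email",
                    "Receipts": "Order confirmations and payment receipts"})

discovered = {"Work Projects": "Emails about internal project work",
              "Travel": "Flight and hotel bookings"}
snapped, mapping = cache.snap_to_cache(discovered, similarity_threshold=0.7,
                                       allow_new=True, max_new=3)
# "Work Projects" contains the cached name "Work", so the partial-name rule
# scores it 0.8 and it snaps to "Work"; "Travel" has no match above 0.7 and
# is kept as a new category because allow_new is True.
print(mapping)        # {'Work Projects': 'Work', 'Travel': 'Travel'}
print(len(snapped))   # 2: 'Work' (from cache) and 'Travel' (new)
# Note: snapping alone does not modify the cache; a follow-up update_cache()
# call is what records 'Travel' and the usage counts.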

src/calibration/llm_analyzer.py

@@ -6,6 +6,7 @@ from typing import List, Dict, Any, Optional, Tuple
from src.email_providers.base import Email
from src.llm.base import BaseLLMProvider
from src.calibration.category_cache import CategoryCache
logger = logging.getLogger(__name__)
@@ -28,6 +29,10 @@ class CalibrationAnalyzer:
self.config = config
self.llm_available = llm_provider.is_available()
# Initialize category cache for cross-mailbox consistency
cache_path = config.get('category_cache_path', 'src/models/category_cache.json')
self.category_cache = CategoryCache(cache_path)
if not self.llm_available:
logger.warning("LLM not available for calibration analysis")
@@ -91,10 +96,82 @@ class CalibrationAnalyzer:
else:
logger.warning("Consolidation didn't reduce categories, keeping original")
# Step 3: Snap to cached categories for cross-mailbox consistency
use_cache = self.config.get('use_category_cache', True)
if use_cache and self.category_cache:
similarity_threshold = self.config.get('cache_similarity_threshold', 0.7)
allow_new = self.config.get('cache_allow_new', True)
max_new = self.config.get('cache_max_new', 3)
logger.info(f"Snapping to cached categories (threshold={similarity_threshold}, allow_new={allow_new}, max_new={max_new})")
final_categories, snap_mapping = self.category_cache.snap_to_cache(
discovered_categories,
similarity_threshold=similarity_threshold,
allow_new=allow_new,
max_new=max_new
)
# Update email labels with snapped categories
for i, (email_id, old_cat) in enumerate(email_labels):
if old_cat in snap_mapping:
email_labels[i] = (email_id, snap_mapping[old_cat])
logger.info(f"After cache snap: {len(final_categories)} categories")
discovered_categories = final_categories
# Update cache with usage counts
category_counts = {}
for _, cat in email_labels:
category_counts[cat] = category_counts.get(cat, 0) + 1
self.category_cache.update_cache(discovered_categories, category_counts)
return discovered_categories, email_labels
def _analyze_batch(self, batch: List[Email], batch_idx: int = 0) -> Dict[str, Any]:
"""Analyze single batch of emails."""
# Calculate analytical patterns
sender_domains = {}
recipients_count = []
has_attachments = 0
avg_subject_len = 0
common_keywords = {}
for e in batch:
# Domain analysis
if '@' in e.sender:
domain = e.sender.split('@')[1].lower()
sender_domains[domain] = sender_domains.get(domain, 0) + 1
# Recipient count
recipient_count = len(e.recipients) if hasattr(e, 'recipients') else 1
recipients_count.append(recipient_count)
# Attachments
if hasattr(e, 'has_attachments') and e.has_attachments:
has_attachments += 1
# Subject length
avg_subject_len += len(e.subject)
# Extract keywords from subject (simple word frequency)
words = e.subject.lower().split()
for word in words:
if len(word) > 3: # Skip short words
common_keywords[word] = common_keywords.get(word, 0) + 1
# Build statistics summary
top_domains = sorted(sender_domains.items(), key=lambda x: x[1], reverse=True)[:5]
top_keywords = sorted(common_keywords.items(), key=lambda x: x[1], reverse=True)[:10]
avg_recipients = sum(recipients_count) / len(recipients_count) if recipients_count else 0
avg_subject_len = avg_subject_len / len(batch) if batch else 0
stats_summary = f"""BATCH STATISTICS ({len(batch)} emails):
- Top sender domains: {', '.join([f'{d} ({c})' for d, c in top_domains])}
- Avg recipients per email: {avg_recipients:.1f}
- Emails with attachments: {has_attachments}/{len(batch)}
- Avg subject length: {avg_subject_len:.0f} chars
- Common keywords: {', '.join([f'{w}({c})' for w, c in top_keywords[:5]])}"""
# Build email summary with actual IDs
email_list = []
for i, e in enumerate(batch):
@@ -116,6 +193,8 @@ GUIDELINES FOR GOOD CATEGORIES:
- FUNCTIONAL: Each category serves a distinct purpose
- 3-10 categories ideal: Too many = noise, too few = useless
{stats_summary}
EMAILS TO ANALYZE:
{email_summary}
@@ -251,6 +330,21 @@ JSON:
# Build context section
context_parts = []
# Add cached categories as consolidation hints
if self.category_cache:
cached_cats = self.category_cache.get_cached_categories()
if cached_cats:
cache_stats = self.category_cache.get_stats()
cache_list = "\n".join([
f" - {name}: {desc}"
for name, desc in list(cached_cats.items())[:15] # Show top 15
])
context_parts.append(f"""CACHED CATEGORIES ({cache_stats['total_categories']} total, showing top 15):
These are established categories from previous mailboxes. PREFER consolidating to these
when semantically appropriate to maintain cross-mailbox consistency.
{cache_list}""")
if inbox_context:
context_parts.append(f"INBOX CONTEXT: {inbox_context}")
context_parts.append(f"INBOX CONTEXT: {inbox_context}") context_parts.append(f"INBOX CONTEXT: {inbox_context}")