Add LLM-driven cache evolution - selective category persistence

LLM now decides which new categories should be added to persistent cache
for future mailbox runs vs temporary (run-only) categories.

ENHANCED LLM REVIEW:
- New field: "cache_worthy" (true/false) for each "new" category
- LLM judges: "Is this category useful across different mailboxes?"
- Examples:
  - "Customer Support" → cache_worthy: true (universal)
  - "Project X Updates" → cache_worthy: false (mailbox-specific)

CACHE EVOLUTION:
- cache_worthy=true → Added to persistent cache for future runs
- cache_worthy=false → Used for current run only, not cached
- First run (empty cache) → All categories treated as cache-worthy
- LLM reasoning logged for transparency

INTELLIGENT GROWTH:
- Cache grows organically with high-quality, reusable categories
- Prevents pollution with mailbox-specific categories
- Maintains cross-mailbox consistency while allowing natural evolution
- LLM balances: consistency (snap existing) vs expansion (add worthy)

SINGLE LLM CALL EFFICIENCY:
- Same ~4 second LLM call now handles:
  1. Snap vs new decision
  2. Cache persistence decision
  3. Reasoning for both
- No additional overhead for cache evolution

Result: Cache evolves intelligently over time, collecting universally
useful categories while filtering out temporary/specific ones.
This commit is contained in:
FSSCoding 2025-10-23 15:36:51 +11:00
parent eab378409e
commit fa09d14e52
2 changed files with 51 additions and 17 deletions

View File

@@ -74,7 +74,7 @@ class CategoryCache:
allow_new: bool = True, allow_new: bool = True,
max_new: int = 3, max_new: int = 3,
llm_review_threshold: float = 0.5 llm_review_threshold: float = 0.5
) -> Tuple[Dict[str, str], Dict[str, str]]: ) -> Tuple[Dict[str, str], Dict[str, str], List[Tuple[str, str]]]:
""" """
Snap discovered categories to cached ones using multi-stage matching. Snap discovered categories to cached ones using multi-stage matching.
@@ -93,13 +93,15 @@
llm_review_threshold: Min score to trigger LLM review (default: 0.5) llm_review_threshold: Min score to trigger LLM review (default: 0.5)
Returns: Returns:
(snapped_categories, mapping) where: (snapped_categories, mapping, cache_worthy) where:
- snapped_categories: Final category set (from cache + new) - snapped_categories: Final category set (from cache + new)
- mapping: {discovered_name: final_name} for all discovered - mapping: {discovered_name: final_name} for all discovered
- cache_worthy: List of (name, desc) for categories to add to persistent cache
""" """
if not self.cache: if not self.cache:
logger.info("Empty cache, using all discovered categories") logger.info("Empty cache, using all discovered categories")
return discovered, {name: name for name in discovered} # First run - all categories can be cache-worthy
return discovered, {name: name for name in discovered}, list(discovered.items())
snapped = {} snapped = {}
mapping = {} mapping = {}
@@ -139,6 +141,8 @@
logger.warning(f"Force review: '{disc_name}' (max_new exceeded, score: {best_score:.2f})") logger.warning(f"Force review: '{disc_name}' (max_new exceeded, score: {best_score:.2f})")
# LLM Review for ambiguous cases # LLM Review for ambiguous cases
cache_worthy_additions = [] # Track categories to add to persistent cache
if ambiguous_cases and self.llm_provider: if ambiguous_cases and self.llm_provider:
logger.info(f"Requesting LLM review for {len(ambiguous_cases)} ambiguous cases...") logger.info(f"Requesting LLM review for {len(ambiguous_cases)} ambiguous cases...")
llm_decisions = self._llm_review_ambiguous(ambiguous_cases, cached_cats, allow_new, len(new_categories), max_new) llm_decisions = self._llm_review_ambiguous(ambiguous_cases, cached_cats, allow_new, len(new_categories), max_new)
@@ -149,14 +153,25 @@
mapping[disc_name] = target mapping[disc_name] = target
if target not in snapped: if target not in snapped:
snapped[target] = cached_cats[target] snapped[target] = cached_cats[target]
logger.info(f"LLM snap: '{disc_name}' → '{target}'") reasoning = decision.get('reasoning', 'similar to existing')
logger.info(f"LLM snap: '{disc_name}' → '{target}' ({reasoning})")
elif decision['action'] == 'new': elif decision['action'] == 'new':
# Find original description # Find original description
disc_desc = next(desc for name, desc, _, _ in ambiguous_cases if name == disc_name) disc_desc = next(desc for name, desc, _, _ in ambiguous_cases if name == disc_name)
new_categories.append((disc_name, disc_desc)) new_categories.append((disc_name, disc_desc))
mapping[disc_name] = disc_name mapping[disc_name] = disc_name
snapped[disc_name] = disc_desc snapped[disc_name] = disc_desc
logger.info(f"LLM approved new: '{disc_name}'")
# Check if LLM recommends adding to cache
is_cache_worthy = decision.get('cache_worthy', False)
reasoning = decision.get('reasoning', 'new category')
if is_cache_worthy:
cache_worthy_additions.append((disc_name, disc_desc))
logger.info(f"LLM approved new + CACHE: '{disc_name}' ({reasoning})")
else:
logger.info(f"LLM approved new (run-only): '{disc_name}' ({reasoning})")
elif ambiguous_cases: elif ambiguous_cases:
# No LLM available → use heuristic fallback # No LLM available → use heuristic fallback
@@ -175,8 +190,8 @@
if best_match not in snapped: if best_match not in snapped:
snapped[best_match] = cached_cats[best_match] snapped[best_match] = cached_cats[best_match]
logger.info(f"Snapping result: {len(snapped)} final categories ({len(new_categories)} new)") logger.info(f"Snapping result: {len(snapped)} final categories ({len(new_categories)} new, {len(cache_worthy_additions)} cache-worthy)")
return snapped, mapping return snapped, mapping, cache_worthy_additions
def _find_best_match( def _find_best_match(
self, self,
@@ -310,23 +325,34 @@ CONTEXT:
TASK: TASK:
For each ambiguous case, decide: For each ambiguous case, decide:
1. "snap" - If semantically similar enough to cached category (even if not perfect match) 1. "snap" - If semantically similar enough to cached category
2. "new" - If genuinely distinct and worth adding (only if slots available) 2. "new" - If genuinely distinct and worth adding for THIS RUN
ADDITIONALLY:
For "new" categories, decide if they should be added to the CACHE for future mailboxes:
- "cache_worthy": true - High-quality, reusable category (e.g., "Customer Support", "Sales")
- "cache_worthy": false - Mailbox-specific, not broadly useful (e.g., "Project X Updates")
GUIDELINES: GUIDELINES:
- PREFER snapping to maintain consistency across mailboxes - PREFER snapping to maintain consistency across mailboxes
- Only approve "new" if category serves a clearly distinct purpose - Only approve "new" if category serves a clearly distinct purpose
- Consider: Will users benefit from separating this vs merging with existing? - Be VERY selective with cache_worthy - only approve universally useful categories
- Be conservative with "new" - consolidation is better than fragmentation - Consider: Would this category be useful across different users' mailboxes?
- Mailbox-specific categories can be "new" without being cache_worthy
Return JSON: Return JSON:
{{ {{
"CategoryName": {{"action": "snap"|"new", "target": "CachedCategoryName"}}, "CategoryName": {{
"action": "snap"|"new",
"target": "CachedCategoryName",
"cache_worthy": true|false,
"reasoning": "brief explanation"
}},
... ...
}} }}
For "snap": target = cached category to snap to For "snap": target = cached category to snap to, cache_worthy = false (not applicable)
For "new": target = same as CategoryName (keeps original) For "new": target = same as CategoryName, cache_worthy = true/false based on reusability
JSON: JSON:
""" """

View File

@@ -105,7 +105,7 @@ class CalibrationAnalyzer:
max_new = self.config.get('cache_max_new', 3) max_new = self.config.get('cache_max_new', 3)
logger.info(f"Snapping to cached categories (threshold={similarity_threshold}, allow_new={allow_new}, max_new={max_new})") logger.info(f"Snapping to cached categories (threshold={similarity_threshold}, allow_new={allow_new}, max_new={max_new})")
final_categories, snap_mapping = self.category_cache.snap_to_cache( final_categories, snap_mapping, cache_worthy = self.category_cache.snap_to_cache(
discovered_categories, discovered_categories,
similarity_threshold=similarity_threshold, similarity_threshold=similarity_threshold,
allow_new=allow_new, allow_new=allow_new,
@@ -120,11 +120,19 @@
logger.info(f"After cache snap: {len(final_categories)} categories") logger.info(f"After cache snap: {len(final_categories)} categories")
discovered_categories = final_categories discovered_categories = final_categories
# Update cache with usage counts # Update cache with usage counts AND add cache-worthy new categories
category_counts = {} category_counts = {}
for _, cat in email_labels: for _, cat in email_labels:
category_counts[cat] = category_counts.get(cat, 0) + 1 category_counts[cat] = category_counts.get(cat, 0) + 1
self.category_cache.update_cache(discovered_categories, category_counts)
# Add cache-worthy categories to persistent cache
if cache_worthy:
cache_additions = {name: desc for name, desc in cache_worthy}
logger.info(f"Adding {len(cache_worthy)} LLM-approved categories to persistent cache: {list(cache_additions.keys())}")
self.category_cache.update_cache(cache_additions, category_counts)
else:
# Just update usage counts for existing categories
self.category_cache.update_cache(discovered_categories, category_counts)
return discovered_categories, email_labels return discovered_categories, email_labels