email-sorter/src/classification/llm_classifier.py
FSSCoding 10862583ad Add batch LLM classifier tool with prompt caching optimization
- Created standalone batch_llm_classifier.py for custom email queries
- Optimized all LLM prompts for caching (static instructions first, variables last)
- Configured rtx3090 vLLM endpoint (qwen3-coder-30b)
- Tested batch_size=4 optimal (100% success, 4.65 req/sec)
- Added comprehensive documentation (tools/README.md, BATCH_LLM_QUICKSTART.md)

Tool is completely separate from main ML pipeline - no interference.
Prerequisite: vLLM server must be running at rtx3090.bobai.com.au
2025-11-14 16:01:57 +11:00

184 lines
5.9 KiB
Python

"""LLM-based email classifier."""
import logging
import json
import re
from typing import Dict, List, Any, Optional
from src.llm.base import BaseLLMProvider
logger = logging.getLogger(__name__)
class LLMClassifier:
    """
    Email classifier using LLM for uncertain cases.
    Usage:
    - Only called for emails with low ML confidence
    - Batches emails for efficiency
    - Gracefully degrades if LLM unavailable
    """

    def __init__(
        self,
        provider: "BaseLLMProvider",  # forward ref; imported at module level
        categories: Dict[str, Dict],
        config: Dict[str, Any]
    ):
        """Initialize LLM classifier.

        Args:
            provider: LLM backend; must expose ``is_available()`` and ``complete()``.
            categories: Mapping of category name -> metadata dict (the
                ``description`` key is used when building the prompt).
            config: App config; ``config['llm']['temperature']`` and
                ``config['llm']['max_tokens']`` are read with defaults 0.1 / 500.
        """
        self.provider = provider
        self.categories = categories
        self.config = config
        # Probe availability once at construction; classify() falls back to
        # the ML prediction whenever the provider is down.
        self.llm_available = provider.is_available()
        if not self.llm_available:
            logger.warning("LLM provider not available, LLM classification will be disabled")
        self.classification_prompt = self._load_prompt_template()

    def _load_prompt_template(self) -> str:
        """Load the classification prompt from disk, or return the built-in default.

        Returns:
            A ``str.format``-style template with ``{categories}``, ``{subject}``,
            ``{sender}``, ``{has_attachments}``, ``{body_snippet}``,
            ``{ml_prediction}`` and ``{ml_confidence}`` placeholders.
        """
        # Try to load from file. Explicit encoding avoids platform-dependent
        # decoding of the prompt file (fix: open() previously had no encoding).
        try:
            with open('prompts/classification.txt', 'r', encoding='utf-8') as f:
                return f.read()
        except FileNotFoundError:
            pass
        # Default prompt - optimized for caching (static instructions first).
        # The doubled braces escape literal JSON braces for str.format().
        return """You are an expert email classifier. Analyze the email and classify it.
INSTRUCTIONS:
- Review the email content and available categories below
- Select the single most appropriate category
- Provide confidence score (0.0 to 1.0)
- Give brief reasoning for your classification
OUTPUT FORMAT:
Respond with ONLY valid JSON (no markdown, no extra text):
{{
"category": "category_name",
"confidence": 0.95,
"reasoning": "brief reason"
}}
CATEGORIES:
{categories}
EMAIL TO CLASSIFY:
Subject: {subject}
From: {sender}
Has Attachments: {has_attachments}
Body (first 300 chars): {body_snippet}
ML Prediction: {ml_prediction} (confidence: {ml_confidence:.2f})
"""

    def classify(self, email: Dict[str, Any]) -> Dict[str, Any]:
        """
        Classify email using LLM.
        Args:
            email: Email data with subject, sender, body_snippet, ml_prediction
        Returns:
            Classification result with category, confidence, reasoning and a
            ``method`` key ('llm', 'ml_fallback', or 'llm_error'). Never raises;
            provider errors are converted into an 'llm_error' result.
        """
        # Graceful degradation: echo the ML prediction when no LLM is up.
        if not self.llm_available:
            logger.warning("LLM not available, returning ML prediction")
            return {
                'category': email.get('ml_prediction', {}).get('category', 'unknown'),
                'confidence': 0.5,
                'reasoning': 'LLM not available, using ML prediction',
                'method': 'ml_fallback'
            }
        try:
            # Build prompt: one "- name: description" line per category.
            categories_str = "\n".join([
                f"- {name}: {info.get('description', 'N/A')}"
                for name, info in self.categories.items()
            ])
            ml_pred = email.get('ml_prediction', {})
            # Truncate fields so the prompt stays small and cache-friendly.
            prompt = self.classification_prompt.format(
                categories=categories_str,
                subject=email.get('subject', 'N/A')[:100],
                sender=email.get('sender', 'N/A')[:50],
                has_attachments=email.get('has_attachments', False),
                body_snippet=email.get('body_snippet', '')[:300],
                ml_prediction=ml_pred.get('category', 'unknown'),
                ml_confidence=ml_pred.get('confidence', 0.0)
            )
            logger.debug("LLM classifying: %s", email.get('subject', 'No subject')[:50])
            # Get LLM response (low temperature for deterministic labels).
            response = self.provider.complete(
                prompt,
                temperature=self.config.get('llm', {}).get('temperature', 0.1),
                max_tokens=self.config.get('llm', {}).get('max_tokens', 500)
            )
            # Parse response
            result = self._parse_response(response)
            result['method'] = 'llm'
            return result
        except Exception as e:
            # Any provider/formatting failure degrades to an explicit error
            # result rather than propagating into the pipeline.
            logger.error("LLM classification failed: %s", e)
            return {
                'category': 'unknown',
                'confidence': 0.5,
                'reasoning': f'LLM error: {str(e)[:100]}',
                'method': 'llm_error',
                'error': True
            }

    def classify_batch(self, emails: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Classify batch of emails (individually for now, can optimize later)."""
        return [self.classify(email) for email in emails]

    def _parse_response(self, response: str) -> Dict[str, Any]:
        """Parse LLM JSON response.

        Extracts the first ``{...}`` span, parses it as JSON, and clamps the
        reported confidence into [0.0, 1.0] (fix: models occasionally emit
        out-of-range values). Falls back to a low-confidence 'unknown' result
        when no parseable JSON is found.
        """
        try:
            # Try to extract JSON block (greedy, so trailing '}' noise is risky
            # but nested braces in "reasoning" survive).
            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            if json_match:
                parsed = json.loads(json_match.group())
                confidence = float(parsed.get('confidence', 0.5))
                return {
                    'category': parsed.get('category', 'unknown'),
                    'confidence': min(1.0, max(0.0, confidence)),
                    'reasoning': parsed.get('reasoning', '')
                }
        except json.JSONDecodeError as e:
            logger.debug("JSON parsing error: %s", e)
        except Exception as e:
            logger.debug("Response parsing error: %s", e)
        # Fallback parsing - surface the raw response as the "reasoning".
        logger.warning("Failed to parse LLM response, using fallback parsing")
        logger.debug("Response was: %s", response[:200])
        return {
            'category': 'unknown',
            'confidence': 0.5,
            'reasoning': response[:100]
        }

    def get_status(self) -> Dict[str, Any]:
        """Get classifier status (availability, provider name, category count)."""
        return {
            'llm_available': self.llm_available,
            'provider': self.provider.name if self.provider else 'none',
            'categories': len(self.categories),
            'status': 'ready' if self.llm_available else 'degraded'
        }