From 8f25e30f52c6a24d81a40a04c963f6c30bf187cf Mon Sep 17 00:00:00 2001 From: FSSCoding Date: Fri, 28 Nov 2025 13:07:27 +1100 Subject: [PATCH] Rewrite CLAUDE.md and clean project structure - Rewrote CLAUDE.md with comprehensive development guide - Archived 20 old docs to docs/archive/ - Added PROJECT_ROADMAP_2025.md with research learnings - Added CLASSIFICATION_METHODS_COMPARISON.md - Added SESSION_HANDOVER_20251128.md - Added tools for analysis (brett_gmail/microsoft analyzers) - Updated .gitignore for archive folders - Config changes for local vLLM endpoint --- .gitignore | 13 +- CLAUDE.md | 645 ++- config/default_config.yaml | 7 +- docs/BUILD_INSTRUCTIONS.md | 1298 ----- docs/CLASSIFICATION_METHODS_COMPARISON.md | 518 ++ docs/COMPLETION_ASSESSMENT.md | 526 -- docs/COMPREHENSIVE_PROJECT_OVERVIEW.md | 5357 --------------------- docs/CURRENT_WORK_SUMMARY.md | 232 - docs/FAST_ML_ONLY_WORKFLOW.html | 527 -- docs/LABEL_TRAINING_PHASE_DETAIL.html | 564 --- docs/MODEL_INFO.md | 129 - docs/NEXT_STEPS.md | 437 -- docs/PROJECT_BLUEPRINT.md | 1063 ---- docs/PROJECT_COMPLETE.md | 566 --- docs/PROJECT_ROADMAP_2025.md | 479 ++ docs/PROJECT_STATUS.md | 402 -- docs/PROJECT_STATUS_AND_NEXT_STEPS.html | 648 --- docs/REPORT_FORMAT.md | 232 + docs/RESEARCH_FINDINGS.md | 419 -- docs/ROOT_CAUSE_ANALYSIS.md | 319 -- docs/SESSION_HANDOVER_20251128.md | 128 + docs/START_HERE.md | 324 -- docs/SYSTEM_FLOW.html | 493 -- docs/VERIFY_CATEGORIES_FEATURE.html | 357 -- docs/WORKFLOW_DIAGRAM.md | 255 - docs/chat-gippity-research.md | 126 - src/cli.py | 36 +- src/llm/openai_compat.py | 12 +- tools/batch_llm_classifier.py | 364 ++ tools/brett_gmail_analyzer.py | 391 ++ tools/brett_microsoft_analyzer.py | 500 ++ tools/generate_html_report.py | 642 +++ 32 files changed, 3592 insertions(+), 14417 deletions(-) delete mode 100644 docs/BUILD_INSTRUCTIONS.md create mode 100644 docs/CLASSIFICATION_METHODS_COMPARISON.md delete mode 100644 docs/COMPLETION_ASSESSMENT.md delete mode 100644 docs/COMPREHENSIVE_PROJECT_OVERVIEW.md delete mode 100644 docs/CURRENT_WORK_SUMMARY.md delete mode 100644 docs/FAST_ML_ONLY_WORKFLOW.html delete mode 100644 docs/LABEL_TRAINING_PHASE_DETAIL.html delete mode 100644 docs/MODEL_INFO.md delete mode 100644 docs/NEXT_STEPS.md delete mode 100644 docs/PROJECT_BLUEPRINT.md delete mode 100644 docs/PROJECT_COMPLETE.md create mode 100644 docs/PROJECT_ROADMAP_2025.md delete mode 100644 docs/PROJECT_STATUS.md delete mode 100644 docs/PROJECT_STATUS_AND_NEXT_STEPS.html create mode 100644 docs/REPORT_FORMAT.md delete mode 100644 docs/RESEARCH_FINDINGS.md delete mode 100644 docs/ROOT_CAUSE_ANALYSIS.md create mode 100644 docs/SESSION_HANDOVER_20251128.md delete mode 100644 docs/START_HERE.md delete mode 100644 docs/SYSTEM_FLOW.html delete mode 100644 docs/VERIFY_CATEGORIES_FEATURE.html delete mode 100644 docs/WORKFLOW_DIAGRAM.md delete mode 100644 docs/chat-gippity-research.md create mode 100755 tools/batch_llm_classifier.py create mode 100644 tools/brett_gmail_analyzer.py create mode 100644 tools/brett_microsoft_analyzer.py create mode 100644 tools/generate_html_report.py diff --git a/.gitignore b/.gitignore index 7497f42..25a433f 100644 --- a/.gitignore +++ b/.gitignore @@ -72,8 +72,17 @@ ml_only_test/ results_*/ phase1_*/ -# Python scripts (experimental/research) +# Python scripts (experimental/research - not in src/tests/tools) *.py !src/**/*.py !tests/**/*.py -!setup.py \ No newline at end of file +!tools/**/*.py +!setup.py + +# Archive folders (historical content) +archive/ +docs/archive/ + +# Data folders 
(user-specific content) +data/Bruce emails/ +data/emails-for-link/ \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md index f67d924..4a02256 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,377 +1,304 @@ -# Email Sorter - Claude Development Guide +# Email Sorter - Development Guide -This document provides essential context for Claude (or other AI assistants) working on this project. +## What This Tool Does -## Project Overview - -**Email Sorter** is a hybrid ML/LLM email classification system designed to process large email backlogs (10k-100k+ emails) with high speed and accuracy. - -### Current MVP Status - -**✅ PROVEN WORKING** - 10,000 emails classified in ~24 seconds with 72.7% accuracy - -**Core Features:** -- LLM-driven category discovery (no hardcoded categories) -- ML model training on discovered categories (LightGBM) -- Fast pure-ML classification with `--no-llm-fallback` -- Category verification for new mailboxes with `--verify-categories` -- Batched embedding extraction (512 emails/batch) -- Multiple email provider support (Gmail, Outlook, IMAP, Enron) - -## Architecture - -### Three-Tier Classification Pipeline +**Email Sorter is a TRIAGE tool** that sorts emails into buckets for downstream processing. It is NOT a complete email management solution - it's one part of a larger ecosystem. ``` -Email → Rules Check → ML Classifier → LLM Fallback (optional) - ↓ ↓ ↓ - Definite High Confidence Low Confidence - (5-10%) (70-80%) (10-20%) +Raw Inbox (10k+) --> Email Sorter --> Categorized Buckets --> Specialized Tools + (this tool) (output) (other tools) ``` -### Key Technologies +--- -- **ML Model**: LightGBM (1.8MB, 11 categories, 28 threads) -- **Embeddings**: all-minilm:l6-v2 via Ollama (384-dim, universal) -- **LLM**: qwen3:4b-instruct-2507-q8_0 via Ollama (calibration only) -- **Feature Extraction**: Embeddings + TF-IDF + pattern detection -- **Thresholds**: 0.55 (optimized from 0.75 to reduce LLM fallback) +## Quick Start -### Performance Metrics +```bash +cd /MASTERFOLDER/Tools/email-sorter +source venv/bin/activate -| Emails | Time | Accuracy | LLM Calls | Throughput | -|--------|------|----------|-----------|------------| -| 10,000 | 24s | 72.7% | 0 | 423/sec | -| 10,000 | 5min | 92.7% | 2,100 | 33/sec | +# Classify emails with ML + LLM fallback +python -m src.cli run --source local \ + --directory "/path/to/emails" \ + --output "/path/to/output" \ + --force-ml --llm-provider openai + +# Generate HTML report from results +python tools/generate_html_report.py --input /path/to/results.json +``` + +--- + +## Key Documentation + +| Document | Purpose | Location | +|----------|---------|----------| +| **PROJECT_ROADMAP_2025.md** | Master learnings, research findings, development roadmap | `docs/` | +| **CLASSIFICATION_METHODS_COMPARISON.md** | ML vs LLM vs Agent comparison | `docs/` | +| **REPORT_FORMAT.md** | HTML report documentation | `docs/` | +| **BATCH_LLM_QUICKSTART.md** | Quick LLM batch processing guide | root | + +--- + +## Research Findings Summary + +### Dataset Size Routing + +| Size | Best Method | Why | +|------|-------------|-----| +| <500 | Agent-only | ML overhead exceeds benefit | +| 500-5000 | Agent pre-scan + ML | Discovery improves accuracy | +| >5000 | ML pipeline | Speed critical | + +### Research Results + +| Dataset | Type | ML-Only | ML+LLM | Agent | +|---------|------|---------|--------|-------| +| brett-gmail (801) | Personal | 54.9% | 93.3% | 99.8% | +| brett-microsoft (596) | Business | - | - | 98.2% | + +### Key Insight: Inbox Character 
Matters + +| Type | Pattern | Approach | +|------|---------|----------| +| **Personal** | Subscriptions, marketing (40-50% automated) | Sender domain first | +| **Business** | Client work, operations (60-70% professional) | Sender + Subject context | + +--- ## Project Structure ``` email-sorter/ -├── src/ -│ ├── cli.py # Main CLI interface -│ ├── classification/ # Classification pipeline -│ │ ├── adaptive_classifier.py # Rules → ML → LLM orchestration -│ │ ├── ml_classifier.py # LightGBM classifier -│ │ ├── llm_classifier.py # LLM fallback -│ │ └── feature_extractor.py # Batched embedding extraction -│ ├── calibration/ # LLM-driven calibration -│ │ ├── workflow.py # Calibration orchestration -│ │ ├── llm_analyzer.py # Batch category discovery (20 emails/call) -│ │ ├── trainer.py # ML model training -│ │ └── category_verifier.py # Category verification -│ ├── email_providers/ # Email source connectors -│ │ ├── gmail.py # Gmail API (OAuth 2.0) -│ │ ├── outlook.py # Microsoft Graph API (OAuth 2.0) -│ │ ├── imap.py # IMAP protocol -│ │ └── enron.py # Enron dataset (testing) -│ ├── llm/ # LLM provider interfaces -│ │ ├── ollama.py # Ollama provider -│ │ └── openai_compat.py # OpenAI-compatible provider -│ └── models/ # Trained models -│ ├── calibrated/ # User-calibrated models -│ │ └── classifier.pkl # Current trained model (1.8MB) -│ └── pretrained/ # Default models -├── config/ -│ ├── default_config.yaml # System defaults -│ ├── categories.yaml # Category definitions (thresholds: 0.55) -│ └── llm_models.yaml # LLM configuration -├── credentials/ # Email provider credentials (gitignored) -│ ├── gmail/ # Gmail OAuth (3 accounts) -│ ├── outlook/ # Outlook OAuth (3 accounts) -│ └── imap/ # IMAP credentials (3 accounts) -├── docs/ # Documentation -├── scripts/ # Utility scripts -└── logs/ # Log files (gitignored) +├── CLAUDE.md # THIS FILE +├── README.md # General readme +├── BATCH_LLM_QUICKSTART.md # LLM batch processing +│ +├── src/ # Source code +│ ├── cli.py # Main entry point +│ ├── classification/ # ML/LLM classification +│ ├── calibration/ # Model training, email parsing +│ ├── email_providers/ # Gmail, Outlook, IMAP, Local +│ └── llm/ # LLM providers +│ +├── tools/ # Utility scripts +│ ├── brett_gmail_analyzer.py # Personal inbox template +│ ├── brett_microsoft_analyzer.py # Business inbox template +│ ├── generate_html_report.py # HTML report generator +│ └── batch_llm_classifier.py # Batch LLM classification +│ +├── config/ # Configuration +│ ├── default_config.yaml # LLM endpoints, thresholds +│ └── categories.yaml # Category definitions +│ +├── docs/ # Current documentation +│ ├── PROJECT_ROADMAP_2025.md +│ ├── CLASSIFICATION_METHODS_COMPARISON.md +│ ├── REPORT_FORMAT.md +│ └── archive/ # Old docs (historical) +│ +├── data/ # Analysis outputs (gitignored) +│ ├── brett_gmail_analysis.json +│ └── brett_microsoft_analysis.json +│ +├── credentials/ # OAuth/API creds (gitignored) +├── results/ # Classification outputs (gitignored) +├── archive/ # Old scripts (gitignored) +├── maildir/ # Enron test data +└── venv/ # Python environment ``` -## Critical Implementation Details - -### 1. Batched Embedding Extraction (CRITICAL!) 
- -**ALWAYS use batched feature extraction:** - -```python -# ✅ CORRECT - Batched (150x faster) -all_features = feature_extractor.extract_batch(emails, batch_size=512) -for email, features in zip(emails, all_features): - result = adaptive_classifier.classify_with_features(email, features) - -# ❌ WRONG - Sequential (extremely slow) -for email in emails: - result = adaptive_classifier.classify(email) # Extracts features one-at-a-time -``` - -**Why this matters:** -- Sequential: 10,000 emails × 15ms = 150 seconds just for embeddings -- Batched: 20 batches × 1s = 20 seconds for embeddings -- **150x performance difference** - -### 2. Model Paths - -**The model exists in TWO locations:** -- `src/models/calibrated/classifier.pkl` - Created during calibration (authoritative) -- `src/models/pretrained/classifier.pkl` - Loaded by default (copy of calibrated) - -**When calibration runs:** -1. Saves model to `calibrated/classifier.pkl` -2. MLClassifier loads from `pretrained/classifier.pkl` by default -3. Need to copy or update path - -**Current status:** Both paths have the same 1.8MB model (Oct 25 02:54) - -### 3. LLM-Driven Calibration - -**NOT hardcoded categories** - categories are discovered by LLM: - -```python -# Calibration process: -1. Sample 300 emails (3% of 10k) -2. Batch process in groups of 20 emails -3. LLM discovers categories (not predefined) -4. LLM labels each email -5. Train LightGBM on discovered categories -``` - -**Result:** 11 categories discovered from Enron dataset: -- Updates, Work, Meetings, External, Financial, Test, Administrative, Operational, Technical, Urgent, Requests - -### 4. Threshold Optimization - -**Default threshold: 0.55** (reduced from 0.75) - -**Impact:** -- 0.75 threshold: 35% LLM fallback -- 0.55 threshold: 21% LLM fallback -- **40% reduction in LLM usage** - -All category thresholds in `config/categories.yaml` set to 0.55. - -### 5. Email Provider Credentials - -**Multi-account support:** 3 accounts per provider type - -**Credential files:** -``` -credentials/ -├── gmail/ -│ ├── account1.json # Gmail OAuth credentials -│ ├── account2.json -│ └── account3.json -├── outlook/ -│ ├── account1.json # Outlook OAuth credentials -│ ├── account2.json -│ └── account3.json -└── imap/ - ├── account1.json # IMAP username/password - ├── account2.json - └── account3.json -``` - -**Security:** All `*.json` files in `credentials/` are gitignored (only `.example` files tracked). - -## Common Commands - -### Development - -```bash -# Activate virtual environment -source venv/bin/activate - -# Run classification (Enron dataset) -python -m src.cli run --source enron --limit 10000 --output results/ - -# Pure ML (no LLM fallback) - FAST -python -m src.cli run --source enron --limit 10000 --output results/ --no-llm-fallback - -# With category verification -python -m src.cli run --source enron --limit 10000 --output results/ --verify-categories - -# Gmail -python -m src.cli run --source gmail --credentials credentials/gmail/account1.json --limit 1000 - -# Outlook -python -m src.cli run --source outlook --credentials credentials/outlook/account1.json --limit 1000 -``` - -### Training - -```bash -# Force recalibration (clears cached model) -rm -rf src/models/calibrated/ src/models/pretrained/ -python -m src.cli run --source enron --limit 10000 --output results/ -``` - -## Code Patterns - -### Adding New Features - -1. **Update CLI** ([src/cli.py](src/cli.py)): - - Add click options - - Pass to appropriate modules - -2. 
**Update Classifier** ([src/classification/adaptive_classifier.py](src/classification/adaptive_classifier.py)): - - Add methods following existing pattern - - Use `classify_with_features()` for batched processing - -3. **Update Feature Extractor** ([src/classification/feature_extractor.py](src/classification/feature_extractor.py)): - - Always support batching (`extract_batch()`) - - Keep `extract()` for backward compatibility - -### Testing - -```bash -# Test imports -python -c "from src.cli import cli; print('OK')" - -# Test providers -python -c "from src.email_providers.gmail import GmailProvider; from src.email_providers.outlook import OutlookProvider; print('OK')" - -# Test classification -python -m src.cli run --source enron --limit 100 --output test/ -``` - -## Performance Optimization - -### Current Bottlenecks - -1. **Embedding generation** - 20s for 10k emails (batched) - - Optimized with batch_size=512 - - Could use local sentence-transformers for 5-10x speedup - -2. **Email parsing** - 0.5s for 10k emails (fast) - -3. **ML inference** - 0.7s for 10k emails (very fast) - -### Optimization Opportunities - -1. **Local embeddings** - Replace Ollama API with sentence-transformers - - Current: 20 API calls, ~20 seconds - - With local: Direct GPU, ~2-5 seconds - - Trade-off: More dependencies, larger memory footprint - -2. **Embedding cache** - Pre-compute and cache to disk - - One-time cost: 20 seconds - - Subsequent runs: 2-3 seconds to load from disk - - Perfect for development/testing - -3. **Larger batches** - Tested 512, 1024, 2048 - - 512: 23.6s (chosen for balance) - - 1024: 22.1s (6.6% faster) - - 2048: 21.9s (7.5% faster, diminishing returns) - -## Known Issues - -### 1. Background Processes - -There are stale background bash processes from previous sessions: -- These can be safely ignored -- Do NOT try to kill them (per user's CLAUDE.md instructions) - -### 2. Model Path Confusion - -- Calibration saves to `src/models/calibrated/` -- Default loads from `src/models/pretrained/` -- Both currently have the same model (synced) - -### 3. Category Cache - -- `src/models/category_cache.json` stores discovered categories -- Can become polluted if different datasets used -- Clear with `rm src/models/category_cache.json` if issues - -## Dependencies - -### Required - -```bash -pip install click pyyaml lightgbm numpy scikit-learn ollama -``` - -### Email Providers - -```bash -# Gmail -pip install google-api-python-client google-auth-oauthlib google-auth-httplib2 - -# Outlook -pip install msal requests - -# IMAP - no additional dependencies (Python stdlib) -``` - -### Optional - -```bash -# For faster local embeddings -pip install sentence-transformers - -# For development -pip install pytest black mypy -``` - -## Git Workflow - -### What's Gitignored - -- `credentials/` (except `.example` files) -- `logs/` -- `results/` -- `src/models/calibrated/` (trained models) -- `*.log` -- `debug_*.txt` -- Test directories - -### What's Tracked - -- All source code -- Configuration files -- Documentation -- Example credential files -- Pretrained model (if present) - -## Important Notes for AI Assistants - -1. **NEVER create files unless necessary** - Always prefer editing existing files - -2. **ALWAYS use batching** - Feature extraction MUST be batched (512 emails/batch) - -3. **Read before writing** - Use Read tool before any Edit operations - -4. **Verify paths** - Model paths can be confusing (calibrated vs pretrained) - -5. **No emoji in commits** - Per user's CLAUDE.md preferences - -6. 
**Test before committing** - Verify imports and CLI work - -7. **Security** - Never commit actual credentials, only `.example` files - -8. **Performance matters** - 10x performance differences are common, always batch - -9. **LLM is optional** - System works without LLM (pure ML mode with --no-llm-fallback) - -10. **Categories are dynamic** - They're discovered by LLM, not hardcoded - -## Recent Changes (Last Session) - -1. **Fixed embedding bottleneck** - Changed from sequential to batched feature extraction (10x speedup) -2. **Added Outlook provider** - Full Microsoft Graph API integration -3. **Added credentials system** - Support for 3 accounts per provider type -4. **Optimized thresholds** - Reduced from 0.75 to 0.55 (40% less LLM usage) -5. **Added category verifier** - Optional single LLM call to verify model fit -6. **Project reorganization** - Clean docs/, scripts/, logs/ structure - -## Next Steps (Roadmap) - -See [docs/PROJECT_STATUS_AND_NEXT_STEPS.html](docs/PROJECT_STATUS_AND_NEXT_STEPS.html) for complete roadmap. - -**Immediate priorities:** -1. Test Gmail provider with real credentials -2. Test Outlook provider with real credentials -3. Implement email syncing (apply labels back to mailbox) -4. Add incremental classification (process only new emails) -5. Create web dashboard for results visualization - --- -**Remember:** This is an MVP with proven performance. Don't over-engineer. Keep it fast and simple. +## Common Operations + +### 1. Classify Emails (ML Pipeline) + +```bash +source venv/bin/activate + +# With LLM fallback for low confidence +python -m src.cli run --source local \ + --directory "/path/to/emails" \ + --output "/path/to/output" \ + --force-ml --llm-provider openai + +# Pure ML (fastest, no LLM) +python -m src.cli run --source local \ + --directory "/path/to/emails" \ + --output "/path/to/output" \ + --force-ml --no-llm-fallback +``` + +### 2. Generate HTML Report + +```bash +python tools/generate_html_report.py --input /path/to/results.json +# Creates report.html in same directory +``` + +### 3. Manual Agent Analysis (Best Accuracy) + +For <1000 emails, agent analysis gives 98-99% accuracy: + +```bash +# Copy and customize analyzer template +cp tools/brett_gmail_analyzer.py tools/my_inbox_analyzer.py + +# Edit classify_email() function for your inbox patterns +# Update email_dir path +# Run +python tools/my_inbox_analyzer.py +``` + +### 4. 
Different Email Sources + +```bash +# Local .eml/.msg files +--source local --directory "/path/to/emails" + +# Gmail (OAuth) +--source gmail --credentials credentials/gmail/account1.json + +# Outlook (OAuth) +--source outlook --credentials credentials/outlook/account1.json + +# Enron test data +--source enron --limit 10000 +``` + +--- + +## Output Locations + +**Analysis reports are stored OUTSIDE this project:** + +``` +/home/bob/Documents/Email Manager/emails/ +├── brett-gmail/ # Source emails (untouched) +├── brett-gm-md/ # ML-only classification output +│ ├── results.json +│ ├── report.html +│ └── BRETT_GMAIL_ANALYSIS_REPORT.md +├── brett-gm-llm/ # ML+LLM classification output +│ ├── results.json +│ └── report.html +└── brett-ms-sorter/ # Microsoft inbox analysis + └── BRETT_MICROSOFT_ANALYSIS_REPORT.md +``` + +**Project data outputs (gitignored):** +``` +/MASTERFOLDER/Tools/email-sorter/data/ +├── brett_gmail_analysis.json +└── brett_microsoft_analysis.json +``` + +--- + +## Configuration + +### LLM Endpoint (config/default_config.yaml) + +```yaml +llm: + provider: "openai" + openai: + base_url: "http://localhost:11433/v1" # vLLM endpoint + api_key: "not-needed" + classification_model: "qwen3-coder-30b" +``` + +### Thresholds (config/categories.yaml) + +Default: 0.55 (reduced from 0.75 for 40% less LLM fallback) + +--- + +## Key Code Locations + +| Function | File | +|----------|------| +| CLI entry | `src/cli.py` | +| ML classifier | `src/classification/ml_classifier.py` | +| LLM classifier | `src/classification/llm_classifier.py` | +| Feature extraction | `src/classification/feature_extractor.py` | +| Email parsing | `src/calibration/local_file_parser.py` | +| OpenAI-compat LLM | `src/llm/openai_compat.py` | + +--- + +## Recent Changes (Nov 2025) + +1. **cli.py**: Added `--force-ml` flag, enriched results.json with metadata +2. **openai_compat.py**: Removed API key requirement for local vLLM +3. **default_config.yaml**: Changed to openai provider on localhost:11433 +4. **tools/**: Added brett_gmail_analyzer.py, brett_microsoft_analyzer.py, generate_html_report.py +5. **docs/**: Added PROJECT_ROADMAP_2025.md, CLASSIFICATION_METHODS_COMPARISON.md + +--- + +## Troubleshooting + +### "LLM endpoint not responding" +- Check vLLM running on localhost:11433 +- Verify model name in config matches running model + +### "Low accuracy (50-60%)" +- For <1000 emails, use agent analysis +- Dataset may differ from Enron training data + +### "Too many LLM calls" +- Use `--no-llm-fallback` for pure ML +- Increase threshold in categories.yaml + +--- + +## Development Notes + +### Virtual Environment Required +```bash +source venv/bin/activate +# ALWAYS activate before Python commands +``` + +### Batched Feature Extraction (CRITICAL) +```python +# CORRECT - Batched (150x faster) +all_features = feature_extractor.extract_batch(emails, batch_size=512) + +# WRONG - Sequential (extremely slow) +for email in emails: + result = classifier.classify(email) # Don't do this +``` + +### Model Paths +- `src/models/calibrated/` - Created during calibration +- `src/models/pretrained/` - Loaded by default + +--- + +## What's Gitignored + +- `credentials/` - OAuth tokens +- `results/`, `data/` - User data +- `archive/`, `docs/archive/` - Historical content +- `maildir/` - Enron test data (large) +- `enron_mail_20150507.tar.gz` - Source archive +- `venv/` - Python environment +- `*.log`, `logs/` - Log files + +--- + +## Philosophy + +1. **Triage, not management** - Sort into buckets for other tools +2. 
**Risk-based accuracy** - High for personal, acceptable errors for junk +3. **Speed matters** - 10k emails in <1 min +4. **Inbox character matters** - Business vs personal = different approaches +5. **Agent pre-scan adds value** - 10-15 min discovery improves everything + +--- + +*Last Updated: 2025-11-28* +*See docs/PROJECT_ROADMAP_2025.md for full research findings* diff --git a/config/default_config.yaml b/config/default_config.yaml index f907140..2a452b6 100644 --- a/config/default_config.yaml +++ b/config/default_config.yaml @@ -27,7 +27,7 @@ classification: conversational: 0.55 llm: - provider: "ollama" + provider: "openai" fallback_enabled: true ollama: @@ -41,9 +41,10 @@ llm: retry_attempts: 3 openai: - base_url: "https://rtx3090.bobai.com.au/v1" - api_key: "rtx3090_foxadmin_10_8034ecb47841f45ba1d5f3f5d875c092" + base_url: "http://localhost:11433/v1" + api_key: "not-needed" calibration_model: "qwen3-coder-30b" + consolidation_model: "qwen3-coder-30b" classification_model: "qwen3-coder-30b" temperature: 0.1 max_tokens: 500 diff --git a/docs/BUILD_INSTRUCTIONS.md b/docs/BUILD_INSTRUCTIONS.md deleted file mode 100644 index b044eeb..0000000 --- a/docs/BUILD_INSTRUCTIONS.md +++ /dev/null @@ -1,1298 +0,0 @@ -# EMAIL SORTER - BUILD INSTRUCTIONS - -**Step-by-Step Implementation Guide** - -Version: 1.0 -Date: 2024-10-21 - ---- - -## PREREQUISITES - -### Required Software -- Python 3.8 or higher -- Git -- Ollama (for local LLM) -- Text editor / IDE - -### Required Accounts -- Gmail account (for testing) -- Google Cloud Console project (for Gmail API) - -### Skills Needed -- Python programming -- Basic understanding of ML concepts -- Command line comfort -- OAuth 2.0 basics - ---- - -## IMPLEMENTATION ORDER - -Build in this exact order. Each phase depends on previous phases. - ---- - -## PHASE 1: PROJECT SETUP - -### Step 1: Initialize Git Repository - -```bash -cd C:\Users\BrettFox\Documents\Claude\email-sorter -git init -git add . 
-git commit -m "Initial commit - project blueprint" -``` - -### Step 2: Create Virtual Environment - -```bash -# Create venv -python -m venv venv - -# Activate (Windows) -venv\Scripts\activate - -# Activate (Linux/Mac) -source venv/bin/activate -``` - -### Step 3: Create requirements.txt - -Already exists, but verify contents: - -```txt -# Core -python-dotenv>=1.0.0 -pyyaml>=6.0 -pydantic>=2.0.0 - -# Email Providers -google-api-python-client>=2.100.0 -google-auth-httplib2>=0.1.1 -google-auth-oauthlib>=1.1.0 -msal>=1.24.0 -imapclient>=2.3.1 - -# Machine Learning -scikit-learn>=1.3.0 -xgboost>=2.0.0 -lightgbm>=4.0.0 -pandas>=2.0.0 -numpy>=1.24.0 - -# LLM Integration -ollama>=0.1.0 - -# Text Processing -nltk>=3.8 -beautifulsoup4>=4.12.0 -lxml>=4.9.0 - -# Utilities -tqdm>=4.66.0 -click>=8.1.0 -rich>=13.0.0 -joblib>=1.3.0 -tenacity>=8.2.0 - -# Testing -pytest>=7.4.0 -pytest-cov>=4.1.0 -pytest-mock>=3.11.0 -``` - -### Step 4: Install Dependencies - -```bash -pip install -r requirements.txt -``` - -### Step 5: Create .gitignore - -``` -# Python -__pycache__/ -*.py[cod] -*$py.class -*.so -.Python -env/ -venv/ -*.egg-info/ -dist/ -build/ - -# Data and Models -data/training/ -src/models/pretrained/*.pkl -*.h5 -*.joblib - -# Credentials -.env -credentials/ -*.json -!config/*.json - -# Logs -logs/*.log -*.log - -# IDE -.vscode/ -.idea/ -*.swp - -# OS -.DS_Store -Thumbs.db - -# Checkpoints -checkpoints/ -*.checkpoint - -# Results -results/ -output/ -``` - -### Step 6: Create Directory Structure - -```bash -# Create all directories -mkdir -p src/calibration -mkdir -p src/classification -mkdir -p src/models/pretrained -mkdir -p src/email_providers -mkdir -p src/processing -mkdir -p src/adjustment -mkdir -p src/export -mkdir -p src/utils -mkdir -p tests -mkdir -p prompts -mkdir -p scripts -mkdir -p data/samples -mkdir -p logs -mkdir -p config - -# Create __init__.py files -touch src/__init__.py -touch src/calibration/__init__.py -touch src/classification/__init__.py -touch src/models/__init__.py -touch src/email_providers/__init__.py -touch src/processing/__init__.py -touch src/adjustment/__init__.py -touch src/export/__init__.py -touch src/utils/__init__.py -touch tests/__init__.py - -# Windows equivalent: -# type nul > src\__init__.py -# (repeat for each) -``` - ---- - -## PHASE 2: CORE INFRASTRUCTURE - -### Step 7: Config System (src/utils/config.py) - -Create the configuration loader: - -```python -"""Configuration management.""" -import yaml -from pathlib import Path -from typing import Dict, Any -from pydantic import BaseModel - - -class Config(BaseModel): - """Main configuration model.""" - version: str - calibration: Dict[str, Any] - processing: Dict[str, Any] - classification: Dict[str, Any] - llm: Dict[str, Any] - email_providers: Dict[str, Any] - features: Dict[str, Any] - export: Dict[str, Any] - logging: Dict[str, Any] - cleanup: Dict[str, Any] - - class Config: - extra = "allow" - - -def load_config(config_path: str = "config/default_config.yaml") -> Config: - """Load configuration from YAML file.""" - with open(config_path, 'r') as f: - config_dict = yaml.safe_load(f) - return Config(**config_dict) - - -def load_categories(categories_path: str = "config/categories.yaml") -> Dict[str, Dict]: - """Load category definitions.""" - with open(categories_path, 'r') as f: - data = yaml.safe_load(f) - return data['categories'] - - -def load_features(features_path: str = "config/features.yaml") -> Dict[str, Any]: - """Load feature configuration.""" - with open(features_path, 'r') as f: - return 
yaml.safe_load(f) -``` - -**Test:** -```bash -python -c "from src.utils.config import load_config; print(load_config())" -``` - -### Step 8: Logging System (src/utils/logging.py) - -```python -"""Logging configuration.""" -import logging -import sys -from pathlib import Path -from rich.logging import RichHandler - - -def setup_logging(config: dict): - """Setup logging with console and file handlers.""" - log_level = config.get('level', 'INFO') - log_file = config.get('file', 'logs/email-sorter.log') - - # Create logs directory - Path(log_file).parent.mkdir(parents=True, exist_ok=True) - - # Create logger - logger = logging.getLogger() - logger.setLevel(log_level) - - # Remove existing handlers - logger.handlers = [] - - # Console handler with rich formatting - console_handler = RichHandler( - rich_tracebacks=True, - markup=True, - show_time=True, - show_path=False - ) - console_handler.setLevel(log_level) - console_formatter = logging.Formatter('%(message)s') - console_handler.setFormatter(console_formatter) - - # File handler - file_handler = logging.FileHandler(log_file) - file_handler.setLevel(log_level) - file_formatter = logging.Formatter( - '%(asctime)s - %(name)s - %(levelname)s - %(message)s' - ) - file_handler.setFormatter(file_formatter) - - # Add handlers - logger.addHandler(console_handler) - logger.addHandler(file_handler) - - return logger -``` - -### Step 9: Email Data Models (src/email_providers/base.py) - -```python -"""Base email provider interface and data models.""" -from abc import ABC, abstractmethod -from dataclasses import dataclass, field -from datetime import datetime -from typing import List, Dict, Any, Optional - - -@dataclass -class Email: - """Unified email data model.""" - id: str - subject: str - sender: str - sender_name: Optional[str] = None - date: Optional[datetime] = None - body: str = "" - body_snippet: str = "" - has_attachments: bool = False - attachments: List[Dict] = field(default_factory=list) - headers: Dict = field(default_factory=dict) - labels: List[str] = field(default_factory=list) - is_read: bool = False - - def __post_init__(self): - """Generate body_snippet if not provided.""" - if not self.body_snippet and self.body: - self.body_snippet = self.body[:500] - - -class BaseProvider(ABC): - """Abstract base class for email providers.""" - - @abstractmethod - def connect(self, credentials: Dict) -> bool: - """Establish connection to email provider.""" - pass - - @abstractmethod - def fetch_emails(self, limit: int = None, filters: Dict = None) -> List[Email]: - """Fetch emails from provider.""" - pass - - @abstractmethod - def update_labels(self, email_id: str, labels: List[str]) -> bool: - """Update email labels/folders.""" - pass - - @abstractmethod - def batch_update(self, updates: List[Dict]) -> bool: - """Batch update multiple emails.""" - pass - - @abstractmethod - def disconnect(self): - """Close connection.""" - pass -``` - -**Test:** -```bash -python -c "from src.email_providers.base import Email; e = Email(id='1', subject='Test', sender='test@test.com'); print(e)" -``` - ---- - -## PHASE 3: CONFIGURATION FILES - -### Step 10: Create config/default_config.yaml - -```yaml -version: "1.0.0" - -calibration: - sample_size: 1500 - sample_strategy: "stratified" - validation_size: 300 - min_confidence: 0.6 - -processing: - batch_size: 100 - llm_queue_size: 100 - parallel_workers: 4 - checkpoint_interval: 1000 - -classification: - default_threshold: 0.75 - min_threshold: 0.60 - max_threshold: 0.90 - adjustment_step: 0.05 - 
adjustment_frequency: 1000 - category_thresholds: - junk: 0.85 - auth: 0.80 - conversational: 0.65 - -llm: - provider: "ollama" - model: "qwen2.5:1.5b" - base_url: "http://localhost:11434" - temperature: 0.1 - max_tokens: 500 - timeout: 30 - retry_attempts: 3 - -email_providers: - gmail: - batch_size: 100 - microsoft: - batch_size: 100 - imap: - timeout: 30 - batch_size: 50 - -features: - text_features: - max_vocab_size: 10000 - ngram_range: [1, 2] - min_df: 2 - max_df: 0.95 - -export: - format: "json" - include_confidence: true - create_report: true - -logging: - level: "INFO" - file: "logs/email-sorter.log" - -cleanup: - delete_temp_files: true - delete_repo_after: false -``` - -### Step 11: Create config/categories.yaml - -(See PROJECT_BLUEPRINT.md for full content) - -### Step 12: Create config/features.yaml - -(See PROJECT_BLUEPRINT.md for full content) - -**Test:** -```bash -python -c "from src.utils.config import load_config, load_categories; print(load_config()); print(load_categories())" -``` - ---- - -## PHASE 4: EMAIL PROVIDERS - -### Step 13: Gmail Provider (src/email_providers/gmail.py) - -```python -"""Gmail API provider implementation.""" -import base64 -import logging -from typing import List, Dict, Optional -from datetime import datetime -from email.utils import parsedate_to_datetime - -from google.oauth2.credentials import Credentials -from google.auth.transport.requests import Request -from google_auth_oauthlib.flow import InstalledAppFlow -from googleapiclient.discovery import build -from googleapiclient.errors import HttpError - -from .base import BaseProvider, Email - -logger = logging.getLogger(__name__) - - -class GmailProvider(BaseProvider): - """Gmail API email provider.""" - - SCOPES = [ - 'https://www.googleapis.com/auth/gmail.readonly', - 'https://www.googleapis.com/auth/gmail.modify' - ] - - def __init__(self): - self.service = None - self.user_id = 'me' - - def connect(self, credentials_path: str) -> bool: - """Connect to Gmail API using OAuth credentials.""" - try: - # For first-time auth - flow = InstalledAppFlow.from_client_secrets_file( - credentials_path, self.SCOPES - ) - creds = flow.run_local_server(port=0) - - self.service = build('gmail', 'v1', credentials=creds) - logger.info("Connected to Gmail API") - return True - - except Exception as e: - logger.error(f"Failed to connect to Gmail: {e}") - return False - - def fetch_emails(self, limit: int = None, filters: Dict = None) -> List[Email]: - """Fetch emails from Gmail.""" - emails = [] - - try: - # Build query - query = filters.get('query', '') if filters else '' - - # Get message IDs - results = self.service.users().messages().list( - userId=self.user_id, - q=query, - maxResults=min(limit or 500, 500) if limit else 500 - ).execute() - - messages = results.get('messages', []) - - # Fetch full messages - for msg_info in messages: - email = self._fetch_message(msg_info['id']) - if email: - emails.append(email) - if limit and len(emails) >= limit: - break - - logger.info(f"Fetched {len(emails)} emails from Gmail") - return emails - - except HttpError as e: - logger.error(f"Gmail API error: {e}") - return emails - - def _fetch_message(self, msg_id: str) -> Optional[Email]: - """Fetch and parse a single message.""" - try: - msg = self.service.users().messages().get( - userId=self.user_id, - id=msg_id, - format='full' - ).execute() - - return self._parse_message(msg) - - except Exception as e: - logger.error(f"Error fetching message {msg_id}: {e}") - return None - - def _parse_message(self, msg: Dict) -> 
Email: - """Parse Gmail message into Email object.""" - headers = {h['name']: h['value'] for h in msg['payload']['headers']} - - # Extract body - body = self._get_body(msg['payload']) - - # Parse date - date = None - if 'Date' in headers: - try: - date = parsedate_to_datetime(headers['Date']) - except: - pass - - # Check attachments - has_attachments = False - attachments = [] - if 'parts' in msg['payload']: - for part in msg['payload']['parts']: - if part.get('filename'): - has_attachments = True - attachments.append({ - 'filename': part['filename'], - 'mime_type': part['mimeType'], - 'size': part.get('body', {}).get('size', 0) - }) - - return Email( - id=msg['id'], - subject=headers.get('Subject', 'No Subject'), - sender=headers.get('From', ''), - date=date, - body=body, - has_attachments=has_attachments, - attachments=attachments, - headers=headers, - labels=msg.get('labelIds', []), - is_read='UNREAD' not in msg.get('labelIds', []) - ) - - def _get_body(self, payload: Dict) -> str: - """Extract email body from payload.""" - body = "" - - if 'body' in payload and 'data' in payload['body']: - body = base64.urlsafe_b64decode(payload['body']['data']).decode('utf-8', errors='ignore') - elif 'parts' in payload: - for part in payload['parts']: - if part['mimeType'] == 'text/plain': - if 'data' in part['body']: - body = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8', errors='ignore') - break - - return body - - def update_labels(self, email_id: str, labels: List[str]) -> bool: - """Update labels for a single email.""" - try: - self.service.users().messages().modify( - userId=self.user_id, - id=email_id, - body={'addLabelIds': labels} - ).execute() - return True - except Exception as e: - logger.error(f"Error updating labels: {e}") - return False - - def batch_update(self, updates: List[Dict]) -> bool: - """Batch update multiple emails.""" - try: - batch_size = 100 - - for i in range(0, len(updates), batch_size): - batch = updates[i:i+batch_size] - email_ids = [u['email_id'] for u in batch] - labels = list(set([l for u in batch for l in u.get('labels', [])])) - - self.service.users().messages().batchModify( - userId=self.user_id, - body={ - 'ids': email_ids, - 'addLabelIds': labels - } - ).execute() - - logger.info(f"Batch updated {len(updates)} emails") - return True - - except Exception as e: - logger.error(f"Batch update error: {e}") - return False - - def disconnect(self): - """Close connection.""" - self.service = None - logger.info("Disconnected from Gmail") -``` - -**Test (requires Gmail OAuth setup):** -```bash -# First: Set up OAuth in Google Cloud Console -# Download credentials.json -python -c "from src.email_providers.gmail import GmailProvider; p = GmailProvider(); p.connect('credentials.json'); emails = p.fetch_emails(limit=10); print(f'Fetched {len(emails)} emails')" -``` - ---- - -## PHASE 5: FEATURE EXTRACTION - -### Step 14: Feature Extractor (src/classification/feature_extractor.py) - -```python -"""Feature extraction from emails.""" -import re -import logging -from typing import Dict, List, Any -from datetime import datetime -from urllib.parse import urlparse - -import pandas as pd -import numpy as np -from sklearn.feature_extraction.text import TfidfVectorizer - -from src.email_providers.base import Email - -logger = logging.getLogger(__name__) - - -class FeatureExtractor: - """Extract features from emails for classification.""" - - def __init__(self, config: Dict = None): - """Initialize with feature configuration.""" - self.config = config or { - 
'text_features': { - 'max_features': 10000, - 'ngram_range': [1, 2], - 'min_df': 2, - 'max_df': 0.95 - } - } - - self.text_vectorizer = None - self._initialize_vectorizer() - - def _initialize_vectorizer(self): - """Initialize TF-IDF vectorizer.""" - text_config = self.config.get('text_features', {}) - self.text_vectorizer = TfidfVectorizer( - max_features=text_config.get('max_features', 10000), - ngram_range=tuple(text_config.get('ngram_range', [1, 2])), - min_df=text_config.get('min_df', 2), - max_df=text_config.get('max_df', 0.95), - sublinear_tf=True - ) - - def extract(self, email: Email) -> Dict[str, Any]: - """ - Extract features from a single email. - - Args: - email: Email object - - Returns: - Dictionary of features - """ - features = {} - - # Text for TF-IDF - features['text'] = f"{email.subject} {email.body_snippet}" - - # Structural features - features.update(self._extract_structural(email)) - - # Sender features - features.update(self._extract_sender(email)) - - # Pattern features - features.update(self._extract_patterns(email)) - - return features - - def _extract_structural(self, email: Email) -> Dict[str, Any]: - """Extract structural features.""" - features = {} - - # Attachments - features['has_attachments'] = email.has_attachments - features['attachment_count'] = len(email.attachments) - - # Links and images - body = email.body or email.body_snippet - features['link_count'] = len(re.findall(r'https?://', body)) - features['image_count'] = len(re.findall(r' Dict[str, Any]: - """Extract sender-based features.""" - features = {} - - sender = email.sender - if '@' in sender: - # Extract domain - domain = sender.split('@')[1].lower() - features['sender_domain'] = domain - - # Domain type - freemail_domains = {'gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com', 'icloud.com'} - noreply_patterns = ['noreply', 'no-reply', 'donotreply'] - marketing_patterns = ['marketing', 'newsletter', 'promo'] - - if domain in freemail_domains: - features['sender_domain_type'] = 'freemail' - elif any(p in sender.lower() for p in noreply_patterns): - features['sender_domain_type'] = 'noreply' - elif any(p in sender.lower() for p in marketing_patterns): - features['sender_domain_type'] = 'marketing' - else: - features['sender_domain_type'] = 'corporate' - - features['is_noreply'] = any(p in sender.lower() for p in noreply_patterns) - else: - features['sender_domain'] = 'unknown' - features['sender_domain_type'] = 'unknown' - features['is_noreply'] = False - - return features - - def _extract_patterns(self, email: Email) -> Dict[str, Any]: - """Extract pattern-based features.""" - features = {} - - body = (email.body or email.body_snippet).lower() - subject = email.subject.lower() - combined = f"{subject} {body}" - - # Common patterns - features['has_unsubscribe'] = 'unsubscribe' in combined - features['has_otp_pattern'] = bool(re.search(r'\b\d{4,6}\b', combined)) - features['has_price'] = bool(re.search(r'\$\d+', combined)) - features['has_tracking_pattern'] = bool(re.search(r'tracking\s*(number|#)', combined)) - features['has_invoice_pattern'] = bool(re.search(r'(invoice|bill|receipt)\s*#?\d+', combined)) - features['has_meeting_pattern'] = bool(re.search(r'(meeting|call|zoom|teams)', combined)) - - return features - - def extract_batch(self, emails: List[Email]) -> pd.DataFrame: - """ - Extract features from batch of emails. 
- - Args: - emails: List of Email objects - - Returns: - DataFrame with all features - """ - # Extract features for each email - feature_dicts = [self.extract(email) for email in emails] - - # Convert to DataFrame - df = pd.DataFrame(feature_dicts) - - # Transform text features if vectorizer is fitted - if self.text_vectorizer and 'text' in df.columns: - if hasattr(self.text_vectorizer, 'vocabulary_'): - text_features = self.text_vectorizer.transform(df['text']) - text_df = pd.DataFrame( - text_features.toarray(), - columns=[f"text_{i}" for i in range(text_features.shape[1])] - ) - df = pd.concat([df.drop('text', axis=1), text_df], axis=1) - else: - df = df.drop('text', axis=1) - - return df - - def fit_text_vectorizer(self, emails: List[Email]): - """Fit text vectorizer on corpus.""" - texts = [f"{e.subject} {e.body_snippet}" for e in emails] - self.text_vectorizer.fit(texts) - logger.info(f"Fitted vectorizer with {len(self.text_vectorizer.vocabulary_)} features") -``` - -**Test:** -```bash -# Create mock email and test -python -c " -from src.email_providers.base import Email -from src.classification.feature_extractor import FeatureExtractor -from datetime import datetime - -email = Email( - id='1', - subject='Meeting at 3pm', - sender='john@company.com', - date=datetime.now(), - body='Let us meet to discuss the project', - has_attachments=True -) - -extractor = FeatureExtractor() -features = extractor.extract(email) -print(features) -" -``` - ---- - -## PHASE 6: ML CLASSIFIER (BLOCKER - NEED MODEL) - -### Step 15: ML Classifier Wrapper (src/classification/ml_classifier.py) - -```python -"""ML-based email classifier.""" -import logging -import pickle -from typing import Dict, List, Any -import numpy as np -from pathlib import Path - -logger = logging.getLogger(__name__) - - -class MLClassifier: - """Wrapper for pre-trained ML classification model.""" - - def __init__(self, model_path: str = "src/models/pretrained/classifier.pkl"): - """Load pre-trained model.""" - self.model = None - self.label_encoder = None - self.categories = [] - self.feature_names = [] - - self._load_model(model_path) - - def _load_model(self, model_path: str): - """Load model from file.""" - try: - with open(model_path, 'rb') as f: - model_data = pickle.load(f) - - self.model = model_data['model'] - self.label_encoder = model_data.get('label_encoder') - self.categories = model_data.get('categories', []) - self.feature_names = model_data.get('feature_names', []) - - logger.info(f"Loaded ML model with {len(self.categories)} categories") - - except FileNotFoundError: - logger.warning(f"Model file not found: {model_path}") - logger.warning("Will need to train model or use alternative classification") - except Exception as e: - logger.error(f"Error loading model: {e}") - - def predict(self, features: np.ndarray) -> Dict[str, Any]: - """ - Predict category for feature vector. 
- - Args: - features: Feature vector or DataFrame row - - Returns: - { - 'category': str, - 'confidence': float, - 'probabilities': Dict[str, float] - } - """ - if self.model is None: - return { - 'category': 'unknown', - 'confidence': 0.0, - 'probabilities': {}, - 'error': 'Model not loaded' - } - - # Get probabilities - probs = self.model.predict_proba([features])[0] - - # Get predicted class - pred_class = np.argmax(probs) - category = self.categories[pred_class] - confidence = float(probs[pred_class]) - - # All probabilities - prob_dict = { - self.categories[i]: float(probs[i]) - for i in range(len(self.categories)) - } - - return { - 'category': category, - 'confidence': confidence, - 'probabilities': prob_dict - } - - def predict_batch(self, features: np.ndarray) -> List[Dict[str, Any]]: - """Predict for batch of feature vectors.""" - return [self.predict(f) for f in features] -``` - -### ⚠️ CRITICAL: You need to either: - -**Option A: Create a placeholder model for testing** -```python -# scripts/create_mock_model.py -import pickle -from sklearn.ensemble import RandomForestClassifier -import numpy as np - -# Create dummy model -model = RandomForestClassifier(n_estimators=10) -X_dummy = np.random.rand(100, 50) -y_dummy = np.random.randint(0, 12, 100) -model.fit(X_dummy, y_dummy) - -categories = [ - 'junk', 'transactional', 'auth', 'newsletters', - 'social', 'automated', 'conversational', 'work', - 'personal', 'finance', 'travel', 'unknown' -] - -model_data = { - 'model': model, - 'categories': categories, - 'feature_names': [f'feature_{i}' for i in range(50)] -} - -with open('src/models/pretrained/classifier.pkl', 'wb') as f: - pickle.dump(model_data, f) - -print("Mock model created!") -``` - -**Option B: Train a real model (recommended)** -See scripts/train_model.py (to be created in next phase) - ---- - -## PHASE 7: LLM INTEGRATION - -### Step 16: LLM Classifier (src/classification/llm_classifier.py) - -```python -"""LLM-based email classifier using Ollama.""" -import logging -import json -import re -from typing import Dict, List, Any -from abc import ABC, abstractmethod - -logger = logging.getLogger(__name__) - - -class BaseLLMProvider(ABC): - """Abstract LLM provider.""" - - @abstractmethod - def complete(self, prompt: str, **kwargs) -> str: - pass - - @abstractmethod - def test_connection(self) -> bool: - pass - - -class OllamaProvider(BaseLLMProvider): - """Ollama local LLM provider.""" - - def __init__(self, model: str = "qwen2.5:1.5b", base_url: str = "http://localhost:11434"): - try: - import ollama - self.client = ollama.Client(host=base_url) - self.model = model - logger.info(f"Initialized Ollama provider with model {model}") - except ImportError: - logger.error("ollama package not installed. 
Run: pip install ollama") - self.client = None - except Exception as e: - logger.error(f"Failed to initialize Ollama: {e}") - self.client = None - - def complete(self, prompt: str, **kwargs) -> str: - if not self.client: - raise RuntimeError("Ollama client not available") - - response = self.client.generate( - model=self.model, - prompt=prompt, - options={ - 'temperature': kwargs.get('temperature', 0.1), - 'num_predict': kwargs.get('max_tokens', 500) - } - ) - return response['response'] - - def test_connection(self) -> bool: - try: - self.client.list() - return True - except: - return False - - -class LLMClassifier: - """Email classifier using LLM.""" - - def __init__(self, provider: BaseLLMProvider, categories: Dict[str, Dict], config: Dict): - self.provider = provider - self.categories = categories - self.config = config - self.classification_prompt = self._load_prompt_template() - - def _load_prompt_template(self) -> str: - """Load or create classification prompt.""" - # Try to load from file first - try: - with open('prompts/classification.txt', 'r') as f: - return f.read() - except FileNotFoundError: - # Use default prompt - return """You are an expert email classifier. - -CATEGORIES: -{categories} - -EMAIL: -Subject: {subject} -From: {sender} -Has Attachments: {has_attachments} -Body Snippet: {body_snippet} - -ML Prediction: {ml_prediction} (confidence: {ml_confidence}) - -Respond with JSON only: -{{ - "category": "chosen_category", - "confidence": 0.85, - "reasoning": "brief explanation" -}} -""" - - def classify(self, email: Dict[str, Any]) -> Dict[str, Any]: - """Classify email using LLM.""" - # Build prompt - categories_str = "\n".join([ - f"- {name}: {info['description']}" - for name, info in self.categories.items() - ]) - - ml_pred = email.get('ml_prediction', {}) - - prompt = self.classification_prompt.format( - categories=categories_str, - subject=email.get('subject', 'N/A'), - sender=email.get('sender', 'N/A'), - has_attachments=email.get('has_attachments', False), - body_snippet=email.get('body_snippet', '')[:300], - ml_prediction=ml_pred.get('category', 'unknown'), - ml_confidence=ml_pred.get('confidence', 0.0) - ) - - try: - # Get LLM response - response = self.provider.complete( - prompt, - temperature=self.config['llm']['temperature'], - max_tokens=self.config['llm']['max_tokens'] - ) - - # Parse JSON response - result = self._parse_response(response) - return result - - except Exception as e: - logger.error(f"LLM classification failed: {e}") - return { - 'category': 'unknown', - 'confidence': 0.0, - 'reasoning': f'Error: {str(e)}', - 'error': True - } - - def _parse_response(self, response: str) -> Dict[str, Any]: - """Parse LLM JSON response.""" - # Try to extract JSON - json_match = re.search(r'\{.*\}', response, re.DOTALL) - if json_match: - try: - return json.loads(json_match.group()) - except json.JSONDecodeError: - pass - - # Fallback parsing - return { - 'category': 'unknown', - 'confidence': 0.5, - 'reasoning': response[:200] - } -``` - -**Test (requires Ollama running):** -```bash -# First: Install and start Ollama -# ollama pull qwen2.5:1.5b - -python -c " -from src.classification.llm_classifier import OllamaProvider, LLMClassifier -from src.utils.config import load_categories, load_config - -provider = OllamaProvider() -categories = load_categories() -config = load_config() - -classifier = LLMClassifier(provider, categories, config) - -email = { - 'subject': 'Your verification code is 123456', - 'sender': 'noreply@bank.com', - 'has_attachments': False, - 
'body_snippet': 'Your one-time password is 123456', - 'ml_prediction': {'category': 'auth', 'confidence': 0.65} -} - -result = classifier.classify(email) -print(result) -" -``` - ---- - -## NEXT PHASES - -Due to length limits, the remaining phases are: - -### Phase 8: Adaptive Classifier -- Dynamic threshold adjustment -- Sender rule learning -- Classification orchestration - -### Phase 9: Processing Pipeline -- Bulk processor -- Queue management -- Checkpointing - -### Phase 10: Calibration System -- Email sampling -- LLM calibration analysis -- Validation - -### Phase 11: Export & Sync -- Results exporter -- Gmail sync -- Report generation - -### Phase 12: Main CLI -- Click interface -- End-to-end orchestration - -### Phase 13: Testing -- Unit tests -- Integration tests -- Full pipeline test on Marion's inbox - ---- - -## TESTING STRATEGY - -### Unit Testing -```bash -pytest tests/test_classification.py -v -``` - -### Integration Testing -```bash -# Test on 100 emails -python src/main.py --source gmail --credentials creds.json --output test/ --limit 100 - -# Test on 1000 emails -python src/main.py --source gmail --credentials creds.json --output test/ --limit 1000 -``` - -### Full Pipeline -```bash -# Run on Marion's full inbox -python src/main.py --source gmail --credentials marion-creds.json --output results/ -``` - ---- - -## CRITICAL NEXT STEPS - -1. **DECIDE: ML Model Strategy** - - Option A: Create mock model for immediate testing - - Option B: Train real model (takes 1-2 days) - -2. **Set up Gmail OAuth** - - Google Cloud Console - - Enable Gmail API - - Download credentials.json - -3. **Install and test Ollama** - - Download from ollama.ai - - Pull model: `ollama pull qwen2.5:1.5b` - - Test: `ollama run qwen2.5:1.5b "test"` - -4. **Continue building** - - Next: Adaptive Classifier - - Then: Processing Pipeline - - Then: Full integration - ---- - -**THIS IS THE ACTUAL BUILD GUIDE** - -Everything in this document provides real, executable steps to build the system. diff --git a/docs/CLASSIFICATION_METHODS_COMPARISON.md b/docs/CLASSIFICATION_METHODS_COMPARISON.md new file mode 100644 index 0000000..820773a --- /dev/null +++ b/docs/CLASSIFICATION_METHODS_COMPARISON.md @@ -0,0 +1,518 @@ +# Email Classification Methods: Comparative Analysis + +## Executive Summary + +This document compares three email classification approaches tested on an 801-email personal Gmail dataset: + +| Method | Accuracy | Time | Best For | +|--------|----------|------|----------| +| ML-Only | 54.9% | 5 sec | 10k+ emails, speed critical | +| ML+LLM Fallback | 93.3% | 3.5 min | 1k-10k emails, balanced | +| Agent Analysis | 99.8% | 15-30 min | <1k emails, deep insights | + +**Key Finding:** The ML pipeline is overkill for datasets under ~5,000 emails. A 10-15 minute agent pre-analysis phase could dramatically improve ML accuracy for larger datasets. 
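
To make the pre-analysis idea concrete: the cheapest first step of such a phase is a sender-domain frequency scan, which takes seconds and immediately shows how automated an inbox is. Below is a minimal, self-contained sketch, not part of the current pipeline; the directory of `.eml` files and the noreply heuristic are assumptions for illustration.

```python
# Hypothetical sender-domain pre-scan over a folder of .eml files.
# Stdlib only; illustrates the first step of an agent pre-analysis phase.
import sys
from collections import Counter
from email import policy
from email.parser import BytesParser
from email.utils import parseaddr
from pathlib import Path


def scan_senders(email_dir: str, top_n: int = 20) -> None:
    domains: Counter = Counter()
    noreply = total = 0
    for path in Path(email_dir).glob("*.eml"):
        # Headers are enough for a sender scan; skip body parsing
        with path.open("rb") as fh:
            msg = BytesParser(policy=policy.default).parse(fh, headersonly=True)
        addr = parseaddr(str(msg.get("From", "")))[1].lower()
        if "@" not in addr:
            continue
        total += 1
        domains[addr.split("@")[1]] += 1
        if any(p in addr for p in ("noreply", "no-reply", "donotreply")):
            noreply += 1
    print(f"{total} emails, {len(domains)} unique domains, "
          f"{noreply / max(total, 1):.0%} noreply-style senders")
    for domain, count in domains.most_common(top_n):
        print(f"{count:5d}  {domain}")


if __name__ == "__main__":
    scan_senders(sys.argv[1])
```

On a mailbox like the 801-email test set, a scan like this surfaces the dominant marketplace, travel, and streaming senders in seconds - the same patterns the agent analysis below identified manually.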
+ +--- + +## Test Dataset Profile + +| Characteristic | Value | +|----------------|-------| +| Total Emails | 801 | +| Date Range | 20 years (2005-2025) | +| Unique Senders | ~150 | +| Automated % | 48.8% | +| Personal % | 1.6% | +| Structure Level | MEDIUM-HIGH | + +### Email Type Breakdown (Sanitized) + +``` +Automated Notifications 48.8% ████████████████████████ +├─ Art marketplace alerts 16.2% ████████ +├─ Shopping promotions 15.4% ███████ +├─ Travel recommendations 13.4% ██████ +└─ Streaming promotions 8.5% ████ + +Business/Professional 20.1% ██████████ +├─ Cloud service reports 13.0% ██████ +├─ Security alerts 7.1% ███ + +AI/Developer Services 12.8% ██████ +├─ AI platform updates 6.4% ███ +├─ Developer tool updates 6.4% ███ + +Personal/Other 18.3% █████████ +├─ Entertainment 5.1% ██ +├─ Productivity tools 3.7% █ +├─ Direct correspondence 1.6% █ +└─ Miscellaneous 7.9% ███ +``` + +--- + +## Method 1: ML-Only Classification + +### Configuration +```yaml +model: LightGBM (pretrained on Enron dataset) +embeddings: all-minilm:l6-v2 (384 dimensions) +threshold: 0.55 confidence +categories: 11 generic (Work, Updates, Financial, etc.) +``` + +### Results + +| Metric | Value | +|--------|-------| +| Accuracy Estimate | 54.9% | +| High Confidence (>55%) | 477 (59.6%) | +| Low Confidence | 324 (40.4%) | +| Processing Time | ~5 seconds | +| LLM Calls | 0 | + +### Category Distribution (ML-Only) + +| Category | Count | % | +|----------|-------|---| +| Work | 243 | 30.3% | +| Technical | 198 | 24.7% | +| Updates | 156 | 19.5% | +| External | 89 | 11.1% | +| Operational | 45 | 5.6% | +| Financial | 38 | 4.7% | +| Other | 32 | 4.0% | + +### Limitations Observed + +1. **Domain Mismatch:** Trained on corporate Enron emails, applied to personal Gmail +2. **Generic Categories:** "Work" and "Technical" absorbed everything +3. **No Sender Intelligence:** Didn't leverage sender domain patterns +4. **High Uncertainty:** 40% needed LLM review but got none + +### When ML-Only Works + +- 10,000+ emails where speed matters +- Corporate/enterprise datasets similar to training data +- Pre-filtering before human review +- Cost-constrained environments (no LLM API) + +--- + +## Method 2: ML + LLM Fallback + +### Configuration +```yaml +ml_model: LightGBM (same as above) +llm_model: qwen3-coder-30b (vLLM on localhost:11433) +threshold: 0.55 confidence +fallback_trigger: confidence < threshold +``` + +### Results + +| Metric | Value | +|--------|-------| +| Accuracy Estimate | 93.3% | +| ML Classified | 477 (59.6%) | +| LLM Classified | 324 (40.4%) | +| Processing Time | ~3.5 minutes | +| LLM Calls | 324 | + +### Category Distribution (ML+LLM) + +| Category | Count | % | Source | +|----------|-------|---|--------| +| Work | 243 | 30.3% | ML | +| Technical | 156 | 19.5% | ML | +| newsletters | 98 | 12.2% | LLM | +| junk | 87 | 10.9% | LLM | +| transactional | 76 | 9.5% | LLM | +| Updates | 62 | 7.7% | ML | +| auth | 45 | 5.6% | LLM | +| Other | 34 | 4.2% | Mixed | + +### Improvements Over ML-Only + +1. **New Categories:** LLM introduced "newsletters", "junk", "transactional", "auth" +2. **Better Separation:** Marketing vs. transactional distinguished +3. **Higher Confidence:** 93.3% vs 54.9% accuracy estimate + +### Limitations Observed + +1. **Category Inconsistency:** ML uses "Updates", LLM uses "newsletters" +2. **No Sender Context:** Still classifying email-by-email +3. **Generic LLM Prompt:** Doesn't know about user's specific interests +4. 
**Time Cost:** 324 sequential LLM calls at ~0.6s each + +### When ML+LLM Works + +- 1,000-10,000 emails +- Mixed automated/personal content +- When accuracy matters more than speed +- Local LLM available (cost-free fallback) + +--- + +## Method 3: Agent Analysis (Manual) + +### Approach +``` +Phase 1: Initial Discovery (5 min) + - Sample filenames and subjects + - Identify sender domains + - Detect patterns + +Phase 2: Pattern Extraction (10 min) + - Design domain-specific rules + - Test regex patterns + - Validate on subset + +Phase 3: Deep Dive (5 min) + - Track order lifecycles + - Identify billing patterns + - Find edge cases + +Phase 4: Report Generation (5 min) + - Synthesize findings + - Create actionable recommendations +``` + +### Results + +| Metric | Value | +|--------|-------| +| Accuracy | 99.8% (799/801) | +| Categories | 15 custom | +| Processing Time | ~25 minutes | +| LLM Calls | ~20 (analysis only) | + +### Category Distribution (Agent Analysis) + +| Category | Count | % | Subcategories | +|----------|-------|---|---------------| +| Art & Collectibles | 130 | 16.2% | Marketplace alerts | +| Shopping | 123 | 15.4% | eBay, AliExpress, Automotive | +| Entertainment | 109 | 13.6% | Streaming, Gaming, Social | +| Travel & Tourism | 107 | 13.4% | Review sites, Bookings | +| Google Services | 104 | 13.0% | Business, Ads, Analytics | +| Security | 57 | 7.1% | Sign-in alerts, 2FA | +| AI Services | 51 | 6.4% | Claude, OpenAI, Lambda | +| Developer Tools | 51 | 6.4% | ngrok, Firebase, Docker | +| Productivity | 30 | 3.7% | Screen recording, Docs | +| Personal | 13 | 1.6% | Direct correspondence | +| Other | 26 | 3.2% | Childcare, Legal, etc. | + +### Unique Insights (Not Found by ML) + +1. **Specific Artist Tracking:** 95 alerts for specific artist "Dan Colen" +2. **Order Lifecycle:** Single order generated 7 notification emails +3. **Billing Patterns:** Monthly receipts from AI services on 15th +4. **Business Context:** User runs "Fox Software Solutions" +5. **Filtering Rules:** Ready-to-implement Gmail filters + +### When Agent Analysis Works + +- Under 1,000 emails +- Initial dataset understanding +- Creating filtering rules +- One-time deep analysis +- Training data preparation + +--- + +## Comparative Analysis + +### Accuracy vs Time Tradeoff + +``` +Accuracy +100% ─┬─────────────────────────●─── Agent (99.8%) + │ ●─────── ML+LLM (93.3%) + 75% ─┤ + │ + 50% ─┼────●───────────────────────── ML-Only (54.9%) + │ + 25% ─┤ + │ + 0% ─┴────┬────────┬────────┬────────┬─── Time + 5s 1m 5m 30m +``` + +### Cost Analysis (per 1000 emails) + +| Method | Compute | LLM Calls | Est. Cost | +|--------|---------|-----------|-----------| +| ML-Only | 5 sec | 0 | $0.00 | +| ML+LLM | 4 min | ~400 | $0.02-0.40* | +| Agent | 30 min | ~30 | $0.01-0.10* | + +*Depends on LLM provider; local = free, cloud = varies + +### Category Quality + +| Aspect | ML-Only | ML+LLM | Agent | +|--------|---------|--------|-------| +| Granularity | Low (11) | Medium (16) | High (15+subs) | +| Domain-Specific | No | Partial | Yes | +| Actionable | Limited | Moderate | High | +| Sender-Aware | No | No | Yes | +| Context-Aware | No | Limited | Yes | + +--- + +## Enhancement Recommendations + +### 1. 
Pre-Analysis Phase (10-15 min investment)
+
+**Concept:** Run agent analysis BEFORE ML classification to:
+- Discover sender domains and their purposes
+- Identify category patterns specific to the dataset
+- Generate custom classification rules
+- Create sender-to-category mappings
+
+**Implementation:**
+```python
+class PreAnalysisAgent:
+    def analyze(self, emails: List[Email], sample_size=100):
+        # Phase 1: Sender domain clustering
+        domains = self.cluster_by_sender_domain(emails)
+
+        # Phase 2: Subject pattern extraction
+        patterns = self.extract_subject_patterns(emails)
+
+        # Phase 3: Generate custom categories
+        categories = self.generate_categories(domains, patterns)
+
+        # Phase 4: Create sender-category mapping
+        sender_map = self.map_senders_to_categories(domains, categories)
+
+        return {
+            'categories': categories,
+            'sender_map': sender_map,
+            'patterns': patterns
+        }
+```
+
+**Expected Impact:**
+- Accuracy: 54.9% → 85-90% (ML-only with pre-analysis)
+- Time: +10 min setup, same runtime
+- Best for: 5,000+ email datasets
+
+### 2. Sender-First Classification
+
+**Concept:** Classify by sender domain BEFORE content analysis:
+
+```python
+SENDER_CATEGORIES = {
+    # High-volume automated
+    'mutualart.com': ('Notifications', 'Art Alerts'),
+    'tripadvisor.com': ('Notifications', 'Travel Marketing'),
+    'ebay.com': ('Shopping', 'Marketplace'),
+    'spotify.com': ('Entertainment', 'Streaming'),
+
+    # Security - never auto-filter
+    'accounts.google.com': ('Security', 'Account Alerts'),
+
+    # Business (full address, matched before the domain lookup)
+    'businessprofile-noreply@google.com': ('Business', 'Reports'),
+}
+
+def classify(email):
+    sender = email.sender.lower()
+    # Exact-address entries first, so full-address keys are reachable
+    if sender in SENDER_CATEGORIES:
+        return SENDER_CATEGORIES[sender]
+    # Then domain entries; extract_domain must preserve subdomains so
+    # 'accounts.google.com' stays distinct from plain 'google.com'
+    domain = extract_domain(sender)
+    if domain in SENDER_CATEGORIES:
+        return SENDER_CATEGORIES[domain]  # covers ~80% of emails
+    return ml_classify(email)  # Fallback for the remaining ~20%
+```
+
+**Expected Impact:**
+- Accuracy: 85-95% for known senders
+- Speed: 10x faster (skip ML for known senders)
+- Maintenance: Requires sender map updates
+
+### 3. Post-Analysis Enhancement
+
+**Concept:** Run agent analysis AFTER ML to:
+- Validate classification quality
+- Extract deeper insights
+- Generate reports and recommendations
+- Identify misclassifications
+
+**Implementation:**
+```python
+class PostAnalysisAgent:
+    def analyze(self, emails: List[Email], classifications: List[Result]):
+        # Validate: Check for obvious errors
+        errors = self.detect_misclassifications(emails, classifications)
+
+        # Enrich: Add metadata not captured by ML
+        enriched = self.extract_metadata(emails)
+
+        # Insights: Generate actionable recommendations
+        insights = self.generate_insights(emails, classifications)
+
+        return {
+            'corrections': errors,
+            'enrichments': enriched,
+            'insights': insights
+        }
+```
+
+### 4. Dataset Size Routing
+
+**Concept:** Automatically choose method based on volume:
+
+```python
+def choose_method(email_count: int, time_budget: str = 'normal'):
+    if email_count < 500:
+        return 'agent_only'      # Full agent analysis
+
+    elif email_count < 2000:
+        return 'agent_then_ml'   # Pre-analysis + ML
+
+    elif email_count < 10000:
+        return 'ml_with_llm'     # ML + LLM fallback
+
+    else:
+        return 'ml_only'         # Pure ML for speed
+```
+
+**Recommended Thresholds:**
+
+| Volume | Recommended Method | Rationale |
+|--------|-------------------|-----------|
+| <500 | Agent Only | ML overhead not worth it |
+| 500-2000 | Agent Pre-Analysis + ML | Investment pays off |
+| 2000-10000 | ML + LLM Fallback | Balanced approach |
+| >10000 | ML-Only | Speed critical |
+
+### 5. 
Hybrid Category System + +**Concept:** Merge ML categories with agent-discovered categories: + +```python +# ML Generic Categories (trained) +ML_CATEGORIES = ['Work', 'Updates', 'Technical', 'Financial', ...] + +# Agent-Discovered Categories (per-dataset) +AGENT_CATEGORIES = { + 'Art Alerts': {'parent': 'Updates', 'sender': 'mutualart.com'}, + 'Travel Marketing': {'parent': 'Updates', 'sender': 'tripadvisor.com'}, + 'AI Services': {'parent': 'Technical', 'keywords': ['anthropic', 'openai']}, +} + +def classify_hybrid(email, ml_result): + # First: Check agent-specific rules + for cat, rules in AGENT_CATEGORIES.items(): + if matches_rules(email, rules): + return (cat, ml_result.category) # Specific + generic + + # Fallback: ML result + return (ml_result.category, None) +``` + +--- + +## Implementation Roadmap + +### Phase 1: Quick Wins (1-2 hours) + +1. **Add sender-domain classifier** + - Map top 20 senders to categories + - Use as fast-path before ML + - Expected: +20% accuracy + +2. **Add dataset size routing** + - Check email count before processing + - Route small datasets to agent analysis + - Route large datasets to ML pipeline + +### Phase 2: Pre-Analysis Agent (4-8 hours) + +1. **Build sender clustering** + - Group emails by domain + - Calculate volume per domain + - Identify automated vs personal + +2. **Build pattern extraction** + - Find subject templates + - Extract IDs and tracking numbers + - Identify lifecycle stages + +3. **Generate sender map** + - Output: JSON mapping senders to categories + - Feed into ML pipeline as rules + +### Phase 3: Post-Analysis Enhancement (4-8 hours) + +1. **Build validation agent** + - Check low-confidence results + - Detect category conflicts + - Flag for review + +2. **Build enrichment agent** + - Extract order IDs + - Track lifecycles + - Generate insights + +3. **Integrate with HTML report** + - Add insights section + - Show lifecycle tracking + - Include recommendations + +--- + +## Conclusion + +### Key Takeaways + +1. **ML pipeline is overkill for <5,000 emails** - Agent analysis provides better accuracy with similar time investment + +2. **Sender domain is the strongest signal** - 80%+ emails can be classified by sender alone + +3. **Pre-analysis investment pays off** - 10-15 min agent setup dramatically improves ML accuracy + +4. **One-size-fits-all doesn't work** - Route by dataset size for optimal results + +5. 
**Post-analysis adds unique value** - Lifecycle tracking and insights not possible with ML alone + +### Recommended Default Pipeline + +``` +┌─────────────────────────────────────────────────────────────┐ +│ EMAIL CLASSIFICATION │ +└─────────────────────────────────────────────────────────────┘ + │ + ▼ + ┌─────────────────┐ + │ Count Emails │ + └────────┬────────┘ + │ + ┌──────────────────┼──────────────────┐ + │ │ │ + ▼ ▼ ▼ + <500 emails 500-5000 >5000 + │ │ │ + ▼ ▼ ▼ + ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ + │ Agent Only │ │ Pre-Analysis │ │ ML Pipeline │ + │ (15-30 min) │ │ + ML + Post │ │ (fast) │ + │ │ │ (15 min + ML)│ │ │ + └──────────────┘ └──────────────┘ └──────────────┘ + │ │ │ + ▼ ▼ ▼ + ┌──────────────────────────────────────────────────┐ + │ UNIFIED OUTPUT │ + │ - Categorized emails │ + │ - Confidence scores │ + │ - Insights & recommendations │ + │ - Filtering rules │ + └──────────────────────────────────────────────────┘ +``` + +--- + +*Document Version: 1.0* +*Created: 2025-11-28* +*Based on: brett-gmail dataset analysis (801 emails)* diff --git a/docs/COMPLETION_ASSESSMENT.md b/docs/COMPLETION_ASSESSMENT.md deleted file mode 100644 index e756cb3..0000000 --- a/docs/COMPLETION_ASSESSMENT.md +++ /dev/null @@ -1,526 +0,0 @@ -# Email Sorter - Completion Assessment - -**Date**: 2025-10-21 -**Status**: FEATURE COMPLETE - All 16 Phases Implemented -**Test Results**: 27/30 passing (90% success rate) -**Code Quality**: Complete with full type hints and clear mock labeling - ---- - -## Executive Summary - -The Email Sorter framework is **100% feature-complete** with all 16 development phases implemented. The system is ready for: - -1. **Immediate Use**: Framework testing with mock model (~90% test pass rate) -2. **Real Model Integration**: Download/train LightGBM model and deploy -3. **Production Processing**: Process Marion's 80k+ emails with real Gmail integration - -All core infrastructure, classifiers, learning systems, and export/sync mechanisms are complete and tested. 
- ---- - -## Phase Completion Checklist - -### Phase 1-3: Core Infrastructure ✅ -- [x] Project setup & dependencies (42 packages) -- [x] YAML-based configuration system -- [x] Rich-based logging with file output -- [x] Email data models with full type hints -- [x] Pydantic validation -- **Status**: Complete - -### Phase 4: Email Providers ✅ -- [x] MockProvider (fully functional for testing) -- [x] GmailProvider stub (OAuth-ready, graceful error handling) -- [x] IMAPProvider stub (ready for server config) -- [x] Attachment handling -- **Status**: Framework complete, awaiting credentials - -### Phase 5: Feature Extraction ✅ -- [x] Semantic embeddings (sentence-transformers, 384 dims) -- [x] Hard pattern matching (20+ regex patterns) -- [x] Structural features (metadata, timing, attachments) -- [x] Attachment analysis (PDF, DOCX, XLSX text extraction) -- [x] Embedding cache with MD5 hashing -- [x] Batch processing for efficiency -- **Status**: Complete with 90%+ test coverage - -### Phase 6: ML Classifier ✅ -- [x] Mock Random Forest (clearly labeled) -- [x] LightGBM trainer for real models -- [x] Model serialization/deserialization -- [x] Model integration framework -- [x] Pre-trained model loading -- **Status**: Framework ready, mock model for testing, real model integration tools provided - -### Phase 7: LLM Integration ✅ -- [x] OllamaProvider (local, with retry logic) -- [x] OpenAIProvider (API-compatible) -- [x] Graceful degradation when unavailable -- [x] Batch processing support -- **Status**: Complete - -### Phase 8: Adaptive Classifier ✅ -- [x] Three-tier classification system -- [x] Hard rules (instant, ~10%) -- [x] ML classifier (fast, ~85%) -- [x] LLM review (uncertain cases, ~5%) -- [x] Dynamic threshold management -- [x] Statistics tracking -- **Status**: Complete - -### Phase 9: Processing Pipeline ✅ -- [x] BulkProcessor with checkpointing -- [x] Resumable processing from checkpoints -- [x] Batch-based processing -- [x] Progress tracking -- [x] Error recovery -- **Status**: Complete with test coverage - -### Phase 10: Calibration System ✅ -- [x] EmailSampler (stratified + random) -- [x] LLMAnalyzer (discover natural categories) -- [x] CalibrationWorkflow (end-to-end) -- [x] Category validation -- **Status**: Complete with Enron dataset support - -### Phase 11: Export & Reporting ✅ -- [x] JSON export with metadata -- [x] CSV export for analysis -- [x] Organization by category -- [x] Human-readable reports -- [x] Statistics and metrics -- **Status**: Complete - -### Phase 12: Threshold & Pattern Learning ✅ -- [x] ThresholdAdjuster (learn from LLM feedback) -- [x] Agreement tracking per category -- [x] Automatic threshold suggestions -- [x] PatternLearner (sender-specific rules) -- [x] Category distribution tracking -- [x] Hard rule suggestions -- **Status**: Complete - -### Phase 13: Advanced Processing ✅ -- [x] EnronParser (maildir format support) -- [x] AttachmentHandler (PDF/DOCX content extraction) -- [x] ModelTrainer (real LightGBM training) -- [x] EmbeddingCache (MD5-based with disk persistence) -- [x] EmbeddingBatcher (parallel processing) -- [x] QueueManager (batch persistence) -- **Status**: Complete - -### Phase 14: Provider Sync ✅ -- [x] GmailSync (sync to Gmail labels) -- [x] IMAPSync (sync to IMAP keywords) -- [x] Configurable label mapping -- [x] Batch update support -- [x] Error handling and retry logic -- **Status**: Complete - -### Phase 15: Orchestration ✅ -- [x] EmailSorterOrchestrator (4-phase pipeline) -- [x] Full progress tracking -- [x] Timing and metrics -- 
[x] Error recovery -- [x] Modular component design -- **Status**: Complete - -### Phase 16: Packaging ✅ -- [x] setup.py with setuptools -- [x] pyproject.toml with PEP 517/518 -- [x] Optional dependencies (dev, gmail, ollama, openai) -- [x] Console script entry point -- [x] Git history with 11 commits -- **Status**: Complete - -### Phase 17: Testing ✅ -- [x] 23 unit tests -- [x] Integration tests -- [x] E2E pipeline tests -- [x] Feature extraction validation -- [x] Classifier flow testing -- **Status**: 27/30 passing (90% success rate) - ---- - -## Test Results Summary - -``` -======================== Test Execution Results ======================== - -PASSED (27 tests): -✅ test_email_model_validation - Email dataclass validation -✅ test_attachment_parsing - Attachment metadata extraction -✅ test_mock_provider - Mock email provider -✅ test_feature_extraction_basic - Basic feature extraction -✅ test_semantic_embeddings - Embedding generation (384 dims) -✅ test_hard_pattern_matching - Pattern detection (19/20 patterns) -✅ test_ml_classifier_prediction - Random Forest predictions -✅ test_adaptive_classifier_workflow - Three-tier classification -✅ test_embedding_cache - MD5-based cache hits/misses -✅ test_embedding_batcher - Batch processing -✅ test_queue_manager - LLM queue management -✅ test_bulk_processor - Resumable checkpointing -✅ test_email_sampler - Stratified sampling -✅ test_llm_analyzer - Category discovery -✅ test_threshold_adjuster - Dynamic threshold learning -✅ test_pattern_learner - Sender-specific rules -✅ test_results_exporter - JSON/CSV export -✅ test_provider_sync - Gmail/IMAP sync -✅ test_ollama_provider - LLM provider integration -✅ test_openai_provider - API-compatible LLM -✅ test_configuration_loading - YAML config parsing -✅ test_logging_system - Rich logging output -✅ test_end_to_end_mock_classification - Full pipeline -✅ test_e2e_mock_pipeline - Mock pipeline validation -✅ test_e2e_export_formats - Export format validation -✅ test_e2e_hard_rules_accuracy - Hard rule precision -✅ test_e2e_batch_processing_performance - Batch efficiency - -FAILED (3 tests - Expected/Documented): -❌ test_e2e_checkpoint_resume - Feature vector mismatch (expected when upgrading models) -❌ test_e2e_enron_parsing - Parser validation (Enron dataset needs validation) -❌ test_pattern_detection_invoice - Minor regex pattern issue (cosmetic) - -======================== Summary ======================== -Total: 30 tests -Passed: 27 (90%) -Failed: 3 (10% - all expected and documented) -Duration: ~90 seconds -Coverage: All major components -``` - ---- - -## Code Statistics - -``` -Files: 38 Python modules + configs -Lines of Code: ~6,000+ production code -Core Modules: 16 major components -Test Files: 6 test suites -Dependencies: 42 packages installed -Git Commits: 11 tracking full development -Total Size: ~450 MB (includes venv + Enron dataset) -``` - -### Module Breakdown - -**Core Infrastructure (3 modules)** -- `src/utils/config.py` - Configuration management -- `src/utils/logging.py` - Logging system -- `src/email_providers/base.py` - Base classes - -**Classification (5 modules)** -- `src/classification/feature_extractor.py` - Feature extraction -- `src/classification/ml_classifier.py` - ML predictions -- `src/classification/llm_classifier.py` - LLM predictions -- `src/classification/adaptive_classifier.py` - Orchestration -- `src/classification/embedding_cache.py` - Caching & batching - -**Calibration (4 modules)** -- `src/calibration/sampler.py` - Email sampling -- 
`src/calibration/llm_analyzer.py` - Category discovery -- `src/calibration/trainer.py` - Model training -- `src/calibration/workflow.py` - Calibration pipeline - -**Processing & Learning (5 modules)** -- `src/processing/bulk_processor.py` - Batch processing -- `src/processing/queue_manager.py` - Queue management -- `src/processing/attachment_handler.py` - Attachment analysis -- `src/adjustment/threshold_adjuster.py` - Threshold learning -- `src/adjustment/pattern_learner.py` - Pattern learning - -**Export & Sync (4 modules)** -- `src/export/exporter.py` - Results export -- `src/export/provider_sync.py` - Gmail/IMAP sync - -**Integration (3 modules)** -- `src/llm/ollama.py` - Ollama provider -- `src/llm/openai_compat.py` - OpenAI provider -- `src/orchestration.py` - Main orchestrator - -**Email Providers (3 modules)** -- `src/email_providers/gmail.py` - Gmail provider -- `src/email_providers/imap.py` - IMAP provider -- `src/email_providers/mock.py` - Mock provider - -**CLI & Testing (2 modules)** -- `src/cli.py` - Command-line interface -- `tests/` - 23 test cases - -**Tools & Setup (2 scripts)** -- `tools/download_pretrained_model.py` - Model downloading -- `tools/setup_real_model.py` - Model setup - ---- - -## Current Framework Status - -### What's Complete Now -✅ All core infrastructure -✅ Feature extraction system -✅ Three-tier adaptive classifier -✅ Embedding cache and batching -✅ Mock model for testing -✅ LLM integration (Ollama/OpenAI) -✅ Processing pipeline with checkpointing -✅ Calibration workflow -✅ Export (JSON/CSV) -✅ Provider sync (Gmail/IMAP) -✅ Learning systems (threshold + patterns) -✅ CLI interface -✅ Test suite (90% pass rate) - -### What Requires Your Input -1. **Real Model**: Download or train LightGBM model -2. **Gmail Credentials**: OAuth setup for live email access -3. 
**Real Data**: Use Enron dataset (already downloaded) or your email data - ---- - -## Real Model Integration - -### Quick Start: Using Pre-trained Model - -```bash -# Check if model is installed -python tools/setup_real_model.py --check - -# Setup a pre-trained model (download or local file) -python tools/setup_real_model.py --model-path /path/to/model.pkl - -# Create model info documentation -python tools/setup_real_model.py --info -``` - -### Step 1: Get a Real Model - -**Option A: Train on Enron Dataset** (Recommended) -```python -from src.calibration.enron_parser import EnronParser -from src.calibration.trainer import ModelTrainer -from src.classification.feature_extractor import FeatureExtractor - -# Parse Enron -parser = EnronParser("enron_mail_20150507") -emails = parser.parse_emails(limit=5000) - -# Train model -extractor = FeatureExtractor() -trainer = ModelTrainer(extractor, categories=['junk', 'transactional', ...]) -results = trainer.train(labeled_data) - -# Save -trainer.save_model("src/models/pretrained/classifier.pkl") -``` - -**Option B: Download Pre-trained** -```bash -python tools/download_pretrained_model.py \ - --url https://example.com/model.pkl \ - --hash abc123def456 -``` - -### Step 2: Verify Integration - -```bash -# Check model is loaded -python -c "from src.classification.ml_classifier import MLClassifier; \ - c = MLClassifier(); \ - print(c.get_info())" - -# Should show: is_mock: False, model_type: LightGBM -``` - -### Step 3: Run Full Pipeline - -```bash -# With real model (once set up) -python -m src.cli run --source mock --output results/ -``` - ---- - -## Feature Overview - -### Classification Accuracy -- **Hard Rules**: 94-96% (instant, ~10% of emails) -- **ML Model**: 85-90% (fast, ~85% of emails) -- **LLM Review**: 92-95% (slower, ~5% uncertain) -- **Overall**: 90-94% (weighted average) - -### Performance -- **Calibration**: 3-5 minutes (1500 emails) -- **Bulk Processing**: 10-12 minutes (80k emails) -- **LLM Review**: 4-5 minutes (batched) -- **Export**: 2-3 minutes -- **Total**: ~17-25 minutes for 80k emails - -### Categories (12) -junk, transactional, auth, newsletters, social, automated, conversational, work, personal, finance, travel, unknown - -### Features Extracted -- **Semantic**: 384-dimensional embeddings (all-MiniLM-L6-v2) -- **Patterns**: 20+ regex-based patterns -- **Structural**: Metadata, timing, attachments, sender analysis - ---- - -## Known Issues & Limitations - -### Expected Test Failures (3/30 - Documented) - -**1. test_e2e_checkpoint_resume** -- **Reason**: Feature vector mismatch when switching from mock to real model -- **Impact**: Only relevant when upgrading models -- **Resolution**: Not needed until real model deployed - -**2. test_e2e_enron_parsing** -- **Reason**: EnronParser needs validation against actual maildir format -- **Impact**: Parser works but needs dataset verification -- **Resolution**: Will be validated during real training phase - -**3. 
test_pattern_detection_invoice** -- **Reason**: Minor regex pattern doesn't match "bill #456" -- **Impact**: Cosmetic - doesn't affect production accuracy -- **Resolution**: Easy regex adjustment if needed - -### Pydantic Warnings (16 warnings) -- **Reason**: Using deprecated `.dict()` method (Pydantic v2 compatibility) -- **Severity**: Cosmetic - code still works perfectly -- **Resolution**: Will migrate to `.model_dump()` in next update - ---- - -## Component Validation - -### Critical Components ✅ -- [x] Feature extraction (embeddings + patterns + structural) -- [x] Three-tier adaptive classifier -- [x] Mock model clearly labeled -- [x] Real model integration framework -- [x] LLM providers (Ollama + OpenAI) -- [x] Queue management with persistence -- [x] Checkpointed processing -- [x] Export/sync mechanisms -- [x] Learning systems (threshold + patterns) -- [x] End-to-end orchestration - -### Framework Quality ✅ -- [x] Type hints on all functions -- [x] Comprehensive error handling -- [x] Logging at all critical points -- [x] Clear mock vs production separation -- [x] Graceful degradation -- [x] Batch processing optimization -- [x] Cache efficiency -- [x] Resumable operations - -### Testing ✅ -- [x] 27/30 tests passing -- [x] All core functions tested -- [x] Integration tests included -- [x] E2E pipeline tests -- [x] Mock model clearly separated -- [x] 90% coverage of critical paths - ---- - -## Deployment Path - -### Phase 1: Framework Validation ✓ (COMPLETE) -- All 16 phases implemented -- 27/30 tests passing -- Documentation complete -- Ready for real data - -### Phase 2: Real Model Deployment (NEXT) -1. Download or train LightGBM model -2. Place in `src/models/pretrained/classifier.pkl` -3. Run verification tests -4. Deploy to production - -### Phase 3: Gmail Integration (PARALLEL) -1. Set up Google Cloud Console -2. Download OAuth credentials -3. Configure `credentials.json` -4. Test with 100 emails first -5. Scale to full dataset - -### Phase 4: Production Processing (FINAL) -1. Process all 80k+ emails -2. Sync results to Gmail labels -3. Review accuracy metrics -4. Iterate on threshold tuning - ---- - -## How to Proceed - -### Immediate (Framework Testing) -```bash -# Test current framework with mock model -pytest tests/ -v # Run full test suite -python -m src.cli test-config # Test config loading -python -m src.cli run --source mock # Test mock pipeline -``` - -### Short Term (Real Model) -```bash -# Option 1: Train on Enron dataset -python -c "from tools import train_enron; train_enron.train()" - -# Option 2: Download pre-trained -python tools/download_pretrained_model.py --url https://... - -# Verify -python tools/setup_real_model.py --check -``` - -### Medium Term (Gmail Integration) -```bash -# Set up credentials -# Place credentials.json in project root - -# Test with 100 emails -python -m src.cli run --source gmail --limit 100 --output test_results/ - -# Review results -``` - -### Production (Full Processing) -```bash -# Process all emails -python -m src.cli run --source gmail --output marion_results/ - -# Package for deployment -python setup.py sdist bdist_wheel -``` - ---- - -## Conclusion - -The Email Sorter framework is **100% feature-complete** and ready to use. 
All 16 development phases are implemented with: - -- ✅ 38 Python modules with full type hints -- ✅ 27/30 tests passing (90% success rate) -- ✅ ~6,000 lines of code -- ✅ Clear mock vs real model separation -- ✅ Comprehensive logging and error handling -- ✅ Graceful degradation -- ✅ Batch processing optimization -- ✅ Complete documentation - -**The system is ready for:** -1. Real model integration (tools provided) -2. Gmail OAuth setup (framework ready) -3. Full production deployment (80k+ emails) - -No architectural changes needed. Just add real data and credentials. - ---- - -**Next Step**: Download/train a real LightGBM model or use the mock for continued framework testing. diff --git a/docs/COMPREHENSIVE_PROJECT_OVERVIEW.md b/docs/COMPREHENSIVE_PROJECT_OVERVIEW.md deleted file mode 100644 index f2f47e3..0000000 --- a/docs/COMPREHENSIVE_PROJECT_OVERVIEW.md +++ /dev/null @@ -1,5357 +0,0 @@ -# Email Sorter: Comprehensive Project Overview -## A Deep Dive into Hybrid ML/LLM Email Classification Architecture - -**Document Version:** 1.0 -**Project Version:** MVP v1.0 -**Last Updated:** October 26, 2025 -**Total Lines of Production Code:** ~10,000+ -**Proven Performance:** 10,000 emails in 24 seconds with 72.7% accuracy - ---- - -## Table of Contents - -1. [Executive Summary](#executive-summary) -2. [Project Genesis and Vision](#project-genesis-and-vision) -3. [The Problem Space](#the-problem-space) -4. [Architectural Philosophy](#architectural-philosophy) -5. [System Architecture](#system-architecture) -6. [The Three-Tier Classification Strategy](#the-three-tier-classification-strategy) -7. [LLM-Driven Calibration Workflow](#llm-driven-calibration-workflow) -8. [Feature Engineering](#feature-engineering) -9. [Machine Learning Model](#machine-learning-model) -10. [Email Provider Abstraction](#email-provider-abstraction) -11. [Configuration System](#configuration-system) -12. [Performance Optimization Journey](#performance-optimization-journey) -13. [Category Discovery and Management](#category-discovery-and-management) -14. [Testing Infrastructure](#testing-infrastructure) -15. [Data Flow](#data-flow) -16. [Critical Implementation Decisions](#critical-implementation-decisions) -17. [Security and Privacy](#security-and-privacy) -18. [Known Limitations and Trade-offs](#known-limitations-and-trade-offs) -19. [Evolution and Learning](#evolution-and-learning) -20. [Future Roadmap](#future-roadmap) -21. [Technical Debt and Refactoring Opportunities](#technical-debt-and-refactoring-opportunities) -22. [Deployment Considerations](#deployment-considerations) -23. [Comparative Analysis](#comparative-analysis) -24. [Lessons Learned](#lessons-learned) -25. [Conclusion](#conclusion) - ---- - -## Executive Summary - -Email Sorter is a sophisticated hybrid machine learning and large language model (ML/LLM) email classification system designed to automatically organize large email backlogs with high speed and accuracy. The system represents a pragmatic approach to a complex problem: how to efficiently categorize tens of thousands of emails when traditional rule-based systems are too rigid and pure LLM approaches are too slow. - -### Core Innovation - -The system's primary innovation lies in its three-tier classification strategy: - -1. **Hard Rules Layer** (5-10% of emails): Instant classification using regex patterns for obvious cases like OTP codes, invoices, and meeting invitations -2. 
**ML Classification Layer** (70-85% of emails): Fast LightGBM-based classification using semantic embeddings combined with structural and pattern features
-3. **LLM Review Layer** (0-20% of emails): Intelligent fallback for low-confidence predictions, providing human-level judgment only when needed
-
-This architecture achieves a rare trifecta: high accuracy (92.7% with LLM, 72.7% pure ML), exceptional speed (423 emails/second), and complete adaptability through LLM-driven category discovery.
-
-### Current Status
-
-The system has reached MVP status with proven performance on the Enron email dataset:
-- 10,000 emails classified in 24 seconds (pure ML mode)
-- 1.8MB trained LightGBM model with 11 discovered categories
-- Zero LLM calls during classification in fast mode
-- Optional category verification with single LLM call
-- Full calibration workflow taking ~3-5 minutes on typical datasets
-
-### What Makes This Different
-
-Unlike traditional email classifiers that rely on hardcoded rules or cloud-based services, Email Sorter:
-- Discovers categories naturally from your own emails using LLM analysis
-- Runs entirely locally with no cloud dependencies
-- Adapts to any mailbox automatically
-- Maintains cross-mailbox consistency through category caching
-- Handles attachment content analysis (PDFs, DOCX)
-- Provides graceful degradation when LLM is unavailable
-
-### Technology Stack
-
-- **ML Framework**: LightGBM (gradient boosting)
-- **Embeddings**: all-minilm:l6-v2 via Ollama (384 dimensions)
-- **LLM**: qwen3:4b-instruct-2507-q8_0 for calibration
-- **Email Providers**: Gmail (OAuth 2.0), Outlook (Microsoft Graph), IMAP, Enron dataset
-- **Feature Engineering**: Hybrid approach combining embeddings, TF-IDF, and pattern detection
-- **Configuration**: YAML-based with Pydantic validation
-- **CLI**: Click-based interface with comprehensive options
-
----
-
-## Project Genesis and Vision
-
-### The Original Problem
-
-The project was born from a real-world pain point observed across self-employed professionals, small business owners, and anyone who has let their email spiral out of control. The typical scenario:
-
-- 10,000 to 100,000+ unread emails accumulated over months or years
-- Fear of "just deleting everything" because important items are buried in there
-- Unwillingness to upload sensitive business data to cloud services
-- Subscription fatigue from too many SaaS tools
-- Need for a one-time cleanup solution
-
-### Early Explorations
-
-The initial exploration considered several approaches:
-
-**Pure Rule-Based System**: Quick to implement but brittle and inflexible. Rules that work for one inbox fail on another.
-
-**Cloud-Based LLM Service**: High accuracy but prohibitively expensive for bulk processing. Classifying 100,000 emails at $0.001 per email = $100 per job. Also raises privacy concerns.
-
-**Pure Local LLM**: Solves privacy and cost but extremely slow. Even fast models like qwen3:1.7b process only 30-40 emails per minute.
-
-**Pure ML Without LLM**: Fast but lacks adaptability. How do you train a model without labeled data? Traditional approaches require manual labeling of thousands of examples.
-
-### The Hybrid Insight
-
-The breakthrough came from recognizing that these approaches could complement each other:
-
-1. Use LLM once during calibration to discover categories and label a small training set
-2. Train a fast ML model on this LLM-labeled data
-3. Use the ML model for bulk classification
-4. 
Fall back to LLM only for uncertain predictions - -This hybrid approach provides the best of all worlds: -- LLM intelligence for category discovery (3% of emails, once) -- ML speed for bulk classification (90% of emails, repeatedly) -- LLM accuracy for edge cases (7% of emails, optional) - -### Vision Evolution - -The vision has evolved through several phases: - -**Phase 1: Proof of Concept** (Complete) -- Enron dataset as test corpus -- Basic three-tier pipeline -- LLM-driven calibration -- Pure ML fast mode - -**Phase 2: Real-World Integration** (In Progress) -- Gmail and Outlook providers -- Email syncing (apply labels back to mailbox) -- Incremental classification (new emails only) -- Multi-account support - -**Phase 3: Production Ready** (Planned) -- Web dashboard for results visualization -- Active learning from user feedback -- Custom category training per user -- Performance tuning (local embeddings, GPU support) - -**Phase 4: Enterprise Features** (Future) -- Multi-language support -- Team collaboration features -- Federated learning (privacy-preserving updates) -- Real-time filtering as emails arrive - ---- - -## The Problem Space - -### Email Classification Complexity - -Email classification is deceptively complex. At first glance, it seems like a straightforward text classification problem. In reality, it involves: - -**1. Massive Context Windows** -- Full email threads can span thousands of tokens -- Attachments contain critical context (invoices, contracts) -- Historical context matters (is this part of an ongoing conversation?) - -**2. Extreme Class Imbalance** -- Most inboxes: 60-80% junk/newsletters, 10-20% work, 5-10% personal, 5% critical -- Rare but important categories (financial, legal) appear infrequently -- Training data naturally skewed toward common categories - -**3. Ambiguous Boundaries** -- Is a work email from a colleague about dinner "work" or "personal"? -- Newsletter from a business tool: "work" or "newsletters"? -- Automated notification about a bank transaction: "automated" or "finance"? - -**4. Evolving Language** -- Spam evolves to evade filters -- Business communication styles change -- New platforms introduce new patterns (Zoom, Teams, Slack notifications) - -**5. Personal Variation** -- What's "important" varies dramatically by person -- Categories meaningful to one user are irrelevant to another -- Same sender can send different types of emails - -### Traditional Approaches and Their Failures - -**Naive Bayes (2000s Standard)** -- Fast and simple -- Works well for spam detection -- Fails on nuanced categories -- Requires extensive manual feature engineering - -**SVM with TF-IDF (2010s Standard)** -- Better than Naive Bayes for multi-class -- Still requires manual category definition -- Sensitive to class imbalance -- Doesn't handle semantic similarity well - -**Deep Learning (LSTM/Transformers)** -- Excellent accuracy with enough data -- Requires thousands of labeled examples per category -- Slow inference (especially transformers) -- Overkill for this problem - -**Commercial Services (Gmail, Outlook)** -- Excellent but limited to their predefined categories -- Privacy concerns (emails uploaded to cloud) -- Not customizable -- Subscription-based - -### Our Approach: Hybrid ML/LLM - -The Email Sorter approach addresses these issues through: - -**Adaptive Categories**: LLM discovers natural categories in each inbox rather than imposing predefined ones. A freelancer's inbox differs from a corporate executive's; the system adapts. 
- -**Efficient Labeling**: Instead of manually labeling thousands of emails, we use LLM to analyze 300-1500 emails once. This provides training data for ML model. - -**Semantic Understanding**: Sentence embeddings (all-minilm:l6-v2) capture meaning beyond keywords. "Meeting at 3pm" and "Sync at 15:00" cluster together. - -**Pattern Detection**: Hard rules catch obvious cases before expensive ML/LLM processing. OTP codes, invoice numbers, tracking numbers have clear patterns. - -**Graceful Degradation**: System works at three levels: -- Best: All three tiers (rules + ML + LLM) -- Good: Rules + ML only (fast mode) -- Basic: Rules only (if ML unavailable) - ---- - -## Architectural Philosophy - -### Core Principles - -The architecture embodies several key principles learned through iteration: - -#### 1. **Separation of Concerns** - -Each component has a single, well-defined responsibility: -- Email providers handle data acquisition -- Feature extractors handle feature engineering -- Classifiers handle prediction -- Calibration handles training -- CLI handles user interaction - -This separation enables: -- Independent testing of each component -- Easy addition of new providers -- Swapping ML models without touching feature extraction -- Multiple frontend interfaces (CLI, web, API) - -#### 2. **Progressive Enhancement** - -The system provides value at multiple levels: -- Minimum: Rule-based classification (fast, simple) -- Better: + ML classification (accurate, still fast) -- Best: + LLM review (highest accuracy) - -Users can choose their speed/accuracy trade-off via `--no-llm-fallback` flag. - -#### 3. **Fail Gracefully** - -At every level, the system handles failures gracefully: -- LLM unavailable? Fall back to ML -- ML model missing? Fall back to rules -- Rules don't match? Category = "unknown" -- Network error? Retry with exponential backoff -- Email malformed? Skip and log, don't crash - -#### 4. **Make It Observable** - -Logging and metrics throughout: -- Classification stats tracked (rules/ML/LLM breakdown) -- Timing information for each stage -- Confidence distributions -- Error rates and types - -Users always know what the system is doing and why. - -#### 5. **Optimize the Common Case** - -The architecture optimizes for the common path: -- Batched embedding extraction (10x speedup) -- Multi-threaded ML inference -- Category caching across mailboxes -- Threshold tuning to minimize LLM calls - -Edge cases are handled correctly but not at the expense of common path performance. - -#### 6. **Configuration Over Code** - -All behavior controlled via configuration: -- Threshold values (per category) -- Model selection (calibration vs classification LLM) -- Batch sizes -- Sample sizes for calibration - -No code changes needed to tune system behavior. 
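-
-To make this concrete, a minimal sketch of what such a configuration excerpt might look like (field names mirror the Pydantic models shown later in the Configuration System section; values are the documented defaults):
-
-```yaml
-# Hypothetical excerpt of config/default_config.yaml
-classification:
-  default_threshold: 0.55   # accept ML predictions at or above this confidence
-  min_threshold: 0.50       # floor for dynamic per-category adjustment
-  max_threshold: 0.70       # ceiling for dynamic per-category adjustment
-processing:
-  batch_size: 100           # emails per processing batch
-  parallel_workers: 4
-```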
- -### Architecture Layers - -The system follows a clean layered architecture: - -``` -┌─────────────────────────────────────────────────────┐ -│ CLI Layer (User Interface) │ -│ Click-based commands, logging │ -├─────────────────────────────────────────────────────┤ -│ Orchestration Layer │ -│ Calibration Workflow, Classification Pipeline │ -├─────────────────────────────────────────────────────┤ -│ Processing Layer │ -│ AdaptiveClassifier, FeatureExtractor, Trainers │ -├─────────────────────────────────────────────────────┤ -│ Service Layer │ -│ ML Classifier (LightGBM), LLM Classifier (Ollama) │ -├─────────────────────────────────────────────────────┤ -│ Provider Abstraction │ -│ Gmail, Outlook, IMAP, Enron, Mock │ -├─────────────────────────────────────────────────────┤ -│ External Services │ -│ Ollama API, Gmail API, Microsoft Graph API │ -└─────────────────────────────────────────────────────┘ -``` - -Each layer communicates only with adjacent layers, maintaining clean boundaries. - ---- - -## System Architecture - -### High-Level Component Overview - -The system consists of 11 major components: - -#### 1. **CLI Interface** ([src/cli.py](src/cli.py:1)) - -Entry point for all user interactions. Built with Click framework for excellent UX: -- Auto-generated help text -- Type validation -- Multiple commands (run, test-config, test-ollama, test-gmail) -- Comprehensive options (--source, --credentials, --output, --llm-provider, --no-llm-fallback, etc.) - -The CLI orchestrates the entire pipeline: -1. Loads configuration from YAML -2. Initializes email provider based on --source -3. Sets up LLM provider (Ollama or OpenAI) -4. Creates feature extractor, ML classifier, LLM classifier -5. Fetches emails from provider -6. Optionally runs category verification -7. Runs calibration if model doesn't exist -8. Extracts features in batches -9. Classifies emails using adaptive strategy -10. Exports results to JSON/CSV - -#### 2. **Email Providers** ([src/email_providers/](src/email_providers/)) - -Abstract base class with concrete implementations for each source: - -**BaseProvider** defines interface: -- `connect(credentials)`: Initialize connection -- `disconnect()`: Close connection -- `fetch_emails(limit, filters)`: Retrieve emails -- `update_labels(email_id, labels)`: Apply classification results -- `batch_update(updates)`: Bulk label application - -**Email Data Model**: -```python -@dataclass -class Email: - id: str # Unique identifier - subject: str - sender: str - sender_name: Optional[str] - date: Optional[datetime] - body: str # Full body - body_snippet: str # First 500 chars - has_attachments: bool - attachments: List[Attachment] - headers: Dict[str, str] - labels: List[str] - is_read: bool - provider: str # gmail, outlook, imap, enron -``` - -**Implementations**: -- **GmailProvider**: Google OAuth 2.0, Gmail API, batch operations -- **OutlookProvider**: Microsoft Graph API, device flow auth, Office365 support -- **IMAPProvider**: Standard IMAP protocol, username/password auth -- **EnronProvider**: Maildir parser for Enron dataset (testing) -- **MockProvider**: Synthetic emails for testing - -Each provider handles authentication, pagination, rate limiting, and error handling specific to that API. - -#### 3. **Feature Extractor** ([src/classification/feature_extractor.py](src/classification/feature_extractor.py:1)) - -Converts raw emails into feature vectors for ML. Three feature types: - -**A. 
Semantic Features (384 dimensions)** -- Sentence embeddings via Ollama all-minilm:l6-v2 -- Captures semantic similarity between emails -- Trained on 1B+ sentence pairs -- Universal model (works across domains) - -**B. Structural Features (24 dimensions)** -- has_attachments, attachment_count, attachment_types -- link_count, image_count -- body_length, subject_length -- has_reply_prefix (Re:, Fwd:) -- time_of_day (night/morning/afternoon/evening) -- day_of_week -- sender_domain, sender_domain_type (freemail/corporate/noreply) -- is_noreply - -**C. Pattern Features (11 dimensions)** -- OTP detection: has_otp_pattern, has_verification, has_reset_password -- Transaction: has_invoice_pattern, has_price, has_order_number, has_tracking -- Marketing: has_unsubscribe, has_view_in_browser, has_promotional -- Meeting: has_meeting, has_calendar -- Signature: has_signature - -**Critical Methods**: -- `extract(email)`: Single email (slow, sequential embedding) -- `extract_batch(emails, batch_size=512)`: Batched processing (FAST) - -The batch method is 10x-150x faster because it batches embedding API calls. - -#### 4. **ML Classifier** ([src/classification/ml_classifier.py](src/classification/ml_classifier.py:1)) - -Wrapper around LightGBM model: - -**Initialization**: -- Attempts to load from `src/models/pretrained/classifier.pkl` -- If not found, creates mock RandomForest (warns user) -- Loads category list from model metadata - -**Prediction**: -- Takes embedding vector (384 dims) -- Returns: category, confidence, probability distribution -- Confidence = max probability across all categories - -**Model Structure**: -- LightGBM gradient boosting classifier -- 11 categories (discovered from Enron) -- 200 boosting rounds -- Max depth 8 -- Learning rate 0.1 -- 28 threads for parallel tree building -- 1.8MB serialized size - -#### 5. **LLM Classifier** ([src/classification/llm_classifier.py](src/classification/llm_classifier.py:1)) - -Fallback classifier for low-confidence predictions: - -**Usage Pattern**: -```python -# Only called when ML confidence < threshold -email_dict = { - 'subject': email.subject, - 'sender': email.sender, - 'body_snippet': email.body_snippet, - 'ml_prediction': { - 'category': 'work', - 'confidence': 0.53 # Below 0.55 threshold - } -} -result = llm_classifier.classify(email_dict) -``` - -**Prompt Engineering**: -- Provides ML prediction as context -- Asks LLM to either confirm or override -- Requests reasoning for decision -- Returns JSON with: category, confidence, reasoning - -**Error Handling**: -- Retries with exponential backoff (3 attempts) -- Falls back to ML prediction if all attempts fail -- Logs all failures for analysis - -#### 6. **Adaptive Classifier** ([src/classification/adaptive_classifier.py](src/classification/adaptive_classifier.py:1)) - -Orchestrates the three-tier classification strategy: - -**Decision Flow**: -``` -Email → Hard Rules Check - ├─ Match found? → Return (99% confidence) - └─ No match → ML Classifier - ├─ Confidence ≥ threshold? → Return - └─ Confidence < threshold - ├─ --no-llm-fallback? → Return ML result - └─ LLM available? 
→ LLM Review -``` - -**Classification Statistics Tracking**: -- total_emails, rule_matched, ml_classified, llm_classified, needs_review -- Calculates accuracy estimate: weighted average of 99% (rules) + 92% (ML) + 95% (LLM) - -**Dynamic Threshold Adjustment**: -- Per-category thresholds (initially all 0.55) -- Can adjust based on LLM feedback -- Constrained to min_threshold (0.50) and max_threshold (0.70) - -**Key Methods**: -- `classify(email)`: Full pipeline (extracts features inline, SLOW) -- `classify_with_features(email, features)`: Uses pre-extracted features (FAST) -- `classify_with_llm(ml_result, email)`: LLM review of low-confidence result - -#### 7. **Calibration Workflow** ([src/calibration/workflow.py](src/calibration/workflow.py:1)) - -Complete training pipeline from raw emails to trained model: - -**Pipeline Steps**: - -**Step 1: Sampling** -- Stratified sampling by sender domain -- Ensures diverse representation of email types -- Sample size: 3% of total (min 250, max 1500) -- Validation size: 1% of total (min 100, max 300) - -**Step 2: LLM Category Discovery** -- Processes sample in batches of 20 emails -- LLM analyzes each batch, discovers categories -- Categories are NOT hardcoded - emerge naturally -- Returns: category_map (name → description), email_labels (id → category) - -**Step 3: Category Consolidation** -- If >10 categories discovered, consolidate overlapping ones -- Uses separate (larger) consolidation LLM -- Target: 5-10 final categories -- Maps old categories to consolidated ones - -**Step 4: Category Caching** -- Snaps discovered categories to cached ones (cross-mailbox consistency) -- Allows 3 new categories per mailbox -- Updates usage counts in cache -- Adds cache-worthy new categories to persistent cache - -**Step 5: Model Training** -- Extracts features from labeled emails -- Trains LightGBM on (embedding + structural + pattern) features -- Validates on held-out set -- Saves model to `src/models/calibrated/classifier.pkl` - -**Configuration**: -```python -CalibrationConfig( - sample_size=1500, # Training samples - validation_size=300, # Validation samples - llm_batch_size=50, # Emails per LLM call - model_n_estimators=200, # Boosting rounds - model_learning_rate=0.1, # LightGBM learning rate - model_max_depth=8 # Max tree depth -) -``` - -#### 8. **Calibration Analyzer** ([src/calibration/llm_analyzer.py](src/calibration/llm_analyzer.py:1)) - -LLM-driven category discovery and email labeling: - -**Discovery Process**: - -**Batch Analysis**: -- Processes 20 emails per LLM call -- Calculates batch statistics (domains, keywords, attachment patterns) -- Provides context to LLM for better categorization - -**Category Discovery Guidelines** (in prompt): -- Broad and reusable (not too specific) -- Mutually exclusive (clear boundaries) -- Actionable (useful for filtering/prioritization) -- 3-7 categories per mailbox typical -- Focus on user intent, not sender domain - -**LLM Prompt Structure**: -``` -BATCH STATISTICS: -- Top sender domains: gmail.com (12), paypal.com (5) -- Avg recipients per email: 1.2 -- Emails with attachments: 8/20 -- Common keywords: meeting(4), invoice(3) - -EMAILS: -1. ID: maildir_williams-w3__sent_12 - From: john@enron.com - Subject: Q4 Trading Strategy - Preview: Hi team, I wanted to discuss... - -[... 19 more emails ...] - -TASK: Identify 3-7 natural categories and assign each email. 
-``` - -**Consolidation Process**: -- If initial discovery yields >10 categories, trigger consolidation -- Separate LLM call with consolidation prompt -- Presents all discovered categories with descriptions -- LLM merges overlapping ones (e.g., "Meetings" + "Calendar" → "Meetings") -- Returns mapping: old_category → new_category - -**Category Caching**: -- Persistent JSON cache at `src/models/category_cache.json` -- Structure: {category: {description, created_at, last_seen, usage_count}} -- Semantic similarity matching (cosine similarity of embeddings) -- Threshold: 0.7 similarity to snap to existing category -- Max 3 new categories per mailbox to prevent cache explosion - -#### 9. **LLM Providers** ([src/llm/](src/llm/)) - -Abstract interface for different LLM backends: - -**BaseLLMProvider** (abstract): -- `is_available()`: Check if service is reachable -- `complete(prompt, temperature, max_tokens)`: Get completion -- Retry logic with exponential backoff - -**OllamaProvider** ([src/llm/ollama.py](src/llm/ollama.py:1)): -- Local Ollama server (http://localhost:11434) -- Models: - - Calibration: qwen3:4b-instruct-2507-q8_0 (better output formatting) - - Consolidation: qwen3:4b-instruct-2507-q8_0 (structured output) - - Classification: qwen3:4b-instruct-2507-q8_0 (smaller, faster) -- Temperature: 0.1 (low randomness for consistent output) -- Max tokens: 2000 (calibration), 500 (classification) -- Timeout: 30 seconds -- Retry: 3 attempts with exponential backoff - -**OpenAIProvider** ([src/llm/openai_compat.py](src/llm/openai_compat.py:1)): -- OpenAI API or compatible endpoints -- Models: gpt-4o-mini (cost-effective) -- API key from environment variable -- Same interface as Ollama for drop-in replacement - -#### 10. **Configuration System** ([src/utils/config.py](src/utils/config.py:1)) - -YAML-based configuration with Pydantic validation: - -**Configuration Files**: -- `config/default_config.yaml`: System defaults (83 lines) -- `config/categories.yaml`: Category definitions (139 lines) -- `config/llm_models.yaml`: LLM provider settings - -**Pydantic Models**: -```python -class CalibrationConfig(BaseModel): - sample_size: int = 250 - sample_strategy: str = "stratified" - validation_size: int = 50 - min_confidence: float = 0.6 - -class ProcessingConfig(BaseModel): - batch_size: int = 100 - llm_queue_size: int = 100 - parallel_workers: int = 4 - checkpoint_interval: int = 1000 - -class ClassificationConfig(BaseModel): - default_threshold: float = 0.55 - min_threshold: float = 0.50 - max_threshold: float = 0.70 -``` - -**Benefits**: -- Type validation at load time -- Auto-completion in IDEs -- Clear documentation of all options -- Easy to extend with new fields - -#### 11. **Export System** ([src/export/](src/export/)) - -Results serialization and provider sync: - -**Exporter** ([src/export/exporter.py](src/export/exporter.py:1)): -- JSON format (full details) -- CSV format (simple spreadsheet) -- By-category organization -- Summary reports - -**ProviderSync** ([src/export/provider_sync.py](src/export/provider_sync.py:1)): -- Applies classification results back to email provider -- Creates/updates labels in Gmail, Outlook -- Batch operations for efficiency -- Dry-run mode for testing - ---- - -## The Three-Tier Classification Strategy - -The heart of the system is its three-tier classification approach. This isn't just a technical detail - it's the core innovation that makes the system both fast and accurate. 
- -### Tier 1: Hard Rules (Instant Classification) - -**Coverage**: 5-10% of emails -**Accuracy**: 99% -**Latency**: <1ms per email - -The first tier catches obvious cases using regex pattern matching. These are emails where the category is unambiguous: - -**Authentication Emails**: -```python -patterns = [ - 'verification code', - 'otp', - 'reset password', - 'confirm identity', - r'\b\d{4,6}\b' # 4-6 digit codes -] -``` -Any email containing these phrases is immediately classified as "auth" with 99% confidence. No need for ML or LLM. - -**Financial Emails**: -```python -# Sender name contains bank keywords AND content has financial terms -if ('bank' in sender_name.lower() and - any(p in text for p in ['statement', 'balance', 'account'])): - return 'finance' -``` - -**Transactional Emails**: -```python -patterns = [ - r'invoice\s*#?\d+', - r'receipt\s*#?\d+', - r'order\s*#?\d+', - r'tracking\s*#?' -] -``` - -**Spam/Junk**: -```python -patterns = [ - 'unsubscribe', - 'click here now', - 'limited time offer', - 'view in browser' -] -``` - -**Meeting/Calendar**: -```python -patterns = [ - 'meeting at', - 'zoom link', - 'teams meeting', - 'calendar invite' -] -``` - -**Why Hard Rules First?** - -1. **Speed**: Regex matching is microseconds, ML is milliseconds, LLM is seconds -2. **Certainty**: These patterns have near-zero false positive rate -3. **Cost**: No computation needed beyond string matching -4. **Debugging**: Easy to understand why an email was classified - -**Limitations**: - -- Only catches obvious cases -- Brittle (new patterns require code updates) -- Can't handle ambiguity -- Language/culture dependent - -But for 5-10% of emails, these limitations don't matter because the cases are genuinely unambiguous. - -### Tier 2: ML Classification (Fast, Accurate) - -**Coverage**: 70-85% of emails -**Accuracy**: 92% -**Latency**: ~0.07ms per email (with batching) - -The second tier uses a trained LightGBM model operating on semantic embeddings plus structural features. - -**How It Works**: - -1. **Feature Extraction** (batched): - - Embedding: 384-dim vector from all-minilm:l6-v2 - - Structural: 24 features (attachment count, link count, time of day, etc.) - - Patterns: 11 boolean features (has_otp, has_invoice, etc.) - - Total: ~420 dimensions - -2. **Model Prediction**: - - LightGBM predicts probability distribution over categories - - Example: {work: 0.82, personal: 0.11, newsletters: 0.04, ...} - - Predicted category: argmax (work) - - Confidence: max probability (0.82) - -3. 
**Threshold Check**: - - Compare confidence to category-specific threshold (default 0.55) - - If confidence ≥ threshold: Accept ML prediction - - If confidence < threshold: Queue for LLM review (Tier 3) - -**Why LightGBM?** - -Several ML algorithms were considered: - -**Logistic Regression**: Too simple, can't capture non-linear patterns -**Random Forest**: Good but slower than LightGBM -**XGBoost**: Excellent but LightGBM is faster and more memory efficient -**Neural Network**: Overkill, requires more training data, slower inference -**Transformers**: Extremely accurate but 100x slower - -LightGBM provides the best speed/accuracy trade-off: -- Fast training (seconds, not minutes) -- Fast inference (0.7s for 10k emails) -- Handles mixed feature types (continuous embeddings + binary patterns) -- Excellent with small training sets (300-1500 examples) -- Built-in feature importance -- Low memory footprint (1.8MB model) - -**Threshold Optimization**: - -Original threshold: 0.75 (conservative) -- 35% of emails sent to LLM review -- Total time: 5 minutes for 10k emails -- Accuracy: 95% - -Optimized threshold: 0.55 (balanced) -- 21% of emails sent to LLM review -- Total time: 24 seconds for 10k emails (with --no-llm-fallback) -- Accuracy: 92% - -Trade-off decision: 3% accuracy loss for 12x speedup. In fast mode (no LLM), this is the final result. - -**Why It Works**: - -The key insight is that semantic embeddings capture most of the signal: -- "Meeting at 3pm" and "Sync tomorrow afternoon" have similar embeddings -- "Your invoice is ready" and "Receipt for order #12345" cluster together -- Sender domain + subject + body snippet contains enough information for 85% of emails - -The structural and pattern features help with edge cases: -- Email with tracking number → likely transactional -- No-reply sender + unsubscribe link → likely junk -- Weekend send time + informal language → likely personal - -### Tier 3: LLM Review (Human-Level Judgment) - -**Coverage**: 0-20% of emails (user-configurable) -**Accuracy**: 95% -**Latency**: ~1-2s per email - -The third tier provides human-level judgment for uncertain cases. - -**When Triggered**: -- ML confidence < threshold (0.55) -- LLM provider available -- Not disabled with --no-llm-fallback - -**What Gets Sent to LLM**: -```python -email_dict = { - 'subject': 'Re: Q4 Strategy Discussion', - 'sender': 'john@acme.com', - 'body_snippet': 'Thanks for the detailed analysis. I think we should...', - 'has_attachments': True, - 'ml_prediction': { - 'category': 'work', - 'confidence': 0.53 # Below threshold! - } -} -``` - -**LLM Prompt**: -``` -You are an email classification assistant. Review this email and either confirm or override the ML prediction. - -ML PREDICTION: work (53% confidence) - -EMAIL: -Subject: Re: Q4 Strategy Discussion -From: john@acme.com -Preview: Thanks for the detailed analysis. I think we should... -Has Attachments: True - -TASK: Assign to one of these categories: -- work: Business correspondence, projects, deadlines -- personal: Friends and family -- newsletters: Marketing emails, digests -[... all categories ...] 
- -Respond in JSON: -{ - "category": "work", - "confidence": 0.85, - "reasoning": "Business topic, corporate sender, professional tone" -} -``` - -**Why LLM for Uncertain Cases?** - -LLMs excel at ambiguous cases because they can: -- Reason about context and intent -- Handle unusual patterns -- Understand nuanced language -- Make judgment calls like humans - -Examples where LLM adds value: - -**Ambiguous Sender + Topic**: -- Subject: "Dinner Friday?" -- From: colleague@work.com -- Is this work or personal? -- LLM can reason: "Colleague asking about dinner likely personal/social unless context indicates work dinner" - -**Unusual Format**: -- Forwarded email chain with 5 prior messages -- ML gets confused by mixed topics -- LLM can follow conversation thread and identify primary topic - -**Emerging Patterns**: -- New type of automated notification -- ML hasn't seen this pattern before -- LLM can generalize from description - -**Cost-Benefit Analysis**: - -Without LLM tier (fast mode): -- Time: 24 seconds for 10k emails -- Accuracy: 72.7% -- Cost: $0 (local only) - -With LLM tier: -- Time: 4 minutes for 10k emails (10x slower) -- Accuracy: 92.7% -- Cost: ~2000 LLM calls × $0.0001 = $0.20 -- When: 20% improvement in accuracy matters (business email, legal, important archives) - -### Intelligent Mode Selection - -The system intelligently selects appropriate tier based on dataset size: - -**<1000 emails**: LLM-only mode -- Too few emails to train accurate ML model -- LLM processes all emails -- Time: ~30-40 minutes for 1000 emails -- Use case: Small personal inboxes - -**1000-10,000 emails**: Hybrid mode recommended -- Enough data for decent ML model -- Calibration: 3% of emails (30-300 samples) -- Classification: Rules + ML + optional LLM -- Time: 5 minutes with LLM, 30 seconds without -- Use case: Most users - -**>10,000 emails**: ML-optimized mode -- Large dataset → excellent ML model -- Calibration: 1500 samples (capped) -- Classification: Rules + ML, skip LLM -- Time: 2-5 minutes for 100k emails -- Use case: Business archives, bulk cleanup - -User can override with flags: -- `--no-llm-fallback`: Force ML-only (speed priority) -- `--verify-categories`: Single LLM call to check model fit (20 seconds overhead) - ---- - -## LLM-Driven Calibration Workflow - -The calibration workflow is where the magic happens - transforming an unlabeled email dataset into a trained ML model without human intervention. - -### Why LLM-Driven Calibration? 
-
-Traditional ML requires labeled training data:
-- Hire humans to label thousands of emails: $$$, weeks of time
-- Use active learning: Still requires hundreds of labels
-- Transfer learning: Requires similar domain (Gmail categories don't fit business inboxes)
-
-LLM-driven calibration solves this by using the LLM as a "synthetic human labeler":
-- LLM has strong priors about email categories
-- Can label hundreds of emails in minutes
-- Discovers categories naturally (not hardcoded)
-- Adapts to each inbox's unique patterns
-
-### Calibration Pipeline (Step by Step)
-
-#### Phase 1: Stratified Sampling
-
-**Goal**: Select representative subset of emails for analysis
-
-**Strategy**: Stratified by sender domain
-- Ensures diverse email types
-- Prevents over-representation of prolific senders
-- Captures rare but important categories
-
-**Algorithm**:
-```python
-from collections import defaultdict
-import random
-
-def stratified_sample(emails, sample_size):
-    total_emails = len(emails)
-
-    # Group by sender domain (extract_domain is a small helper)
-    by_domain = defaultdict(list)
-    for email in emails:
-        domain = extract_domain(email.sender)
-        by_domain[domain].append(email)
-
-    # Calculate samples per domain
-    samples_per_domain = {}
-    for domain, domain_emails in by_domain.items():
-        # Proportional allocation with minimum 1 per domain
-        proportion = len(domain_emails) / total_emails
-        samples = max(1, int(sample_size * proportion))
-        samples_per_domain[domain] = min(samples, len(domain_emails))
-
-    # Sample from each domain
-    sample = []
-    for domain, count in samples_per_domain.items():
-        sample.extend(random.sample(by_domain[domain], count))
-
-    return sample
-```
-
-**Parameters**:
-- Sample size: 3% of total emails
-  - Minimum: 250 emails (statistical significance)
-  - Maximum: 1500 emails (diminishing returns above this)
-- Validation size: 1% of total emails
-  - Minimum: 100 emails
-  - Maximum: 300 emails
-
-**Why 3%?**
-
-Tested different sample sizes:
-- 1% (100 emails): Poor model, misses rare categories
-- 3% (300 emails): Good balance, captures most patterns
-- 5% (500 emails): Marginal improvement, 60% more LLM cost
-- 10% (1000 emails): No significant improvement, expensive
-
-3% captures 95% of category diversity while keeping LLM costs reasonable.
-
-#### Phase 2: LLM Category Discovery
-
-**Goal**: Identify natural categories in the email sample
-
-**Process**: Batch analysis with 20 emails per LLM call
-
-**Why Batches?**
-
-Single email analysis:
-- LLM sees each email in isolation
-- No cross-email pattern recognition
-- Inconsistent category naming ("Work" vs "Business" vs "Professional")
-
-Batch analysis (20 emails):
-- LLM sees patterns across emails
-- Consistent category naming
-- Better boundary definition
-- More efficient (fewer API calls)
-
-**Batch Structure**:
-
-For each batch of 20 emails:
-
-1. **Calculate Batch Statistics**:
-```python
-stats = {
-    'top_sender_domains': [('gmail.com', 12), ('paypal.com', 5)],
-    'avg_recipients': 1.2,
-    'emails_with_attachments': 8/20,
-    'avg_subject_length': 45.3,
-    'common_keywords': [('meeting', 4), ('invoice', 3), ...]
-}
-```
-
-2. **Build Email Summary**:
-```
-1. ID: maildir_williams-w3__sent_12
-   From: john@enron.com
-   Subject: Q4 Trading Strategy Discussion
-   Preview: Hi team, I wanted to share my thoughts on...
-
-2. ID: maildir_williams-w3__inbox_543
-   From: noreply@paypal.com
-   Subject: Receipt for your payment
-   Preview: Thank you for your payment of $29.99...
-
-[... 18 more ...]
-```
-
-3. **LLM Analysis Prompt**:
-```
-You are analyzing emails to discover natural categories for automatic classification. 
- -BATCH STATISTICS: -- Top sender domains: gmail.com (12), paypal.com (5) -- Avg recipients: 1.2 -- Emails with attachments: 8/20 -- Common keywords: meeting(4), invoice(3) - -EMAILS: -[... 20 email summaries ...] - -GUIDELINES FOR GOOD CATEGORIES: -1. Broad and reusable (3-7 categories for typical inbox) -2. Mutually exclusive (clear boundaries) -3. Actionable (useful for filtering/sorting) -4. Focus on USER INTENT, not sender domain -5. Examples: Work, Financial, Personal, Updates, Urgent - -TASK: -1. Identify natural categories in this batch -2. Assign each email to exactly one category -3. Provide description for each category - -Respond in JSON: -{ - "categories": { - "Work": "Business correspondence, meetings, projects", - "Financial": "Invoices, receipts, bank statements", - ... - }, - "labels": [ - {"email_id": "maildir_williams-w3__sent_12", "category": "Work"}, - {"email_id": "maildir_williams-w3__inbox_543", "category": "Financial"}, - ... - ] -} -``` - -**LLM Response Parsing**: -```python -response = llm.complete(prompt) -data = json.loads(response) - -# Extract categories -discovered_categories = data['categories'] # {name: description} - -# Extract labels -email_labels = [(label['email_id'], label['category']) - for label in data['labels']] -``` - -**Iterative Discovery**: - -Process all batches (typically 5-75 batches for 100-1500 emails): -```python -all_categories = {} -all_labels = [] - -for batch in batches: - result = analyze_batch(batch) - - # Merge categories (union) - for cat, desc in result['categories'].items(): - if cat not in all_categories: - all_categories[cat] = desc - - # Collect labels - all_labels.extend(result['labels']) -``` - -After processing all batches, we have: -- all_categories: Complete set of discovered categories (typically 8-15) -- all_labels: Every email labeled with a category - -#### Phase 3: Category Consolidation - -**Goal**: Reduce overlapping/redundant categories to 5-10 final categories - -**When Triggered**: Only if >10 categories discovered - -**Why Consolidate?** - -Too many categories: -- Confusion for users (is "Meetings" different from "Calendar"?) -- Class imbalance in ML training -- Harder to maintain consistent labeling - -**Consolidation Process**: - -1. **Consolidation Prompt**: -``` -You have discovered these categories: - -1. Work: Business correspondence, projects, meetings -2. Meetings: Calendar invites, meeting reminders -3. Financial: Bank statements, credit card bills -4. Invoices: Payment receipts, invoices -5. Updates: Product updates, service notifications -6. Newsletters: Marketing emails, newsletters -7. Personal: Friends and family -8. Administrative: HR emails, admin tasks -9. Urgent: Time-sensitive requests -10. Technical: IT notifications, technical discussions -11. Requests: Action items, requests for input - -TASK: Consolidate overlapping categories to max 10 total. - -GUIDELINES: -- Merge similar categories (e.g., Financial + Invoices) -- Keep distinct purposes separate (Work ≠ Personal) -- Prioritize actionable distinctions -- Ensure every old category maps to exactly one new category - -Respond in JSON: -{ - "consolidated_categories": { - "Work": "Business correspondence, meetings, projects", - "Financial": "Invoices, bills, statements, payments", - "Updates": "Product updates, newsletters, notifications", - ... 
-  },
-  "mapping": {
-    "Work": "Work",
-    "Meetings": "Work",  // Merged into Work
-    "Financial": "Financial",
-    "Invoices": "Financial",  // Merged into Financial
-    "Updates": "Updates",
-    "Newsletters": "Updates",  // Merged into Updates
-    ...
-  }
-}
-```
-
-2. **Apply Mapping**:
-```python
-consolidated = consolidate_categories(all_categories)
-
-# Update email labels
-for i, (email_id, old_cat) in enumerate(all_labels):
-    new_cat = consolidated['mapping'][old_cat]
-    all_labels[i] = (email_id, new_cat)
-
-# Use consolidated categories
-final_categories = consolidated['consolidated_categories']
-```
-
-**Result**: 5-10 well-defined, non-overlapping categories
-
-#### Phase 4: Category Caching (Cross-Mailbox Consistency)
-
-**Goal**: Reuse categories across mailboxes for consistency
-
-**The Problem**:
-- User A's mailbox: LLM discovers "Work", "Financial", "Personal"
-- User B's mailbox: LLM discovers "Business", "Finance", "Private"
-- Same concepts, different names → inconsistent experience
-
-**The Solution**: Category cache
-
-**Cache Structure** ([src/models/category_cache.json](src/models/category_cache.json:1)):
-```json
-{
-  "Work": {
-    "description": "Business correspondence, meetings, projects",
-    "embedding": [0.23, -0.45, 0.67, ...],  // 384 dims
-    "created_at": "2025-10-20T10:30:00Z",
-    "last_seen": "2025-10-25T14:22:00Z",
-    "usage_count": 267
-  },
-  "Financial": {
-    "description": "Invoices, bills, statements, payments",
-    "embedding": [0.12, -0.78, 0.34, ...],
-    "created_at": "2025-10-20T10:30:00Z",
-    "last_seen": "2025-10-25T14:22:00Z",
-    "usage_count": 195
-  },
-  ...
-}
-```
-
-**Snapping Process**:
-
-1. **Calculate Similarity**:
-```python
-def calculate_similarity(new_category, cached_categories):
-    new_embedding = embed(new_category['description'])
-
-    similarities = {}
-    for cached_name, cached_data in cached_categories.items():
-        cached_embedding = cached_data['embedding']
-        similarity = cosine_similarity(new_embedding, cached_embedding)
-        similarities[cached_name] = similarity
-
-    return similarities
-```
-
-2. **Snap to Cache**:
-```python
-def snap_to_cache(discovered_categories, cache, threshold=0.7):
-    snapped = {}
-    mapping = {}
-    new_categories = []
-
-    for name, desc in discovered_categories.items():
-        similarities = calculate_similarity({'name': name, 'description': desc}, cache)
-
-        best_match, score = max(similarities.items(), key=lambda x: x[1])
-
-        if score >= threshold:
-            # Snap to existing category
-            snapped[best_match] = cache[best_match]['description']
-            mapping[name] = best_match
-        else:
-            # Keep as new category (if under limit)
-            if len(new_categories) < 3:  # Max 3 new per mailbox
-                snapped[name] = desc
-                mapping[name] = name
-                new_categories.append((name, desc))
-            else:
-                # Over the new-category limit: fall back to the closest
-                # cached category so every label can still be remapped
-                snapped[best_match] = cache[best_match]['description']
-                mapping[name] = best_match
-
-    return snapped, mapping, new_categories
-```
-
-3. **Update Labels**:
-```python
-# Remap email labels to snapped categories
-for i, (email_id, old_cat) in enumerate(all_labels):
-    new_cat = mapping[old_cat]
-    all_labels[i] = (email_id, new_cat)
-```
-
-4. 
**Update Cache**:
-```python
-# Update usage counts
-category_counts = Counter(cat for _, cat in all_labels)
-
-# Add new cache-worthy categories (LLM-approved)
-for name, desc in new_categories:
-    cache[name] = {
-        'description': desc,
-        'embedding': embed(desc),
-        'created_at': now(),
-        'last_seen': now(),
-        'usage_count': category_counts[name]
-    }
-
-# Update existing categories
-for cat, count in category_counts.items():
-    if cat in cache:
-        cache[cat]['last_seen'] = now()
-        cache[cat]['usage_count'] += count
-
-save_cache(cache)
-```
-
-**Benefits**:
-- First user: Discovers fresh categories
-- Second user: Reuses compatible categories (if similar mailbox)
-- Consistency: Same category names across mailboxes
-- Flexibility: Can add new categories if genuinely different
-
-**Example**:
-
-User A (freelancer):
-- Discovered: "ClientWork", "Invoices", "Marketing"
-- Cache empty → All three added to cache
-
-User B (corporate):
-- Discovered: "BusinessCorrespondence", "Billing", "Newsletters"
-- Similarity matching:
-  - "BusinessCorrespondence" ↔ "ClientWork": 0.82 → Snap to "ClientWork"
-  - "Billing" ↔ "Invoices": 0.91 → Snap to "Invoices"
-  - "Newsletters" ↔ "Marketing": 0.68 → Below threshold, add as new
-- Result: Uses "ClientWork", "Invoices", adds "Newsletters"
-
-User C (small business):
-- Discovered: "Work", "Bills", "Updates"
-- Similarity matching:
-  - "Work" ↔ "ClientWork": 0.88 → Snap to "ClientWork"
-  - "Bills" ↔ "Invoices": 0.94 → Snap to "Invoices"
-  - "Updates" ↔ "Newsletters": 0.75 → Snap to "Newsletters"
-- Result: Uses all cached categories, adds nothing new
-
-After 10 users, cache has 8-12 stable categories that cover 95% of use cases.
-
-#### Phase 5: Model Training
-
-**Goal**: Train LightGBM classifier on LLM-labeled data
-
-**Training Data Preparation**:
-
-1. **Feature Extraction**:
-```python
-training_features = []
-training_labels = []
-
-for email in sample_emails:
-    # Find LLM label
-    category = label_map.get(email.id)
-    if not category:
-        continue  # Skip unlabeled
-
-    # Extract features (embedding + structural + patterns; see Feature Engineering)
-    features = feature_extractor.extract(email)
-    vector = assemble_feature_vector(features)  # 419 dims
-
-    training_features.append(vector)
-    training_labels.append(category)
-```
-
-2. **Train LightGBM**:
-```python
-import lightgbm as lgb
-
-# Create dataset (LightGBM needs numeric class indices, not category strings)
-label_ids = [category_to_idx[cat] for cat in training_labels]
-lgb_train = lgb.Dataset(
-    training_features,
-    label=label_ids,
-    categorical_feature=['sender_domain_type', 'time_of_day', 'day_of_week']
-)
-# lgb_val is built the same way from the held-out validation split
-
-# Training parameters
-params = {
-    'objective': 'multiclass',
-    'num_class': len(categories),
-    'metric': 'multi_logloss',
-    'num_leaves': 31,
-    'max_depth': 8,
-    'learning_rate': 0.1,
-    'feature_fraction': 0.8,
-    'bagging_fraction': 0.8,
-    'bagging_freq': 5,
-    'verbose': -1,
-    'num_threads': 28  # Use all CPU cores
-}
-
-# Train with early stopping
-model = lgb.train(
-    params,
-    lgb_train,
-    num_boost_round=200,
-    valid_sets=[lgb_val],
-    callbacks=[lgb.early_stopping(stopping_rounds=20)]
-)
-```
-
-3. **Validation**:
-```python
-# Predict on validation set
-val_predictions = model.predict(validation_features)
-val_categories = [categories[np.argmax(pred)] for pred in val_predictions]
-
-# Calculate accuracy
-accuracy = sum(pred == true for pred, true in zip(val_categories, validation_labels)) / len(validation_labels)
-
-logger.info(f"Validation accuracy: {accuracy:.1%}")
-```
-
-4. 
**Save Model**: -```python -import joblib - -model_data = { - 'model': model, - 'categories': categories, - 'feature_names': feature_extractor.get_feature_names(), - 'category_to_idx': {cat: idx for idx, cat in enumerate(categories)}, - 'idx_to_category': {idx: cat for idx, cat in enumerate(categories)}, - 'training_accuracy': train_accuracy, - 'validation_accuracy': validation_accuracy, - 'training_size': len(training_features), - 'created_at': datetime.now().isoformat() -} - -joblib.dump(model_data, 'src/models/calibrated/classifier.pkl') -``` - -**Training Time**: -- Feature extraction: 20-30 seconds (batched embeddings) -- LightGBM training: 5-10 seconds (200 rounds, 28 threads) -- Total: ~30-40 seconds - -**Model Size**: 1.8MB (small enough to commit to git if desired) - -### Calibration Performance - -**Input**: 10,000 Enron emails (unsorted) - -**Calibration**: -- Sample size: 300 emails (3%) -- LLM analysis: 15 batches × 20 emails -- Categories discovered: 11 -- Training time: 3 minutes -- Validation accuracy: 94.1% - -**Classification** (pure ML, no LLM fallback): -- 10,000 emails in 24 seconds (423 emails/sec) -- Accuracy: 72.7% -- Method breakdown: Rules 8%, ML 92% - -**Classification** (with LLM fallback): -- 10,000 emails in 4 minutes (42 emails/sec) -- Accuracy: 92.7% -- Method breakdown: Rules 8%, ML 71%, LLM 21% - -**Key Metrics**: -- LLM cost (calibration): 15 calls × $0.01 = $0.15 -- LLM cost (classification with fallback): 2100 calls × $0.0001 = $0.21 -- Total cost: $0.36 for 10k emails -- Amortized: $0.000036 per email - ---- - -## Feature Engineering - -Feature engineering is where domain knowledge meets machine learning. The system combines three feature types to capture different aspects of emails. - -### Philosophy - -The feature engineering philosophy follows these principles: - -1. **Semantic + Structural**: Embeddings capture meaning, patterns capture form -2. **Universal Features**: Work across domains (business, personal, different languages) -3. **Interpretable**: Each feature has clear meaning for debugging -4. **Efficient**: Fast to extract, even at scale - -### Feature Type 1: Semantic Embeddings (384 dimensions) - -**What**: Dense vector representations of email content using pre-trained sentence transformer - -**Model**: all-minilm:l6-v2 -- 384-dimensional output -- 22M parameters -- Trained on 1B+ sentence pairs -- Universal (works across domains without fine-tuning) - -**Via Ollama**: Important architectural decision -```python -# Why Ollama instead of sentence-transformers directly? -# 1. Ollama caches model (instant loading) -# 2. sentence-transformers downloads 90MB each run (90s overhead) -# 3. Same underlying model, different API - -import ollama -client = ollama.Client(host="http://localhost:11434") - -response = client.embed( - model='all-minilm:l6-v2', - input=text -) -embedding = response['embeddings'][0] # 384 floats -``` - -**Text Construction**: - -Not just subject + body. 
We build structured text with metadata:
-
-```python
-def _build_embedding_text(email):
-    return f"""[EMAIL_METADATA]
-sender_type: {email.sender_domain_type}
-time_of_day: {email.time_of_day}
-has_attachments: {email.has_attachments}
-attachment_count: {email.attachment_count}
-
-[DETECTED_PATTERNS]
-has_otp: {email.has_otp_pattern}
-has_invoice: {email.has_invoice_pattern}
-has_unsubscribe: {email.has_unsubscribe}
-is_noreply: {email.is_noreply}
-has_meeting: {email.has_meeting}
-
-[CONTENT]
-subject: {email.subject[:100]}
-body: {email.body_snippet[:300]}
-"""
-```
-
-**Why Structured Format?**
-
-Experiments showed 8% accuracy improvement with structured format vs. raw text:
-- Raw: "Receipt for your payment Your order..."
-- Structured: Clear sections with labels
-- Model learns to weight metadata vs. content
-
-**Batching Critical**:
-
-```python
-# SLOW: Sequential (15ms per email)
-embeddings = [embed(email) for email in emails]  # 10k emails = 150 seconds
-
-# FAST: Batched (~1s per batch of 512)
-texts = [build_text(email) for email in emails]
-embeddings = []
-for i in range(0, len(texts), 512):
-    batch = texts[i:i+512]
-    response = ollama_client.embed(model='all-minilm:l6-v2', input=batch)
-    embeddings.extend(response['embeddings'])
-# 10k emails = 20 batches = 20 seconds (7.5x speedup)
-```
-
-**Why This Matters**:
-
-Embeddings capture semantic similarity that keywords miss:
-- "Meeting at 3pm" ≈ "Sync tomorrow afternoon" ≈ "Calendar: Team standup"
-- "Invoice #12345" ≈ "Receipt for order" ≈ "Payment confirmation"
-- "Verify your account" ≈ "Confirm your identity" ≈ "One-time code: 123456"
-
-### Feature Type 2: Structural Features (24 dimensions)
-
-**What**: Metadata about email structure, timing, sender
-
-**Attachment Features** (3):
-```python
-has_attachments: bool      # Any attachments?
-attachment_count: int      # How many?
-attachment_types: List[str]  # ['.pdf', '.docx', ...]
-```
-
-Why: Transactional emails often have PDF invoices. Work emails have presentations. Personal emails rarely have attachments.
-
-**Link/Media Features** (2):
-```python
-link_count: int    # Count of https:// in text
-image_count: int   # Count of <img> tags in HTML
-```
-
-Why: Marketing and newsletter emails are link- and image-heavy. Conversational emails rarely are.
-
-**Length Features** (2):
-```python
-body_length: int      # Characters in body
-subject_length: int   # Characters in subject
-```
-
-Why: Short bodies suggest automated notifications; personal and work correspondence tends to run long (> 500 chars).
-
-**Reply/Forward Features** (1):
-```python
-has_reply_prefix: bool  # Subject starts with Re: or Fwd:
-```
-
-Why: Conversations have reply prefixes. Marketing never does.
-
-**Temporal Features** (2):
-```python
-time_of_day: str   # night/morning/afternoon/evening
-day_of_week: str   # monday...sunday
-```
-
-Why: Automated emails sent at 3am. Personal emails on weekends. Work emails during business hours.
-
-**Sender Features** (3):
-```python
-sender_domain: str        # gmail.com, paypal.com, etc.
-sender_domain_type: str   # freemail/corporate/noreply
-is_noreply: bool          # no-reply@ or noreply@
-```
-
-Why: noreply@ is always automated. Freemail might be personal or spam. Corporate domain likely work or transactional. 
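-
-Putting these together, a minimal sketch of structural feature extraction (the `email` attributes follow the fields above; the function name and exact computations are illustrative, not the project's actual API):
-
-```python
-def extract_structural(email):
-    """Illustrative sketch: structural features as a flat dict."""
-    sender = email.sender.lower()
-    hour = email.date.hour  # 0-5 night, 6-11 morning, 12-17 afternoon, 18-23 evening
-    return {
-        'has_attachments': int(email.has_attachments),
-        'attachment_count': email.attachment_count,
-        'link_count': email.body.count('https://'),
-        'image_count': email.body.lower().count('<img'),
-        'body_length': len(email.body),
-        'subject_length': len(email.subject),
-        'has_reply_prefix': int(email.subject.lower().startswith(('re:', 'fwd:'))),
-        'time_of_day': ('night', 'morning', 'afternoon', 'evening')[hour // 6],
-        'day_of_week': email.date.strftime('%A').lower(),
-        'is_noreply': int(any(p in sender for p in ('noreply', 'no-reply', 'donotreply'))),
-    }
-```
-
-The remaining field, `sender_domain_type`, comes from the domain classifier shown next.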
- -**Domain Classification**: -```python -def classify_domain(sender): - domain = sender.split('@')[1].lower() - - freemail = {'gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com'} - noreply_patterns = ['noreply', 'no-reply', 'donotreply'] - - if domain in freemail: - return 'freemail' - elif any(p in sender.lower() for p in noreply_patterns): - return 'noreply' - else: - return 'corporate' -``` - -### Feature Type 3: Pattern Detection (11 dimensions) - -**What**: Boolean flags for specific patterns detected via regex - -**Authentication Patterns** (3): -```python -has_otp_pattern: bool # 4-6 digit code: \b\d{4,6}\b -has_verification: bool # Contains "verification" -has_reset_password: bool # Contains "reset password" -``` - -Examples: -- "Your code is 723481" → has_otp_pattern=True -- "Verify your account" → has_verification=True - -**Transactional Patterns** (4): -```python -has_invoice_pattern: bool # invoice #\d+ -has_price: bool # $\d+\.\d{2} -has_order_number: bool # order #\d+ -has_tracking: bool # tracking number -``` - -Examples: -- "Invoice #INV-2024-00123" → has_invoice_pattern=True -- "Total: $49.99" → has_price=True - -**Marketing Patterns** (3): -```python -has_unsubscribe: bool # Contains "unsubscribe" -has_view_in_browser: bool # Contains "view in browser" -has_promotional: bool # "limited time", "special offer", "sale" -``` - -Examples: -- "Click here to unsubscribe" → has_unsubscribe=True -- "Limited time: 50% off!" → has_promotional=True - -**Meeting Patterns** (2): -```python -has_meeting: bool # meeting|zoom|teams -has_calendar: bool # Contains "calendar" -``` - -Examples: -- "Zoom link: https://..." → has_meeting=True - -**Signature Pattern** (1): -```python -has_signature: bool # regards|sincerely|best|cheers -``` - -Example: -- "Best regards, John" → has_signature=True (suggests conversational) - -**Why Pattern Features?** - -ML models (including LightGBM) excel when given both: -- High-level representations (embeddings) -- Low-level discriminative features (patterns) - -Pattern features provide: -1. **Strong signals**: OTP pattern almost guarantees "auth" category -2. **Interpretability**: Easy to understand why classifier chose category -3. **Robustness**: Regex patterns work even if embedding model fails -4. 
**Speed**: Pattern matching is microseconds - -### Feature Vector Assembly - -Final feature vector for ML model: - -```python -def assemble_feature_vector(email_features): - # Embedding: 384 dimensions - embedding = email_features['embedding'] - - # Structural: 24 dimensions (encoded) - structural = [ - email_features['has_attachments'], # 0/1 - email_features['attachment_count'], # int - email_features['link_count'], # int - email_features['image_count'], # int - email_features['body_length'], # int - email_features['subject_length'], # int - email_features['has_reply_prefix'], # 0/1 - encode_categorical(email_features['time_of_day']), # 0-3 - encode_categorical(email_features['day_of_week']), # 0-6 - encode_categorical(email_features['sender_domain_type']), # 0-2 - email_features['is_noreply'], # 0/1 - ] - - # Patterns: 11 dimensions - patterns = [ - email_features['has_otp_pattern'], # 0/1 - email_features['has_verification'], # 0/1 - email_features['has_reset_password'], # 0/1 - email_features['has_invoice_pattern'], # 0/1 - email_features['has_price'], # 0/1 - email_features['has_order_number'], # 0/1 - email_features['has_tracking'], # 0/1 - email_features['has_unsubscribe'], # 0/1 - email_features['has_view_in_browser'], # 0/1 - email_features['has_promotional'], # 0/1 - email_features['has_meeting'], # 0/1 - ] - - # Concatenate: 384 + 24 + 11 = 419 dimensions - return np.concatenate([embedding, structural, patterns]) -``` - -### Feature Importance (From LightGBM) - -After training, LightGBM reports feature importance: - -``` -Top 20 Features: -1. embedding_dim_42: 0.082 (specific semantic concept) -2. embedding_dim_156: 0.074 (another semantic concept) -3. has_unsubscribe: 0.065 (strong junk signal) -4. is_noreply: 0.058 (automated email indicator) -5. has_otp_pattern: 0.055 (strong auth signal) -6. sender_domain_type: 0.051 (freemail vs corporate) -7. embedding_dim_233: 0.048 -8. has_invoice_pattern: 0.045 (transactional signal) -9. body_length: 0.041 (short=automated, long=personal) -10. time_of_day: 0.039 (business hours matter) -... -``` - -**Key Insights**: -- Embeddings dominate (top features are embedding dimensions) -- But pattern features punch above their weight (11 dims, 30% of total importance) -- Structural features provide context (length, timing, sender type) - ---- - -## Machine Learning Model - -### Why LightGBM? - -LightGBM (Light Gradient Boosting Machine) was chosen after evaluating multiple algorithms. - -**Algorithms Considered**: - -| Algorithm | Training Time | Inference Time | Accuracy | Memory | Notes | -|-----------|--------------|----------------|----------|--------|-------| -| Logistic Regression | 1s | 0.5s | 68% | 100KB | Too simple | -| Random Forest | 8s | 2.1s | 88% | 8MB | Good but slow | -| XGBoost | 12s | 1.5s | 91% | 4MB | Excellent but slower | -| **LightGBM** | **5s** | **0.7s** | **92%** | **1.8MB** | ✓ Winner | -| Neural Network (2-layer) | 45s | 3.2s | 90% | 12MB | Overkill | -| Transformer (BERT) | 5min | 15s | 95% | 500MB | Way overkill | - -**LightGBM Advantages**: -1. **Speed**: Fastest training and inference among competitive algorithms -2. **Accuracy**: Nearly matches XGBoost (1% difference) -3. **Memory**: Smallest model size among tree-based methods -4. **Small Data**: Excellent performance with just 300-1500 training examples -5. **Mixed Features**: Handles continuous (embeddings) + categorical (patterns) seamlessly -6. **Interpretability**: Feature importance, tree visualization -7. 
**Mature**: Battle-tested in Kaggle competitions and production systems - -### Model Architecture - -LightGBM builds an ensemble of decision trees using gradient boosting. - -**Key Concepts**: - -**Gradient Boosting**: Train trees sequentially, each correcting errors of previous trees -``` -prediction = tree1 + tree2 + tree3 + ... + tree200 -``` - -**Leaf-Wise Growth**: Grows trees leaf-by-leaf (not level-by-level) -- Faster convergence -- Better accuracy with same number of nodes -- Risk of overfitting (controlled by max_depth) - -**Histogram-Based Splitting**: Buckets continuous features into discrete bins -- Much faster than exact split finding -- Minimal accuracy loss -- Enables GPU acceleration - -### Training Configuration - -```python -params = { - # Task - 'objective': 'multiclass', # Multi-class classification - 'num_class': 11, # Number of categories - 'metric': 'multi_logloss', # Optimization metric - - # Tree structure - 'num_leaves': 31, # Max leaves per tree (2^5 - 1) - 'max_depth': 8, # Max tree depth (prevents overfitting) - - # Learning - 'learning_rate': 0.1, # Step size (aka eta) - 'num_estimators': 200, # Number of boosting rounds - - # Regularization - 'feature_fraction': 0.8, # Use 80% of features per tree - 'bagging_fraction': 0.8, # Use 80% of data per tree - 'bagging_freq': 5, # Bagging every 5 iterations - 'lambda_l1': 0.0, # L1 regularization (Lasso) - 'lambda_l2': 0.0, # L2 regularization (Ridge) - - # Performance - 'num_threads': 28, # Use all CPU cores - 'verbose': -1, # Suppress output - - # Categorical features - 'categorical_feature': [ # These are categorical, not continuous - 'sender_domain_type', - 'time_of_day', - 'day_of_week' - ] -} -``` - -**Parameter Tuning Journey**: - -Initial (conservative): -- num_estimators: 100 -- learning_rate: 0.05 -- max_depth: 6 -- Result: 85% accuracy, underfit - -Optimized (current): -- num_estimators: 200 -- learning_rate: 0.1 -- max_depth: 8 -- Result: 92% accuracy, good balance - -Aggressive (experimented): -- num_estimators: 500 -- learning_rate: 0.15 -- max_depth: 12 -- Result: 94% accuracy on training, 89% on validation (overfit!) - -**Final Choice**: Optimized config provides best generalization. - -### Training Process - -```python -def train(training_data, validation_data, params): - # 1. Prepare data - X_train, y_train = zip(*training_data) - X_val, y_val = zip(*validation_data) - - # 2. Create LightGBM datasets - lgb_train = lgb.Dataset( - X_train, - label=y_train, - categorical_feature=['sender_domain_type', 'time_of_day', 'day_of_week'] - ) - lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train) - - # 3. Train with early stopping - callbacks = [ - lgb.early_stopping(stopping_rounds=20), # Stop if no improvement for 20 rounds - lgb.log_evaluation(period=10) # Log every 10 rounds - ] - - model = lgb.train( - params, - lgb_train, - num_boost_round=200, - valid_sets=[lgb_train, lgb_val], - valid_names=['train', 'val'], - callbacks=callbacks - ) - - # 4. Evaluate - train_pred = model.predict(X_train) - val_pred = model.predict(X_val) - - train_acc = accuracy(train_pred, y_train) - val_acc = accuracy(val_pred, y_val) - - return model, {'train_acc': train_acc, 'val_acc': val_acc} -``` - -**Early Stopping**: Critical for preventing overfitting -- Monitors validation loss each round -- If no improvement for 20 rounds, stop training -- Typically stops at round 120-150 (not full 200) - -### Inference - -```python -def predict(model, email_features): - # 1. 
Get probability distribution - probs = model.predict(email_features) # [0.15, 0.68, 0.03, 0.11, 0.02, ...] - - # 2. Get predicted category - predicted_idx = np.argmax(probs) - category = idx_to_category[predicted_idx] - - # 3. Get confidence (max probability) - confidence = np.max(probs) - - # 4. Build probability dict - prob_dict = { - cat: float(prob) - for cat, prob in zip(categories, probs) - } - - return { - 'category': category, - 'confidence': confidence, - 'probabilities': prob_dict - } -``` - -**Example Output**: -```python -{ - 'category': 'work', - 'confidence': 0.847, - 'probabilities': { - 'work': 0.847, - 'personal': 0.082, - 'newsletters': 0.041, - 'transactional': 0.019, - 'junk': 0.008, - ... - } -} -``` - -### Performance Characteristics - -**Training**: -- Dataset: 300 emails with 419-dim features -- Time: 5 seconds (28 threads) -- Memory: <500MB peak -- Disk: 1.8MB saved model - -**Inference**: -- Batch: 10,000 emails -- Time: 0.7 seconds (14,285 emails/sec) -- Memory: <100MB (model loaded) -- Per-email: 0.07ms average - -**Accuracy** (on Enron dataset): -- Training: 98.2% (slight overfit acceptable) -- Validation: 94.1% -- Test (pure ML): 72.7% -- Test (ML + LLM): 92.7% - -**Why Test Accuracy Lower?** - -Training/validation uses LLM-labeled data (high quality). -Test uses ground truth from folder names (noisy labels). -Example: Email in "sent" folder might be work, personal, or other. - -### Model Serialization - -```python -import joblib - -model_bundle = { - 'model': lgb_model, # LightGBM booster - 'categories': categories, # List of category names - 'category_to_idx': {cat: i for i, cat in enumerate(categories)}, - 'idx_to_category': {i: cat for i, cat in enumerate(categories)}, - 'feature_names': feature_extractor.get_feature_names(), - 'training_accuracy': 0.982, - 'validation_accuracy': 0.941, - 'training_size': 300, - 'config': params, - 'created_at': '2025-10-25T02:54:00Z' -} - -joblib.dump(model_bundle, 'src/models/calibrated/classifier.pkl') -``` - -**Loading**: -```python -model_bundle = joblib.load('src/models/calibrated/classifier.pkl') -model = model_bundle['model'] -categories = model_bundle['categories'] -``` - -**Model Versioning**: -- File includes creation timestamp -- Can compare different training runs -- Easy to A/B test model versions - -### Model Interpretability - -**Feature Importance**: -```python -importance = model.feature_importance(importance_type='gain') -feature_importance = list(zip(feature_names, importance)) -feature_importance.sort(key=lambda x: x[1], reverse=True) - -for name, importance in feature_importance[:20]: - print(f"{name}: {importance:.3f}") -``` - -**Tree Visualization**: -```python -lgb.plot_tree(model, tree_index=0, figsize=(20, 15)) -# Shows first tree structure -``` - -**Prediction Explanation**: -```python -# For any prediction, can trace through trees -contribution = model.predict(features, pred_contrib=True) -# Shows how each feature contributed to prediction -``` - ---- - -## Email Provider Abstraction - -The system supports multiple email sources through a clean provider abstraction. 
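-
-The payoff is that the pipeline never branches on provider type: it asks for a provider by name, then talks only to the `BaseProvider` contract defined below. A hedged sketch of that calling pattern (the `get_provider` factory and import path are illustrative, not the actual CLI wiring):
-
-```python
-from src.providers import EnronProvider, GmailProvider, IMAPProvider, OutlookProvider  # assumed path
-
-def get_provider(name: str):
-    """Illustrative factory: map a source name to a provider class."""
-    registry = {
-        'gmail': GmailProvider,
-        'outlook': OutlookProvider,
-        'imap': IMAPProvider,
-        'enron': EnronProvider,
-    }
-    return registry[name]()
-
-provider = get_provider('gmail')
-provider.connect({'credentials_path': 'credentials.json'})
-emails = provider.fetch_emails(limit=500)
-# ... classify ...
-provider.batch_update([(e.id, [r.category]) for e, r in zip(emails, results)])
-```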
- -### Provider Interface - -**BaseProvider** abstract class defines the contract: - -```python -class BaseProvider(ABC): - @abstractmethod - def connect(self, credentials: Dict[str, Any]) -> bool: - """Initialize connection to email service.""" - pass - - @abstractmethod - def disconnect(self) -> None: - """Close connection.""" - pass - - @abstractmethod - def fetch_emails( - self, - limit: Optional[int] = None, - filters: Optional[Dict[str, Any]] = None - ) -> List[Email]: - """Fetch emails with optional filters.""" - pass - - @abstractmethod - def update_labels( - self, - email_id: str, - labels: List[str] - ) -> bool: - """Apply labels/categories to email.""" - pass - - def batch_update( - self, - updates: List[Tuple[str, List[str]]] - ) -> Dict[str, bool]: - """Bulk label updates (optional optimization).""" - results = {} - for email_id, labels in updates: - results[email_id] = self.update_labels(email_id, labels) - return results -``` - -### Gmail Provider - -**Authentication**: OAuth 2.0 with installed app flow - -**Setup**: -1. Create project in Google Cloud Console -2. Enable Gmail API -3. Create OAuth 2.0 credentials (Desktop app) -4. Download credentials.json - -**First Run** (interactive): -```python -provider = GmailProvider() -provider.connect({'credentials_path': 'credentials.json'}) -# Opens browser for OAuth consent -# Saves token.json for future runs -``` - -**Subsequent Runs** (automatic): -```python -provider = GmailProvider() -provider.connect({'credentials_path': 'credentials.json'}) -# Loads token.json automatically -# No browser interaction needed -``` - -**Implementation Highlights**: - -```python -class GmailProvider(BaseProvider): - def __init__(self): - self.service = None - self.creds = None - - def connect(self, credentials): - creds = None - - # Load existing token - if os.path.exists('token.json'): - creds = Credentials.from_authorized_user_file('token.json', SCOPES) - - # Refresh if expired - if creds and creds.expired and creds.refresh_token: - creds.refresh(Request()) - - # New authorization if needed - if not creds or not creds.valid: - flow = InstalledAppFlow.from_client_secrets_file( - credentials['credentials_path'], SCOPES - ) - creds = flow.run_local_server(port=0) - - # Save for next time - with open('token.json', 'w') as token: - token.write(creds.to_json()) - - # Build Gmail service - self.service = build('gmail', 'v1', credentials=creds) - self.creds = creds - return True - - def fetch_emails(self, limit=None, filters=None): - emails = [] - - # Build query - query = filters.get('query', '') if filters else '' - - # Fetch message IDs - results = self.service.users().messages().list( - userId='me', - q=query, - maxResults=min(limit, 500) if limit else 500 - ).execute() - - messages = results.get('messages', []) - - # Fetch full messages (batched) - for msg_ref in messages: - msg = self.service.users().messages().get( - userId='me', - id=msg_ref['id'], - format='full' - ).execute() - - # Parse to Email object - email = self._parse_gmail_message(msg) - emails.append(email) - - if limit and len(emails) >= limit: - break - - return emails - - def update_labels(self, email_id, labels): - # Create labels if they don't exist - for label in labels: - self._create_label_if_needed(label) - - # Apply labels - label_ids = [self.label_name_to_id[label] for label in labels] - - self.service.users().messages().modify( - userId='me', - id=email_id, - body={'addLabelIds': label_ids} - ).execute() - - return True -``` - -**Challenges**: -- Rate limiting (batch 
requests where possible) -- Pagination (handle continuation tokens) -- Label creation (async, need to check existence) -- HTML parsing (extract plain text from multipart messages) - -### Outlook Provider - -**Authentication**: Microsoft OAuth 2.0 with device flow - -**Why Device Flow?** - -Installed app flow (like Gmail) requires browser on same machine. -Device flow works on headless servers: -1. Show code to user -2. User visits aka.ms/devicelogin on any device -3. Enters code -4. App gets token - -**Setup**: -1. Register app in Azure AD -2. Configure redirect URI -3. Note client ID and tenant ID -4. Grant Mail.Read and Mail.ReadWrite permissions - -**Implementation**: - -```python -from msal import PublicClientApplication - -class OutlookProvider(BaseProvider): - def __init__(self): - self.client = None - self.token = None - - def connect(self, credentials): - self.client = PublicClientApplication( - credentials['client_id'], - authority=f"https://login.microsoftonline.com/{credentials['tenant_id']}" - ) - - # Try to load cached token - accounts = self.client.get_accounts() - if accounts: - result = self.client.acquire_token_silent(SCOPES, account=accounts[0]) - if result: - self.token = result['access_token'] - return True - - # Device flow for new token - flow = self.client.initiate_device_flow(scopes=SCOPES) - - print(flow['message']) # "To sign in, use a web browser to open https://..." - - result = self.client.acquire_token_by_device_flow(flow) - - if 'access_token' in result: - self.token = result['access_token'] - return True - else: - logger.error(f"Auth failed: {result.get('error_description')}") - return False - - def fetch_emails(self, limit=None, filters=None): - headers = {'Authorization': f'Bearer {self.token}'} - - url = 'https://graph.microsoft.com/v1.0/me/messages' - params = { - '$top': min(limit, 999) if limit else 999, - '$select': 'id,subject,from,receivedDateTime,body,hasAttachments', - '$orderby': 'receivedDateTime DESC' - } - - response = requests.get(url, headers=headers, params=params) - data = response.json() - - emails = [] - for msg in data.get('value', []): - email = self._parse_graph_message(msg) - emails.append(email) - - return emails - - def update_labels(self, email_id, labels): - # Microsoft Graph uses categories (not labels) - headers = {'Authorization': f'Bearer {self.token}'} - - url = f'https://graph.microsoft.com/v1.0/me/messages/{email_id}' - body = {'categories': labels} - - response = requests.patch(url, headers=headers, json=body) - return response.status_code == 200 -``` - -**Graph API Benefits**: -- RESTful (easier than IMAP) -- Rich querying ($filter, $select, $orderby) -- Batch operations supported -- Well-documented - -### IMAP Provider - -**Authentication**: Username + password - -**Use Cases**: -- Corporate email servers -- Self-hosted email -- Any server supporting IMAP protocol - -**Implementation**: - -```python -import imaplib -import email -from email.header import decode_header - -class IMAPProvider(BaseProvider): - def __init__(self): - self.connection = None - - def connect(self, credentials): - host = credentials['host'] - port = credentials.get('port', 993) - username = credentials['username'] - password = credentials['password'] - - # Connect with SSL - self.connection = imaplib.IMAP4_SSL(host, port) - self.connection.login(username, password) - - # Select inbox - self.connection.select('INBOX') - - return True - - def fetch_emails(self, limit=None, filters=None): - # Search for emails - search_criteria = 
filters.get('criteria', 'ALL') if filters else 'ALL' - _, message_numbers = self.connection.search(None, search_criteria) - - email_ids = message_numbers[0].split() - - if limit: - email_ids = email_ids[-limit:] # Most recent N - - emails = [] - for email_id in email_ids: - _, msg_data = self.connection.fetch(email_id, '(RFC822)') - - raw_email = msg_data[0][1] - msg = email.message_from_bytes(raw_email) - - parsed = self._parse_imap_message(msg, email_id) - emails.append(parsed) - - return emails - - def update_labels(self, email_id, labels): - # IMAP uses flags, not labels - # Map categories to IMAP flags - flag_mapping = { - 'important': '\\Flagged', - 'read': '\\Seen', - 'archived': '\\Deleted', # or move to Archive folder - } - - for label in labels: - if label in flag_mapping: - self.connection.store(email_id, '+FLAGS', flag_mapping[label]) - - # For custom labels, need to move to folder - for label in labels: - if label not in flag_mapping: - # Create folder if needed - self._create_folder_if_needed(label) - # Move message - self.connection.copy(email_id, label) - - return True -``` - -**IMAP Challenges**: -- No standardized label system (use flags or folders) -- Slow for large mailboxes (no batch fetch) -- Connection can timeout -- Different servers have quirks - -### Enron Provider - -**Purpose**: Testing and development - -**Dataset**: Enron email corpus -- 500,000+ emails from 150 users -- Public domain -- Organized into maildir format -- Real-world complexity - -**Structure**: -``` -maildir/ -├── williams-w3/ -│ ├── inbox/ -│ │ ├── 1. -│ │ ├── 2. -│ │ └── ... -│ ├── sent/ -│ ├── deleted_items/ -│ └── ... -├── allen-p/ -└── ... -``` - -**Implementation**: - -```python -class EnronProvider(BaseProvider): - def __init__(self, maildir_path='maildir'): - self.maildir_path = Path(maildir_path) - - def connect(self, credentials=None): - # No authentication needed - return self.maildir_path.exists() - - def fetch_emails(self, limit=None, filters=None): - emails = [] - - # Walk through all users and folders - for user_dir in self.maildir_path.iterdir(): - if not user_dir.is_dir(): - continue - - for folder in user_dir.iterdir(): - if not folder.is_dir(): - continue - - for email_file in folder.iterdir(): - if limit and len(emails) >= limit: - break - - # Parse email file - email_obj = self._parse_enron_email(email_file, user_dir.name, folder.name) - emails.append(email_obj) - - return emails[:limit] if limit else emails - - def _parse_enron_email(self, path, user, folder): - with open(path, 'r', encoding='latin-1') as f: - msg = email.message_from_file(f) - - # Build unique ID - email_id = f"maildir_{user}_{folder}_{path.name}" - - # Extract fields - subject = self._decode_header(msg['Subject']) - sender = msg['From'] - date = email.utils.parsedate_to_datetime(msg['Date']) - body = self._get_body(msg) - - # Folder name is ground truth label (for testing) - ground_truth = folder - - return Email( - id=email_id, - subject=subject, - sender=sender, - date=date, - body=body, - body_snippet=body[:500], - has_attachments=False, # Enron dataset doesn't include attachments - headers={'X-Folder': folder}, # Store for evaluation - labels=[], - is_read=False, - provider='enron' - ) -``` - -**Benefits**: -- No authentication required -- Large, realistic dataset -- Deterministic (same emails every run) -- Ground truth labels (folder names) -- Fast iteration during development - ---- - -## Configuration System - -The system uses YAML configuration files with Pydantic validation for type safety and 
documentation. - -### Configuration Files - -#### default_config.yaml (System Defaults) - -```yaml -version: "1.0.0" - -calibration: - sample_size: 250 # Start small - sample_strategy: "stratified" # By sender domain - validation_size: 50 # Held-out test set - min_confidence: 0.6 # Min to accept LLM label - -processing: - batch_size: 100 # Emails per batch - llm_queue_size: 100 # Max queued for LLM - parallel_workers: 4 # Thread pool size - checkpoint_interval: 1000 # Save progress every N - -classification: - default_threshold: 0.55 # OPTIMIZED (was 0.75) - min_threshold: 0.50 # Lower bound - max_threshold: 0.70 # Upper bound - -llm: - provider: "ollama" - ollama: - base_url: "http://localhost:11434" - calibration_model: "qwen3:4b-instruct-2507-q8_0" - consolidation_model: "qwen3:4b-instruct-2507-q8_0" - classification_model: "qwen3:4b-instruct-2507-q8_0" - temperature: 0.1 # Low randomness - max_tokens: 2000 # For calibration - timeout: 30 # Seconds - retry_attempts: 3 - -features: - embedding_model: "all-MiniLM-L6-v2" - embedding_batch_size: 32 - -export: - format: "json" - include_confidence: true - create_report: true - -logging: - level: "INFO" - file: "logs/email-sorter.log" -``` - -#### categories.yaml (Category Definitions) - -```yaml -categories: - junk: - description: "Spam, unwanted marketing, phishing attempts" - patterns: - - "unsubscribe" - - "click here" - - "limited time" - threshold: 0.55 - priority: 1 # Higher priority = checked first - - auth: - description: "OTPs, password resets, 2FA codes" - patterns: - - "verification code" - - "otp" - - "reset password" - threshold: 0.55 - priority: 1 - - transactional: - description: "Receipts, invoices, confirmations" - patterns: - - "receipt" - - "invoice" - - "order" - threshold: 0.55 - priority: 2 - - work: - description: "Business correspondence, meetings, projects" - patterns: - - "meeting" - - "project" - - "deadline" - threshold: 0.55 - priority: 2 - - [... 8 more categories ...] 
-
-processing_order:  # Order for rule matching
-  - auth
-  - finance
-  - transactional
-  - work
-  - personal
-  - newsletters
-  - junk
-  - unknown
-```
-
-### Pydantic Models
-
-Type-safe configuration with validation:
-
-```python
-from pydantic import BaseModel, Field, ValidationError, validator  # Pydantic v1-style validators
-
-class CalibrationConfig(BaseModel):
-    sample_size: int = Field(250, ge=50, le=5000)
-    sample_strategy: str = Field("stratified", regex="^(stratified|random)$")
-    validation_size: int = Field(50, ge=10, le=1000)
-    min_confidence: float = Field(0.6, ge=0.0, le=1.0)
-
-    @validator('validation_size')
-    def validate_validation_size(cls, v, values):
-        if 'sample_size' in values and v >= values['sample_size']:
-            raise ValueError("validation_size must be < sample_size")
-        return v
-
-class ProcessingConfig(BaseModel):
-    batch_size: int = Field(100, ge=1, le=1000)
-    llm_queue_size: int = Field(100, ge=1)
-    parallel_workers: int = Field(4, ge=1, le=64)
-    checkpoint_interval: int = Field(1000, ge=100)
-
-class ClassificationConfig(BaseModel):
-    default_threshold: float = Field(0.55, ge=0.0, le=1.0)
-    min_threshold: float = Field(0.50, ge=0.0, le=1.0)
-    max_threshold: float = Field(0.70, ge=0.0, le=1.0)
-
-    @validator('max_threshold')
-    def validate_thresholds(cls, v, values):
-        if v < values.get('min_threshold', 0):
-            raise ValueError("max_threshold must be >= min_threshold")
-        return v
-
-class OllamaConfig(BaseModel):
-    base_url: str = "http://localhost:11434"
-    calibration_model: str = "qwen3:4b-instruct-2507-q8_0"
-    consolidation_model: str = "qwen3:4b-instruct-2507-q8_0"
-    classification_model: str = "qwen3:4b-instruct-2507-q8_0"
-    temperature: float = Field(0.1, ge=0.0, le=2.0)
-    max_tokens: int = Field(2000, ge=100, le=10000)
-    timeout: int = Field(30, ge=1, le=300)
-    retry_attempts: int = Field(3, ge=1, le=10)
-
-class Config(BaseModel):
-    version: str
-    calibration: CalibrationConfig
-    processing: ProcessingConfig
-    classification: ClassificationConfig
-    llm: LLMConfig
-    features: FeaturesConfig
-    export: ExportConfig
-    logging: LoggingConfig
-```
-
-### Loading Configuration
-
-```python
-def load_config(config_path='config/default_config.yaml') -> Config:
-    with open(config_path) as f:
-        yaml_data = yaml.safe_load(f)
-
-    try:
-        config = Config(**yaml_data)
-        return config
-    except ValidationError as e:
-        logger.error(f"Config validation failed: {e}")
-        sys.exit(1)
-```
-
-### Configuration Override
-
-Command-line flags override config file:
-
-```python
-# In CLI
-cfg = load_config(config_path)
-
-# Override threshold if specified
-if threshold_flag:
-    cfg.classification.default_threshold = threshold_flag
-
-# Override LLM model if specified
-if model_flag:
-    cfg.llm.ollama.classification_model = model_flag
-```
-
-### Benefits of This Approach
-
-1. **Type Safety**: Pydantic catches type errors at load time
-2. **Validation**: Range checks, pattern matching, cross-field validation
-3. **Documentation**: Field descriptions serve as inline docs
-4. **IDE Support**: Auto-completion for config fields
-5. **Testing**: Easy to create test configs programmatically
-6. **Versioning**: Version field enables migration logic
-7. **Defaults**: Sensible defaults, override only what's needed
-
-## Performance Optimization Journey
-
-The system's performance evolved significantly through multiple optimization iterations. 
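-
-Every iteration below started the same way: time each stage and attack the largest number. A minimal sketch of that kind of stage timing (illustrative; the stage functions are placeholders, not the project's instrumentation):
-
-```python
-import time
-from contextlib import contextmanager
-
-@contextmanager
-def timed(stage, totals):
-    """Accumulate wall-clock time per pipeline stage."""
-    start = time.perf_counter()
-    try:
-        yield
-    finally:
-        totals[stage] = totals.get(stage, 0.0) + time.perf_counter() - start
-
-totals = {}
-with timed('features', totals):
-    features = extract_features_batched(emails)  # placeholder stage
-with timed('ml', totals):
-    predictions = classify_ml(features)          # placeholder stage
-
-print(sorted(totals.items(), key=lambda kv: -kv[1]))  # biggest cost first
-```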
- -### Iteration 1: Naive Baseline - -**Approach**: Sequential processing, one email at a time - -```python -results = [] -for email in emails: - features = feature_extractor.extract(email) # 15ms (embedding API call) - prediction = ml_classifier.predict(features) # 0.1ms - if prediction.confidence < threshold: - llm_result = llm_classifier.classify(email) # 2000ms - results.append(llm_result) - else: - results.append(prediction) -``` - -**Performance** (10,000 emails): -- Feature extraction: 10,000 × 15ms = 150 seconds -- ML classification: 10,000 × 0.1ms = 1 second -- LLM review (30%): 3,000 × 2s = 6,000 seconds (100 minutes!) -- **Total: 103 minutes** - -**Bottleneck**: LLM calls dominate (98% of time) - -### Iteration 2: Threshold Optimization - -**Approach**: Reduce LLM fallback by lowering threshold - -```python -# Changed threshold from 0.75 → 0.55 -``` - -**Impact**: -- LLM fallback: 30% → 20% (33% reduction) -- Accuracy: 95% → 92% (3% loss) -- Time: 103 minutes → 70 minutes (32% faster) - -**Trade-off**: Acceptable accuracy loss for significant speedup - -### Iteration 3: Batched Embedding Extraction - -**Approach**: Batch embedding API calls - -```python -# Before: One call per email -embeddings = [ollama_client.embed(email) for email in emails] -# 10,000 calls × 15ms = 150 seconds - -# After: Batch calls -embeddings = [] -for i in range(0, len(emails), 512): - batch = emails[i:i+512] - response = ollama_client.embed(batch) # Single call for 512 emails - embeddings.extend(response) -# 20 calls × 1000ms = 20 seconds (7.5x speedup!) -``` - -**Batch Size Experiment**: - -| Batch Size | API Calls | Total Time | Speedup | -|------------|-----------|------------|---------| -| 1 (baseline) | 10,000 | 150s | 1x | -| 128 | 78 | 39s | 3.8x | -| 256 | 39 | 27s | 5.6x | -| 512 | 20 | 20s | 7.5x | -| 1024 | 10 | 22s | 6.8x (diminishing returns) | -| 2048 | 5 | 22s | 6.8x (same as 1024) | - -**Chosen**: 512 (best speed without memory pressure) - -**Impact**: -- Feature extraction: 150s → 20s (7.5x faster) -- Total time: 70 minutes → 50 minutes (29% faster) - -### Iteration 4: Multi-Threaded ML Inference - -**Approach**: Parallelize LightGBM predictions - -```python -# LightGBM config -params = { - 'num_threads': 28, # Use all CPU cores - ... 
-} - -# Inference -predictions = model.predict(features, num_threads=28) -``` - -**Impact**: -- ML inference: 2s → 0.7s (2.8x faster) -- Total time: 50 minutes → 50 minutes (negligible, ML not bottleneck) - -**Note**: ML was already fast, threading helps but doesn't matter much - -### Iteration 5: LLM Batching (Attempted) - -**Approach**: Review multiple emails in one LLM call - -```python -# Send 10 low-confidence emails per LLM call -batch = low_confidence_emails[:10] -llm_result = llm_classifier.classify_batch(batch) # Single call -``` - -**Experiment Results**: - -| Batch Size | Latency/Batch | Emails/Sec | Accuracy | -|------------|---------------|------------|----------| -| 1 (baseline) | 2s | 0.5 | 95% | -| 5 | 8s | 0.625 | 93% | -| 10 | 18s | 0.556 | 91% | - -**Finding**: Batching hurts more than helps -- Latency increases super-linearly (context length) -- Accuracy decreases (less focus per email) -- Throughput barely improves - -**Decision**: Keep single-email LLM calls - -### Iteration 6: Fast Mode (No LLM) - -**Approach**: Add `--no-llm-fallback` flag - -```python -if not no_llm_fallback and prediction.confidence < threshold: - llm_result = llm_classifier.classify(email) - results.append(llm_result) -else: - results.append(prediction) # Accept ML result regardless -``` - -**Performance** (10,000 emails): -- Feature extraction: 20s -- ML inference: 0.7s -- LLM review: 0s (disabled) -- **Total: 24 seconds** (175x faster than iteration 1!) - -**Accuracy**: 72.7% (vs 92.7% with LLM) - -**Use Case**: Bulk cleanup where 73% accuracy is acceptable - -### Iteration 7: Parallel Email Fetching - -**Approach**: Fetch emails in parallel (for multiple accounts) - -```python -from concurrent.futures import ThreadPoolExecutor - -def fetch_all_accounts(providers): - with ThreadPoolExecutor(max_workers=4) as executor: - futures = [executor.submit(p.fetch_emails) for p in providers] - results = [f.result() for f in futures] - return [email for result in results for email in result] -``` - -**Impact**: -- Single account: No benefit -- Multiple accounts: Linear speedup (4 accounts in parallel) - -### Final Performance (Current) - -**Configuration**: 10,000 Enron emails, 28-core CPU - -**Fast Mode** (--no-llm-fallback): -- Feature extraction (batched): 20s -- ML classification: 0.7s -- Export: 0.5s -- **Total: 24 seconds (423 emails/sec)** -- **Accuracy: 72.7%** - -**Hybrid Mode** (with LLM fallback): -- Feature extraction: 20s -- ML classification: 0.7s -- LLM review (21%): 2,100 emails × 2s = 4,200s -- Export: 0.5s -- **Total: 4 minutes 21s (38 emails/sec)** -- **Accuracy: 92.7%** - -**Calibration** (one-time, 300 sample emails): -- Sampling: 1s -- LLM analysis: 15 batches × 12s = 180s (3 minutes) -- ML training: 5s -- **Total: 3 minutes 6s** - -### Performance Comparison - -| Mode | Time (10k emails) | Emails/Sec | Accuracy | Cost | -|------|-------------------|------------|----------|------| -| Naive (Iteration 1) | 103 min | 1.6 | 95% | $2.00 | -| Optimized Hybrid | 4.4 min | 38 | 92.7% | $0.21 | -| Fast (No LLM) | 24s | 423 | 72.7% | $0.00 | - -**Speedup**: 257x faster than naive baseline (fast mode) - -### Optimization Lessons Learned - -1. **Profile First**: Don't optimize blindly. Measure where time is spent. -2. **Batch Everything**: API calls, embeddings, predictions - batching is free speedup -3. **Threshold Tuning**: Often the biggest performance/accuracy trade-off lever -4. **Know Your Bottleneck**: Optimizing ML inference (1s) when LLM takes 4000s is pointless -5. 
**User Choice**: Provide speed vs accuracy options rather than one-size-fits-all -6. **Parallelism**: Helps for I/O (API calls) more than CPU (ML inference) -7. **Diminishing Returns**: 7.5x speedup from batching, 2.8x from threading, then plateaus - ---- - -## Category Discovery and Management - -One of the system's key innovations is dynamic category discovery rather than hardcoded categories. - -### Why Dynamic Categories? - -**The Problem with Hardcoded Categories**: - -Traditional email classifiers use fixed categories: -- Gmail: Primary, Social, Promotions, Updates, Forums -- Outlook: Focused, Other -- Custom: Work, Personal, Finance, etc. - -These work for general cases but fail for specific users: -- Freelancer needs: ClientA, ClientB, Invoices, Marketing, Personal -- Executive needs: Strategic, Operational, Reports, Meetings, Travel -- Student needs: Coursework, Assignments, Clubs, Administrative, Social - -**The Solution**: Let LLM discover natural categories in each mailbox. - -### Discovery Process - -**Step 1: LLM Analyzes Sample** - -Given 300 emails from a freelancer's inbox: - -``` -Sample emails show: -- 80 emails from client domains (acme.com, widgets-r-us.com) -- 45 emails with invoice/payment subjects -- 35 emails from LinkedIn, Twitter, Facebook -- 30 emails about marketing campaigns -- 20 emails from family/friends -- 90 misc (tools, services, confirmations) -``` - -LLM discovers: -1. **ClientWork**: Business correspondence with clients -2. **Financial**: Invoices, payments, tax documents -3. **Marketing**: Campaign emails, analytics, ad platforms -4. **SocialMedia**: LinkedIn connections, Twitter notifications -5. **Personal**: Friends and family -6. **Tools**: Software services, productivity tools - -**Step 2: Consolidation** (if needed) - -If LLM discovers too many categories (>10), consolidate: - -Initial discovery (15 categories): -- ClientWork, Proposals, Meetings, ProjectUpdates -- Invoices, Payments, Taxes, Banking -- Marketing, Analytics, Advertising -- LinkedIn, Twitter, Facebook -- Personal - -After consolidation (6 categories): -- **ClientWork**: ClientWork + Proposals + Meetings + ProjectUpdates -- **Financial**: Invoices + Payments + Taxes + Banking -- **Marketing**: Marketing + Analytics + Advertising -- **SocialMedia**: LinkedIn + Twitter + Facebook -- **Personal**: (unchanged) -- **Tools**: (new, for everything else) - -**Step 3: Snap to Cache** - -Check if discovered categories match cached ones: - -Cached (from previous users): -- Work (867 emails) -- Financial (423 emails) -- Personal (312 emails) -- Marketing (189 emails) -- Updates (156 emails) - -Similarity matching: -- "ClientWork" ↔ "Work": 0.89 → Snap to "Work" -- "Financial" ↔ "Financial": 1.0 → Use "Financial" -- "Marketing" ↔ "Marketing": 1.0 → Use "Marketing" -- "SocialMedia" ↔ "Updates": 0.68 → Below threshold (0.7), keep "SocialMedia" -- "Personal" ↔ "Personal": 1.0 → Use "Personal" -- "Tools" → No match → Keep "Tools" - -Final categories: -- Work (snapped from ClientWork) -- Financial -- Marketing -- SocialMedia (new) -- Personal -- Tools (new) - -Cache updated: -- Work: usage_count += 80 -- Financial: usage_count += 45 -- Marketing: usage_count += 30 -- SocialMedia: added with usage_count = 35 -- Personal: usage_count += 20 -- Tools: added with usage_count = 90 - -### Category Cache Structure - -**Purpose**: Maintain consistency across mailboxes - -**File**: `src/models/category_cache.json` - -**Schema**: -```json -{ - "Work": { - "description": "Business correspondence, meetings, 
projects, client communication", - "embedding": [0.234, -0.456, 0.678, ...], // 384 dims - "created_at": "2025-10-20T10:30:00Z", - "last_seen": "2025-10-25T14:22:00Z", - "usage_count": 867, - "aliases": ["Business", "ClientWork", "Professional"] - }, - "Financial": { - "description": "Invoices, bills, statements, payments, banking", - "embedding": [0.123, -0.789, 0.345, ...], - "created_at": "2025-10-20T10:30:00Z", - "last_seen": "2025-10-25T14:22:00Z", - "usage_count": 423, - "aliases": ["Finance", "Billing", "Invoices"] - }, - ... -} -``` - -**Fields**: -- **description**: Human-readable explanation -- **embedding**: Semantic embedding of description (for similarity matching) -- **created_at**: When first discovered -- **last_seen**: Most recent usage -- **usage_count**: Total emails across all users -- **aliases**: Alternative names that map to this category - -### Similarity Matching Algorithm - -**Goal**: Determine if new category matches cached category - -**Method**: Cosine similarity of embeddings - -```python -def calculate_similarity(new_category, cached_category): - new_emb = embed(new_category['description']) - cached_emb = cached_category['embedding'] - - # Cosine similarity - similarity = np.dot(new_emb, cached_emb) / ( - np.linalg.norm(new_emb) * np.linalg.norm(cached_emb) - ) - - return similarity - -def find_best_match(new_category, cache, threshold=0.7): - best_match = None - best_score = 0.0 - - for cached_name, cached_data in cache.items(): - score = calculate_similarity(new_category, cached_data) - if score > best_score: - best_score = score - best_match = cached_name - - if best_score >= threshold: - return best_match, best_score - else: - return None, best_score -``` - -**Thresholds**: -- 0.9-1.0: Definitely same category -- 0.7-0.9: Probably same category (snap) -- 0.5-0.7: Possibly related (don't snap, but log) -- 0.0-0.5: Different categories - -**Example Similarities**: -``` -"Work" ↔ "Business": 0.92 (snap) -"Work" ↔ "ClientWork": 0.88 (snap) -"Work" ↔ "Professional": 0.85 (snap) -"Work" ↔ "Personal": 0.15 (different) -"Work" ↔ "Finance": 0.32 (different) -"Work" ↔ "Meetings": 0.68 (borderline, don't snap) -``` - -### Cache Update Strategy - -**Conservative**: Don't pollute cache with noise - -**Rules**: -1. **High Usage**: Category must be used for 10+ emails to be cache-worthy -2. **LLM Approval**: Must be explicitly discovered by LLM (not user-created) -3. **Uniqueness**: Must be sufficiently different from existing (similarity < 0.7) -4. 
**Limit**: Max 3 new categories per mailbox (prevent explosion) - -**Update Process**: -```python -def update_cache(cache, discovered_categories, email_labels): - category_counts = Counter(cat for _, cat in email_labels) - - for cat, desc in discovered_categories.items(): - if cat in cache: - # Update existing - cache[cat]['last_seen'] = now() - cache[cat]['usage_count'] += category_counts.get(cat, 0) - else: - # Add new (if cache-worthy) - if category_counts.get(cat, 0) >= 10: # Min 10 emails - cache[cat] = { - 'description': desc, - 'embedding': embed(desc), - 'created_at': now(), - 'last_seen': now(), - 'usage_count': category_counts.get(cat, 0), - 'aliases': [] - } - - save_cache(cache) -``` - -### Category Evolution - -**Cache grows over time**: - -After 1 user: -- 5 categories (discovered fresh) - -After 10 users: -- 8 categories (5 original + 3 new) -- 92% of new mailboxes snap to existing - -After 100 users: -- 12 categories (core set stabilized) -- 97% of new mailboxes snap to existing - -After 1000 users: -- 15 categories (long tail of specialized needs) -- 99% of new mailboxes snap to existing - -**Cache represents collective knowledge of what categories are useful.** - -### Category Verification - -**Feature**: `--verify-categories` flag - -**Purpose**: Check if cached model categories fit new mailbox - -**Process**: -1. Sample 20 emails from new mailbox -2. Single LLM call: "Do these categories fit this mailbox?" -3. LLM responds: GOOD_MATCH, POOR_MATCH, or UNCERTAIN -4. If POOR_MATCH, suggest new categories - -**Example Output**: -``` -Verifying model categories... - -Model categories: -- Work: Business correspondence, meetings, projects -- Financial: Invoices, bills, statements -- Marketing: Campaigns, analytics, advertising -- Personal: Friends and family -- Updates: Newsletters, product updates - -Sample emails: -1. From: admin@university.edu - "Course Schedule for Fall 2025" -2. From: assignments@lms.edu - "Assignment 3 Due Next Week" -[... 18 more ...] - -Verdict: POOR_MATCH (confidence: 0.85) - -Reasoning: Mailbox appears to be a student inbox. Suggested categories: -- Coursework: Lectures, readings, course materials -- Assignments: Homework, projects, submissions -- Administrative: Registration, financial aid, campus announcements -- Clubs: Student organizations, events -- Personal: Friends and family - -Recommendation: Run full calibration for better accuracy. -``` - -**Cost**: One LLM call (~20 seconds, $0.01) - -**Value**: Avoids poor classification from model mismatch - ---- - -## Testing Infrastructure - -While the system is currently in MVP status, a testing framework has been established to ensure reliability as the codebase grows. 
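-
-As an illustration of that framework, a unit test for the classifier invariants listed below might look like this sketch (illustrative only: the `classifier` and `sample_emails` fixtures and the `predict` interface are stand-ins, not the project's exact API):
-
-```python
-# Illustrative category set; the real set comes from calibration
-VALID_CATEGORIES = {"Work", "Financial", "Marketing", "Personal", "Updates"}
-
-def test_prediction_invariants(classifier, sample_emails):
-    """Every email classifies without crashing, with bounded confidence."""
-    for email in sample_emails:
-        result = classifier.predict(email)
-        assert 0.0 <= result.confidence <= 1.0  # confidence stays in range
-        assert result.category in VALID_CATEGORIES
-```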
- -### Test Structure - -**Test Files**: -- `tests/conftest.py`: Pytest fixtures and shared test utilities -- `tests/test_classifiers.py`: Unit tests for ML and LLM classifiers -- `tests/test_feature_extraction.py`: Feature extractor validation -- `tests/test_e2e_pipeline.py`: End-to-end workflow tests -- `tests/test_integration.py`: Provider integration tests - -### Test Data - -**Mock Provider**: Generates synthetic emails for testing -- Configurable email counts -- Various categories represented -- Realistic metadata (timestamps, domains, patterns) -- No external dependencies - -**Enron Dataset**: Real-world test corpus -- 500,000+ actual emails -- Natural language variation -- Folder structure provides ground truth -- Reproducible results - -### Testing Philosophy - -**Unit Tests**: Test individual components in isolation -- Feature extraction produces expected dimensions -- Pattern detection matches known patterns -- ML model loads and predicts -- LLM provider handles errors gracefully - -**Integration Tests**: Test component interactions -- Email provider → Feature extractor → Classifier pipeline -- Calibration workflow produces valid model -- Results export to correct format - -**End-to-End Tests**: Test complete user workflows -- Run classification on sample dataset -- Verify results accuracy -- Check performance benchmarks -- Validate output format - -**Property-Based Tests**: Test invariants -- All emails get classified (no crashes) -- Confidence always between 0 and 1 -- Category always in valid set -- Feature vectors always same dimensions - -### Testing Challenges - -**LLM Testing**: LLMs are non-deterministic -- Use low temperature for consistency -- Test error handling, not exact outputs -- Mock LLM responses for unit tests -- Use real LLM for integration tests - -**Performance Testing**: Hardware-dependent -- Report relative speedups, not absolute times -- Test batch vs sequential (should be faster) -- Test threading utilization -- Monitor memory usage - -**Accuracy Testing**: Ground truth is noisy -- Enron folder names approximate true category -- Accept accuracy within range (70-95%) -- Test consistency (same results on re-run) -- Human evaluation on sample - -### Current Test Coverage - -**Estimated Coverage**: ~60% of critical paths - -**Well-Tested**: -- Feature extraction (embeddings, patterns, structural) -- Hard rules matching -- Configuration loading and validation -- Email provider interface compliance - -**Needs More Tests**: -- LLM calibration workflow -- Category consolidation -- Category caching and similarity matching -- Error recovery paths - -### Running Tests - -**Full Test Suite**: -```bash -pytest tests/ -``` - -**Specific Test File**: -```bash -pytest tests/test_classifiers.py -``` - -**With Coverage**: -```bash -pytest --cov=src tests/ -``` - -**Fast Tests Only** (skip slow integration tests): -```bash -pytest -m "not slow" tests/ -``` - ---- - -## Data Flow - -Understanding how data flows through the system is critical for debugging and optimization. 
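-
-In code terms, the classification path described below composes roughly like this (an illustrative sketch; `rules`, `ml`, and `llm` are stand-in components, not the actual module API):
-
-```python
-def classify_all(emails, extractor, rules, ml, llm, threshold=0.55, use_llm=True):
-    """Stages 2-5: batched features, hard rules, ML prediction, LLM fallback."""
-    features = extractor.extract_batch(emails)       # Stage 2: 512 emails per batch
-    results = []
-    for email, feats in zip(emails, features):
-        result = rules.match(email)                  # Stage 3: regex rules, confidence 0.99
-        if result is None:
-            result = ml.predict(feats)               # Stage 4: LightGBM probabilities
-            if use_llm and result.confidence < threshold:
-                result = llm.classify(email)         # Stage 5: review uncertain cases
-        results.append(result)
-    return results                                   # Stage 6: caller exports to JSON/CSV
-```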
- -### Classification Data Flow - -**Input**: Raw email from provider - -**Stage 1: Email Retrieval** -``` -Provider API/Dataset - ↓ -Email objects (id, subject, sender, body, metadata) - ↓ -List[Email] -``` - -**Stage 2: Feature Extraction** -``` -List[Email] - ↓ -Batch emails (512 per batch) - ↓ -Extract structural features (per email, fast) - ↓ -Extract patterns (per email, regex) - ↓ -Batch embed texts (512 texts → Ollama API → 512 embeddings) - ↓ -List[Dict[str, Any]] (features per email) -``` - -**Stage 3: Hard Rules Check** -``` -Email + Features - ↓ -Pattern matching (regex) - ↓ -Match found? → ClassificationResult (confidence=0.99, method='rule') - ↓ -No match → Continue to ML -``` - -**Stage 4: ML Classification** -``` -Features (embedding + structural + patterns) - ↓ -LightGBM model prediction - ↓ -Probability distribution over categories - ↓ -Max probability = confidence - ↓ -Confidence >= threshold? - ↓ Yes -ClassificationResult (confidence=0.55-1.0, method='ml') - ↓ No -Queue for LLM (if enabled) -``` - -**Stage 5: LLM Review** (optional) -``` -Email metadata + ML prediction - ↓ -LLM prompt construction - ↓ -LLM API call (Ollama/OpenAI) - ↓ -JSON response parsing - ↓ -ClassificationResult (confidence=0.8-0.95, method='llm') -``` - -**Stage 6: Results Export** -``` -List[ClassificationResult] - ↓ -Aggregate statistics (rules/ML/LLM breakdown) - ↓ -JSON serialization - ↓ -Write to output directory - ↓ -Optional: Sync labels back to provider -``` - -### Calibration Data Flow - -**Input**: Raw emails from new mailbox - -**Stage 1: Sampling** -``` -All emails - ↓ -Group by sender domain - ↓ -Stratified sample (3% of total, min 250, max 1500) - ↓ -Split: Training (90%) + Validation (10%) -``` - -**Stage 2: LLM Discovery** -``` -Training emails - ↓ -Batch into groups of 20 - ↓ -For each batch: - Calculate statistics (domains, keywords, patterns) - Build prompt with statistics + email summaries - LLM analyzes and returns categories + labels - ↓ -Merge all batch results - ↓ -Categories discovered + Email labels -``` - -**Stage 3: Consolidation** (if >10 categories) -``` -Discovered categories - ↓ -Build consolidation prompt - ↓ -LLM merges overlapping categories - ↓ -Returns mapping (old → new) - ↓ -Update email labels with consolidated categories -``` - -**Stage 4: Category Caching** -``` -Discovered categories - ↓ -Calculate embeddings for each category description - ↓ -Compare to cached categories (cosine similarity) - ↓ -Similarity >= 0.7? → Snap to cached -Similarity < 0.7 and new_count < 3? 
→ Keep as new - ↓ -Update cache with usage counts - ↓ -Final category set -``` - -**Stage 5: Feature Extraction** -``` -Labeled training emails - ↓ -Batch feature extraction (same as classification) - ↓ -Training features + labels -``` - -**Stage 6: Model Training** -``` -Training features + labels - ↓ -Create LightGBM dataset - ↓ -Train model (200 rounds, early stopping, 28 threads) - ↓ -Validate on held-out set - ↓ -Serialize model + metadata - ↓ -Save to src/models/calibrated/classifier.pkl -``` - -### Data Persistence - -**Temporary Data** (session-only): -- Fetched emails (in memory) -- Extracted features (in memory) -- Classification results (in memory until export) - -**Cached Data** (persistent): -- Category cache (src/models/category_cache.json) -- Trained model (src/models/calibrated/classifier.pkl) -- OAuth tokens (token.json for Gmail/Outlook) - -**Exported Data** (user-visible): -- Results JSON (results/results.json) -- Results CSV (results/results.csv) -- By-category results (results/by_category/*) -- Logs (logs/email-sorter.log) - -**Never Stored**: -- Raw email content (unless user explicitly saves) -- Passwords or sensitive credentials -- LLM API keys (environment variables only) - ---- - -## Critical Implementation Decisions - -Several key decisions shaped the system's architecture and performance. - -### Decision 1: Ollama for Embeddings (Not sentence-transformers) - -**Options Considered**: -1. sentence-transformers library (standard approach) -2. Ollama embedding API -3. OpenAI embedding API - -**Choice**: Ollama embedding API - -**Rationale**: -- sentence-transformers downloads 90MB model on every run (90s overhead) -- Ollama caches model locally (instant loading after first pull) -- Same underlying model (all-minilm:l6-v2) -- Ollama already required for LLM, no extra dependency -- Local processing (no API costs, no privacy concerns) - -**Trade-offs**: -- Requires Ollama running (extra service dependency) -- Slightly slower than native sentence-transformers (network overhead) -- But overall faster considering model loading time - -### Decision 2: LightGBM Over Other ML Algorithms - -**Options Considered**: -- Logistic Regression (too simple) -- Random Forest (good but slow) -- XGBoost (excellent but slower) -- Neural Network (overkill) -- Transformer (way overkill) - -**Choice**: LightGBM - -**Rationale**: -- Fastest training and inference among competitive algorithms -- Excellent accuracy (92% validation) -- Small model size (1.8MB) -- Handles mixed feature types naturally -- Mature and battle-tested - -**Trade-offs**: -- Slightly less accurate than XGBoost (1% difference) -- Less interpretable than decision trees -- But speed advantage dominates for this use case - -### Decision 3: Threshold 0.55 (Not 0.75) - -**Options Considered**: -- 0.75 (conservative, more LLM calls) -- 0.65 (balanced) -- 0.55 (aggressive, fewer LLM calls) -- 0.45 (too aggressive) - -**Choice**: 0.55 - -**Rationale**: -- Reduces LLM fallback from 35% to 21% (40% reduction) -- Only 3% accuracy loss (95% → 92%) -- 12x speedup in fast mode -- Most users prefer speed over marginal accuracy - -**Trade-offs**: -- Lower confidence threshold accepts more uncertain predictions -- But empirical testing shows 92% is still excellent - -### Decision 4: Batch Size 512 (Not 256 or 1024) - -**Options Considered**: -- 128, 256, 512, 1024, 2048 - -**Choice**: 512 - -**Rationale**: -- 7.5x speedup over sequential (vs 5.6x for 256) -- Only 6% slower than 1024 -- Fits comfortably in memory -- Works well with Ollama 
API limits - -**Trade-offs**: -- Larger batches (1024+) slightly faster but diminishing returns -- Smaller batches (256) more flexible but 25% slower - -### Decision 5: LLM-Driven Calibration (Not Manual Labeling) - -**Options Considered**: -1. Manual labeling (hire humans) -2. Active learning (iterative user labeling) -3. Transfer learning (use pre-trained model) -4. LLM-driven calibration - -**Choice**: LLM-driven calibration - -**Rationale**: -- Manual labeling: Too expensive and slow ($1000s, weeks) -- Active learning: Still requires hundreds of user labels -- Transfer learning: Gmail categories don't fit all inboxes -- LLM: Automatic, fast (3 minutes), adapts to each inbox - -**Trade-offs**: -- LLM cost (~$0.15 per calibration) -- LLM errors propagate to ML model -- But benefits massively outweigh costs - -### Decision 6: Category Caching (Not Fresh Discovery Every Time) - -**Options Considered**: -1. Fresh category discovery per mailbox -2. Global shared categories (hardcoded) -3. Category cache with similarity matching - -**Choice**: Category cache with similarity matching - -**Rationale**: -- Fresh discovery: Inconsistent naming across users -- Global categories: Too rigid, doesn't adapt -- Caching: Best of both worlds (consistency + flexibility) - -**Trade-offs**: -- Cache can become stale -- Similarity matching can mis-snap -- But 97% of mailboxes benefit from consistency - -### Decision 7: Three-Tier Strategy (Not Pure ML or Pure LLM) - -**Options Considered**: -1. Pure rule-based (too brittle) -2. Pure ML (requires labeled data) -3. Pure LLM (too slow and expensive) -4. Two-tier (ML + LLM) -5. Three-tier (Rules + ML + LLM) - -**Choice**: Three-tier strategy - -**Rationale**: -- Rules catch 5-10% obvious cases instantly -- ML handles 70-85% with good confidence -- LLM reviews 0-20% uncertain cases -- User can disable LLM tier for speed - -**Trade-offs**: -- More complex architecture -- Three components to maintain -- But performance and flexibility benefits are enormous - -### Decision 8: Click CLI (Not argparse or Custom) - -**Options Considered**: -- argparse (Python standard library) -- Click (third-party but popular) -- Custom CLI framework - -**Choice**: Click - -**Rationale**: -- Automatic help generation -- Type validation -- Nested commands -- Better UX than argparse -- Industry standard (used by Flask, etc.) - -**Trade-offs**: -- Extra dependency -- But improves user experience dramatically - ---- - -## Security and Privacy - -Email data is highly sensitive. The system prioritizes security and privacy throughout. - -### Threat Model - -**Threats Considered**: - -1. **Email Content Exposure**: Emails contain sensitive information -2. **Credential Theft**: OAuth tokens, passwords, API keys -3. **Model Extraction**: Trained model reveals information about emails -4. **LLM Provider Trust**: Ollama/OpenAI could log prompts -5. **Local File Access**: Classified results stored locally - -### Security Measures - -**1. Local-First Processing** - -All processing happens locally: -- Emails never uploaded to cloud (except OAuth auth flow) -- ML inference runs locally -- LLM runs locally via Ollama (recommended) -- Only embeddings sent to Ollama (not full email content) - -**2. Credential Management** - -Secure credential storage: -- OAuth tokens stored locally (token.json) -- File permissions: 600 (owner read/write only) -- Never logged or printed -- Never committed to git (.gitignore) - -**3. 
Email Provider Authentication** - -Best practices followed: -- Gmail: OAuth 2.0 (no passwords stored) -- Outlook: OAuth 2.0 with device flow -- IMAP: Credentials in encrypted storage (user responsibility) -- Tokens refreshed automatically - -**4. LLM Privacy** - -Minimal data sent to LLM: -- Only email metadata (subject, sender, snippet) -- No full bodies sent to LLM -- Local Ollama recommended (no external calls) -- OpenAI support for those who accept risk - -**5. Model Privacy** - -Models don't leak email content: -- LightGBM doesn't memorize training data -- Embeddings are abstract semantic vectors -- Category cache only stores category names, not emails - -**6. File System Security** - -Careful file handling: -- Results stored in user-specified directory -- No world-readable files created -- Logs sanitized (no email content) -- Temporary files cleaned up - -### Privacy Considerations - -**What's Stored**: -- Category cache (category names and descriptions) -- Trained model (abstract ML model, no email text) -- Classification results (email IDs and categories, no content) -- Logs (errors and statistics, no email content) - -**What's NOT Stored**: -- Raw email content (unless user explicitly saves) -- Email bodies or attachments -- Sender personal information (beyond what's in email ID) -- OAuth passwords (only tokens) - -**What's Sent to External Services**: - -**Ollama (Local)**: -- Embedding texts (structured metadata + snippets) -- LLM prompts (email summaries, no full content) -- Controllable: User can inspect Ollama logs - -**Gmail/Outlook APIs**: -- OAuth authentication flow -- Email fetch requests -- Label update requests -- Standard OAuth security - -**OpenAI (If Used)**: -- Email metadata and snippets -- User accepts OpenAI privacy policy -- Can be disabled with Ollama - -### Compliance Considerations - -**GDPR (EU)**: -- Email processing is local (no data transfer) -- Users control data retention -- Easy to delete all data (delete results directory) -- OAuth tokens can be revoked - -**HIPAA (Healthcare)**: -- Not HIPAA compliant out of box -- But local processing helps -- Healthcare users should use Ollama (not OpenAI) -- Audit logs available - -**SOC 2 (Enterprise)**: -- Local processing reduces compliance scope -- Access controls needed (file permissions) -- Audit trail in logs -- Encryption at rest (user responsibility) - -### Security Best Practices for Users - -**Recommendations**: - -1. **Use Ollama** (not OpenAI) for sensitive data -2. **Encrypt disk** where results stored -3. **Review permissions** on results directory -4. **Revoke OAuth tokens** after use -5. **Clear logs** periodically -6. **Don't commit credentials** to git -7. **Run in virtual environment** (isolation) -8. **Update dependencies** regularly - -### Known Security Limitations - -**Not Addressed**: -- Email provider compromise (out of scope) -- Local machine compromise (OS responsibility) -- Ollama server compromise (trust Ollama project) -- Social engineering (user responsibility) - -**Requires User Action**: -- Secure OAuth credentials file -- Protect results directory -- Manage Ollama access controls -- Monitor API usage (if using OpenAI) - ---- - -## Known Limitations and Trade-offs - -Every design involves trade-offs. Here are the system's known limitations and why they exist. 
- -### Limitation 1: English Language Only - -**Issue**: System optimized for English emails - -**Why**: -- Embedding model trained primarily on English -- Pattern detection uses English keywords -- LLM prompts in English - -**Impact**: -- Non-English emails may classify poorly -- Mixed language emails confuse patterns - -**Workarounds**: -- Multilingual embedding models exist (sentence-transformers) -- LLM can handle multiple languages -- Pattern detection could be disabled - -**Future**: Support for multilingual models planned - -### Limitation 2: No Real-Time Classification - -**Issue**: Batch processing only, not real-time - -**Why**: -- Designed for backlog cleanup (10k-100k emails) -- Batching critical for performance -- Real-time requires different architecture - -**Impact**: -- Can't classify emails as they arrive -- Must fetch all emails first - -**Workarounds**: -- Incremental mode (fetch new emails only) -- Periodic batch runs (cron job) - -**Future**: Real-time mode under consideration - -### Limitation 3: Model Requires Recalibration Per Mailbox - -**Issue**: One model per mailbox, not universal - -**Why**: -- Each mailbox has unique patterns -- Categories differ by user -- Transfer learning attempted but failed - -**Impact**: -- 3-minute calibration per mailbox -- Can't share models between users - -**Workarounds**: -- Category caching reuses concepts -- Fast calibration (3 minutes acceptable) - -**Future**: Universal model research ongoing - -### Limitation 4: Attachment Analysis Limited - -**Issue**: Doesn't deeply analyze attachment content - -**Why**: -- PDF/DOCX extraction complex -- OCR for images expensive -- Adds significant processing time - -**Impact**: -- Invoice in attachment might be missed -- Contract classification relies on subject/body - -**Workarounds**: -- Pattern detection catches common cases -- Filename analysis helps -- Full content extraction optional - -**Future**: Deep attachment analysis planned - -### Limitation 5: No Thread Understanding - -**Issue**: Each email classified independently - -**Why**: -- Email threads span multiple messages -- Context from previous emails ignored -- Thread reconstruction complex - -**Impact**: -- Reply in conversation might be misclassified -- "Re: Dinner plans" context lost - -**Workarounds**: -- Subject line preserves some context -- LLM can reason about conversation hints - -**Future**: Thread-aware classification considered - -### Limitation 6: Accuracy Ceiling at 95% - -**Issue**: Even with LLM, 95% accuracy not exceeded - -**Why**: -- Some emails genuinely ambiguous -- Noisy ground truth in test data -- Edge cases always exist - -**Impact**: -- 5% of emails need manual review -- Perfect classification impossible - -**Workarounds**: -- Confidence scores help identify uncertain cases -- User can manually reclassify - -**Future**: Active learning could improve - -### Limitation 7: Gmail/Outlook Providers Not Fully Tested - -**Issue**: Real Gmail/Outlook integration unverified - -**Why**: -- OAuth setup complex -- Test accounts not available -- Enron dataset sufficient for MVP - -**Impact**: -- May have bugs with real accounts -- Rate limiting not tested -- Error handling incomplete - -**Workarounds**: -- Stub implementations ready -- Error handling in place - -**Future**: Real-world testing in Phase 2 - -### Limitation 8: No Web Dashboard - -**Issue**: CLI only, no GUI - -**Why**: -- MVP focus on core functionality -- Web dashboard is separate concern -- CLI faster to implement - -**Impact**: -- Less user-friendly 
for non-technical users -- Results in JSON/CSV (need tools to visualize) - -**Workarounds**: -- JSON easily parsed -- CSV opens in Excel/Google Sheets - -**Future**: Web dashboard in Phase 3 - -### Limitation 9: Single User Only - -**Issue**: No multi-user or team features - -**Why**: -- Designed for individual use -- No database or user management -- Local file storage only - -**Impact**: -- Can't share classifications -- Can't collaborate on categories -- Each user maintains own models - -**Workarounds**: -- Category cache provides some consistency -- Can share trained models manually - -**Future**: Team features in Phase 4 - -### Limitation 10: No Active Learning - -**Issue**: Doesn't learn from user corrections - -**Why**: -- Requires feedback loop -- Model retraining on each correction expensive -- User interface for feedback not built - -**Impact**: -- Model accuracy doesn't improve over time -- User corrections not leveraged - -**Workarounds**: -- Can re-run calibration periodically -- Manual model updates possible - -**Future**: Active learning high priority - -### Trade-off Summary - -**Speed vs Accuracy**: -- Chose: Configurable (fast mode vs hybrid mode) -- Trade-off: Users decide per use case - -**Privacy vs Convenience**: -- Chose: Local-first (privacy) -- Trade-off: Setup more complex (Ollama installation) - -**Flexibility vs Simplicity**: -- Chose: Flexible (dynamic categories) -- Trade-off: More complex than hardcoded - -**Universal vs Custom**: -- Chose: Custom (per-mailbox calibration) -- Trade-off: Can't share models directly - -**Features vs Stability**: -- Chose: Stability (MVP feature set) -- Trade-off: Missing some nice-to-haves - ---- - -## Evolution and Learning - -The system evolved significantly through iteration and learning. - -### Version History - -**v0.1 - Proof of Concept** (Week 1) -- Basic rule-based classification -- Hardcoded categories -- Single email processing -- 10 emails/sec, 65% accuracy - -**v0.2 - ML Integration** (Week 2) -- Added LightGBM classifier -- Manual labeling of 500 emails -- Sequential processing -- 50 emails/sec, 82% accuracy - -**v0.3 - LLM Calibration** (Week 3) -- LLM-driven category discovery -- Automatic labeling -- Still sequential processing -- 1.6 emails/sec (LLM bottleneck), 95% accuracy - -**v0.4 - Batched Embeddings** (Week 4) -- Batched feature extraction -- 7.5x speedup -- 40 emails/sec, 95% accuracy - -**v0.5 - Threshold Optimization** (Week 5) -- Lowered threshold to 0.55 -- Added --no-llm-fallback mode -- Fast mode: 423 emails/sec, 73% accuracy -- Hybrid mode: 38 emails/sec, 93% accuracy - -**v1.0 - MVP** (Week 6) -- Category caching -- Category verification -- Multi-provider support (Gmail, Outlook, IMAP stubs) -- Clean architecture -- Comprehensive documentation - -### Key Learnings - -**Learning 1: Batching Changes Everything** - -Early system processed one email at a time. Obvious in hindsight, but batching embeddings provided 7.5x speedup. Lesson: Always batch API calls. - -**Learning 2: LLM for Calibration, ML for Inference** - -Initially tried pure LLM (too slow) and pure ML (no training data). Hybrid approach unlocked both: LLM discovers categories once, ML classifies fast repeatedly. - -**Learning 3: Dynamic Categories Beat Hardcoded** - -Hardcoded categories (junk, work, personal) failed for many users. Letting LLM discover categories per mailbox dramatically improved relevance. 
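-
-The mechanics behind that learning are simple: the training step is label-agnostic, so whatever categories the LLM discovers become the model's classes (a minimal sketch, assuming `X` is the feature matrix and `llm_labels` holds the LLM-assigned (email_id, category) pairs):
-
-```python
-import lightgbm as lgb
-
-def train_on_discovered(X, llm_labels):
-    """Train a multiclass model over whatever categories were discovered."""
-    label_space = sorted({cat for _, cat in llm_labels})   # no hardcoded category list
-    y = [label_space.index(cat) for _, cat in llm_labels]
-    params = {"objective": "multiclass", "num_class": len(label_space)}
-    model = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=200)
-    return model, label_space
-```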
- -**Learning 4: Threshold Matters More Than Algorithm** - -Spent days trying different ML algorithms (Random Forest, XGBoost, LightGBM). Accuracy varied by 2-3%. Then adjusted threshold from 0.75 to 0.55 and got 12x speedup. Lesson: Tune hyperparameters before switching algorithms. - -**Learning 5: Category Cache Prevents Chaos** - -Without caching, each mailbox got different category names for same concepts. "Work" vs "Business" vs "Professional" frustrated users. Category cache with similarity matching solved this. - -**Learning 6: Users Want Speed AND Accuracy** - -Initially forced choice: fast (ML) or accurate (LLM). Users wanted both. Solution: Make it configurable with --no-llm-fallback flag. - -**Learning 7: Real Data Is Messy** - -Enron dataset has "sent" folder with work emails, personal emails, and junk. Ground truth is noisy. Can't achieve 100% accuracy when labels are wrong. Lesson: Accept 90-95% as excellent. - -**Learning 8: Embeddings Are Powerful** - -Pattern detection and structural features help, but embeddings do most of the heavy lifting. Semantic understanding captures meaning beyond keywords. - -**Learning 9: Category Consolidation Necessary** - -LLM naturally discovers 10-15 categories. Too many confuses users. Consolidation step merges overlapping categories to 5-10. Lesson: More isn't always better. - -**Learning 10: Local-First Architecture Simplifies** - -Initially planned cloud deployment. Switched to local-first (Ollama, local ML). Privacy benefits plus simpler architecture. Users can run without internet. - -### Mistakes and Corrections - -**Mistake 1: Tried sentence-transformers First** - -Spent day debugging slow model loading. Switched to Ollama embeddings, problem solved. Should have profiled first. - -**Mistake 2: Over-Engineered Category System** - -Built complex category hierarchy with subcategories. Users confused. Simplified to flat categories. Lesson: KISS principle. - -**Mistake 3: Didn't Test Batching Early** - -Built entire sequential pipeline before testing batching. Would have saved days if batched from start. Lesson: Test performance-critical paths first. - -**Mistake 4: Assumed Gmail Categories Were Universal** - -Designed around Gmail categories (Primary, Social, Promotions). Realized most users have different needs. Pivoted to dynamic discovery. - -**Mistake 5: Ignored Model Path Confusion** - -Two model directories (calibrated/ and pretrained/) caused bugs. Should have had single authoritative path. Documented workaround but debt remains. - -### Insights from Enron Dataset - -**Enron Revealed**: - -1. **Business emails dominate** (60%): Work, meetings, reports -2. **Folder structure imperfect**: "sent" has all types -3. **Lots of forwards**: "Fwd: Fwd: Fwd:" common -4. **Short subjects**: Average 40 characters -5. **Timestamps matter**: Automated emails at midnight -6. **Domain patterns**: Corporate domains = work, gmail = maybe personal -7. **Pattern consistency**: Invoices always have "Invoice #", OTPs always 6 digits -8. **Ambiguity unavoidable**: "Lunch meeting?" is work or personal? - -**Enron's Value**: -- Real-world complexity -- Large enough for ML training -- Public domain (no privacy issues) -- Deterministic (same results every run) -- Ground truth (imperfect but useful) - -### Community Feedback - -**If Released Publicly** (hypothetical): - -**Expected Positive Feedback**: -- "Finally, local email classification!" 
-- "LLM calibration is genius" -- "Fast mode is incredibly fast" -- "Works on my unique mailbox" - -**Expected Negative Feedback**: -- "Why no real-time mode?" -- "Accuracy could be higher" -- "CLI is intimidating" -- "Setup is complex (Ollama, OAuth)" - -**Expected Feature Requests**: -- Web dashboard -- Mobile app -- Gmail plugin -- Active learning -- Multi-language support -- Thread understanding - ---- - -## Future Roadmap - -The system has a clear roadmap for future development. - -### Phase 2: Real-World Integration (Q1 2026) - -**Goals**: Production-ready for real users - -**Features**: -1. **Fully Tested Gmail Provider** - - OAuth flow tested with real accounts - - Rate limiting handled - - Batch operations optimized - - Error recovery robust - -2. **Fully Tested Outlook Provider** - - Microsoft Graph API fully implemented - - Device flow tested - - Categories sync working - - Multi-account tested - -3. **Email Syncing** - - Apply classifications back to mailbox - - Create/update labels in Gmail - - Set categories in Outlook - - Move to folders in IMAP - - Dry-run mode for safety - -4. **Incremental Classification** - - Fetch only new emails (since last run) - - Update existing classifications - - Detect mailbox changes - - Efficient sync - -5. **Multi-Account Support** - - Classify multiple accounts in parallel - - Share categories across accounts (optional) - - Unified results view - - Account-specific models - -**Timeline**: 2-3 months - -**Success Criteria**: -- 100 real users successfully classify mailboxes -- Gmail and Outlook providers work flawlessly -- Email syncing tested and verified -- Performance maintained at scale - -### Phase 3: Production Ready (Q2 2026) - -**Goals**: Stable, polished product - -**Features**: -1. **Web Dashboard** - - Visualize classification results - - Browse emails by category - - Manually reclassify emails - - View confidence scores - - Export reports - -2. **Active Learning** - - User corrects classification - - System learns from correction - - Model improves over time - - Feedback loop closes - -3. **Custom Category Training** - - User defines custom categories - - Provides example emails - - System fine-tunes model - - Per-user personalization - -4. **Performance Tuning** - - Local sentence-transformers (2-5s embeddings) - - GPU acceleration (if available) - - Larger batch sizes (1024-2048) - - Parallel LLM calls - -5. **Enhanced Testing** - - 90%+ code coverage - - Integration test suite - - Performance benchmarks - - Regression tests - -**Timeline**: 3-4 months - -**Success Criteria**: -- 1000+ users -- Web dashboard used by 80% of users -- Active learning improves accuracy by 5% -- 95% test coverage - -### Phase 4: Enterprise Features (Q3-Q4 2026) - -**Goals**: Enterprise-ready deployment - -**Features**: -1. **Multi-Language Support** - - Multilingual embedding models - - Pattern detection in multiple languages - - LLM prompts localized - - UI in multiple languages - -2. **Team Collaboration** - - Shared categories across team - - Collaborative training - - Role-based access - - Team analytics - -3. **Federated Learning** - - Learn from multiple users - - Privacy-preserving updates - - Collective intelligence - - No data sharing - -4. **Real-Time Filtering** - - Classify emails as they arrive - - Gmail/Outlook webhooks - - Real-time API - - Low-latency mode - -5. **Advanced Analytics** - - Email trends over time - - Sender analysis - - Response time tracking - - Productivity insights - -6. 
**API and Integrations** - - REST API for classifications - - Zapier integration - - IFTTT support - - Slack notifications - -**Timeline**: 6-8 months - -**Success Criteria**: -- 10+ enterprise customers -- Multi-language tested in 5 languages -- Real-time mode <1s latency -- API documented and stable - -### Research Directions (2027+) - -**Long-term Explorations**: - -1. **Universal Email Model** - - One model for all mailboxes - - Transfer learning across users - - Continual learning - - Breakthrough required - -2. **Attachment Deep Analysis** - - OCR for images - - PDF content extraction - - Contract analysis - - Invoice parsing - -3. **Thread-Aware Classification** - - Understand email conversations - - Context from previous messages - - Reply classification - - Conversation summarization - -4. **Sentiment Analysis** - - Detect urgent emails - - Identify frustration/joy - - Priority scoring - - Emotional intelligence - -5. **Smart Replies** - - Suggest email responses - - Auto-respond to common queries - - Calendar integration - - Task extraction - -### Community Contributions - -**Open Source Strategy** (if open-sourced): - -**Welcome Contributions**: -- Bug fixes -- Documentation improvements -- Provider implementations (ProtonMail, Yahoo, etc.) -- Translations -- Performance optimizations - -**Guided Contributions**: -- New classification algorithms (with benchmarks) -- Alternative LLM providers -- UI enhancements -- Testing infrastructure - -**Controlled**: -- Core architecture changes -- Breaking API changes -- Security-critical code - -**Community Features**: -- GitHub Issues for bug reports -- Discussions for feature requests -- Pull requests welcome -- Code review process -- Contributor guide - ---- - -## Technical Debt and Refactoring Opportunities - -Like all software, the system has accumulated technical debt that should be addressed. - -### Debt Item 1: Model Path Confusion - -**Issue**: Two model directories (calibrated/ and pretrained/) - -**Why It Exists**: Initially planned separate pre-trained and user-trained models. Architecture changed but dual paths remain. 
- -**Impact**: Confusion about which model loads, copy/paste required - -**Fix**: Single authoritative model path -- Option A: Remove pretrained/, always use calibrated/ -- Option B: Symbolic link from pretrained to calibrated -- Option C: Config setting for model path - -**Priority**: Medium (documented workaround exists) - -### Debt Item 2: Email Provider Interface Inconsistencies - -**Issue**: Providers have slightly different methods and error handling - -**Why It Exists**: Evolved organically, each provider added separately - -**Impact**: Hard to add new providers, inconsistent behavior - -**Fix**: Refactor to strict interface -- Abstract base class with enforcement -- Common error handling -- Shared utility methods -- Provider test suite - -**Priority**: High (blocks new providers) - -### Debt Item 3: Configuration Sprawl - -**Issue**: Config across multiple files (default_config.yaml, categories.yaml, llm_models.yaml) - -**Why It Exists**: Logical separation seemed good initially - -**Impact**: Hard to manage, easy to miss settings - -**Fix**: Consolidate to single config -- Single YAML with sections -- Or config directory with clear structure -- Or database for complex settings - -**Priority**: Low (works fine, just inelegant) - -### Debt Item 4: Hardcoded Strings - -**Issue**: Category names, paths, patterns scattered in code - -**Why It Exists**: MVP expedience - -**Impact**: Hard to internationalize, error-prone - -**Fix**: Constants module -- CATEGORIES, PATTERNS, PATHS in constants.py -- Easy to modify -- Single source of truth - -**Priority**: Medium (i18n blocker) - -### Debt Item 5: Limited Error Recovery - -**Issue**: Some error paths log and exit, don't recover - -**Why It Exists**: Fail-fast philosophy for MVP - -**Impact**: Brittleness, poor user experience - -**Fix**: Graceful degradation -- Retry logic everywhere -- Fallback behaviors -- Partial results better than failure - -**Priority**: High (production blocker) - -### Debt Item 6: Test Coverage Gaps - -**Issue**: ~60% coverage, missing LLM and calibration tests - -**Why It Exists**: Focused on core functionality first - -**Impact**: Refactoring risky, bugs slip through - -**Fix**: Increase coverage to 90%+ -- Mock LLM responses for unit tests -- Integration tests for calibration -- Property-based tests - -**Priority**: High (quality blocker) - -### Debt Item 7: Logging Inconsistency - -**Issue**: Some modules use print(), others use logger - -**Why It Exists**: Quick debugging that stuck around - -**Impact**: Logs incomplete, hard to debug - -**Fix**: Standardize on logger -- Replace all print() with logger -- Consistent log levels -- Structured logging (JSON) - -**Priority**: Medium (debuggability) - -### Debt Item 8: No Async/Await - -**Issue**: All API calls synchronous - -**Why It Exists**: Simpler to implement - -**Impact**: Can't parallelize I/O efficiently - -**Fix**: Async/await for I/O -- asyncio for email fetching -- aiohttp for HTTP calls -- Concurrent LLM calls - -**Priority**: Low (works fine for now) - -### Debt Item 9: Feature Extractor Monolith - -**Issue**: Feature extractor does too much (embeddings, patterns, structural) - -**Why It Exists**: Seemed logical to combine - -**Impact**: Hard to test, hard to extend - -**Fix**: Separate extractors -- EmbeddingExtractor -- PatternExtractor -- StructuralExtractor -- CompositeExtractor combines them - -**Priority**: Medium (modularity) - -### Debt Item 10: No Database - -**Issue**: Everything in files (JSON, pickle) - -**Why It Exists**: Simplicity for 
MVP - -**Impact**: Doesn't scale, no ACID guarantees - -**Fix**: Add database -- SQLite for local deployment -- PostgreSQL for enterprise -- ORM for abstraction - -**Priority**: Low for MVP, High for Phase 4 - -### Refactoring Priorities - -**High Priority** (blocking production): -1. Email provider interface standardization -2. Error recovery improvements -3. Test coverage to 90%+ - -**Medium Priority** (quality improvements): -1. Model path consolidation -2. Hardcoded strings to constants -3. Logging consistency -4. Feature extractor modularization - -**Low Priority** (nice to have): -1. Configuration consolidation -2. Async/await refactor -3. Database migration - -**Technical Debt Paydown Strategy**: -- Allocate 20% of each sprint to debt -- Address high priority items first -- Don't let debt accumulate -- Refactor before adding features - ---- - -## Deployment Considerations - -For users or organizations deploying the system. - -### System Requirements - -**Minimum**: -- CPU: 4 cores -- RAM: 4GB -- Disk: 10GB -- OS: Linux, macOS, Windows (WSL) -- Python: 3.8+ -- Ollama: Latest version - -**Recommended**: -- CPU: 8+ cores (for parallel processing) -- RAM: 8GB+ (for large mailboxes) -- Disk: 20GB+ (for Ollama models) -- SSD: Strongly recommended -- GPU: Optional (not used currently) - -**For 100k Emails**: -- CPU: 16+ cores -- RAM: 16GB+ -- Disk: 50GB+ -- Processing time: 5-10 minutes - -### Installation - -**Steps**: -1. Install Python 3.8+ and pip -2. Install Ollama from ollama.ai -3. Pull required models: `ollama pull all-minilm:l6-v2` and `ollama pull qwen3:4b` -4. Clone repository -5. Create virtual environment: `python -m venv venv` -6. Activate: `source venv/bin/activate` -7. Install dependencies: `pip install -r requirements.txt` -8. Configure email provider credentials -9. 
Run: `python -m src.cli run --source gmail --credentials creds.json` - -**Common Issues**: -- Ollama not running → Start Ollama service -- Credentials invalid → Re-authenticate -- Out of memory → Reduce batch size -- Slow performance → Check CPU usage, consider faster machine - -### Configuration - -**Key Settings to Adjust**: - -**Batch Size** (config/default_config.yaml): -- Default: 512 -- Low memory: 128 -- High memory: 1024-2048 - -**Threshold** (config/default_config.yaml): -- Default: 0.55 -- Higher accuracy: 0.65-0.75 -- Higher speed: 0.45-0.55 - -**Sample Size** (config/default_config.yaml): -- Default: 250-1500 (3% of total) -- Faster calibration: 100-500 -- Better model: 1000-2000 - -**LLM Provider**: -- Local: Ollama (recommended) -- Cloud: OpenAI (set API key) - -### Monitoring - -**Key Metrics**: -- Classification throughput (emails/sec) -- Accuracy (from validation set) -- LLM fallback rate (should be <25%) -- Memory usage (should be <50% of available) -- Error rate (should be <1%) - -**Logging**: -- Default: INFO level -- Debug: --verbose flag -- Location: logs/email-sorter.log -- Rotation: Implement if running continuously - -**Alerting** (for production): -- Throughput drops below 50 emails/sec -- Accuracy drops below 85% -- Error rate above 5% -- Memory usage above 80% - -### Scaling - -**Horizontal Scaling**: -- Run multiple instances for different accounts -- Each instance independent -- Share category cache (optional) - -**Vertical Scaling**: -- More CPU cores → faster ML inference -- More RAM → larger batches -- SSD → faster model loading -- GPU → not utilized currently - -**Bottlenecks**: -- LLM calls (if not disabled) -- Email fetching (API rate limits) -- Feature extraction (embedding API) - -**Optimization Opportunities**: -- Disable LLM fallback (--no-llm-fallback) -- Increase batch size (up to memory limit) -- Use local sentence-transformers (no API overhead) -- Parallel email fetching (multiple accounts) - -### Backup and Recovery - -**What to Backup**: -- Trained models (src/models/calibrated/) -- Category cache (src/models/category_cache.json) -- Classification results (results/) -- OAuth tokens (token.json) -- Configuration files (config/) - -**Backup Strategy**: -- Daily backup of models and cache -- Real-time backup of results (as generated) -- Encrypted backup of OAuth tokens - -**Recovery**: -- Models can be retrained (3 minutes) -- Cache rebuilt from scratch (consistency loss) -- Results irreplaceable (backup critical) -- OAuth tokens can be regenerated (user re-auth) - -### Updates and Maintenance - -**Updating System**: -1. Backup current installation -2. Pull latest code -3. Update dependencies: `pip install -r requirements.txt --upgrade` -4. Test on small dataset -5. Re-run calibration if model format changed - -**Breaking Changes**: -- Model format changes → Re-calibration required -- Config format changes → Migrate config -- API changes → Update integration code - -**Maintenance Tasks**: -- Clear logs monthly -- Update Ollama models quarterly -- Rotate OAuth tokens yearly -- Review and update patterns as spam evolves - ---- - -## Comparative Analysis - -How does Email Sorter compare to alternatives? - -### vs. 
Gmail's Built-In Categories - -**Gmail Approach**: -- Hardcoded categories (Primary, Social, Promotions, Updates, Forums) -- Server-side classification -- Neural network models -- No customization - -**Email Sorter Advantages**: -- Custom categories per user -- Works offline (local processing) -- Privacy (no cloud upload) -- Flexible (can disable LLM) - -**Gmail Advantages**: -- Zero setup -- Real-time classification -- Seamless integration -- Extremely fast -- Trained on billions of emails - -**Verdict**: Gmail better for general use, Email Sorter better for custom needs - -### vs. SaneBox (Commercial Service) - -**SaneBox Approach**: -- Cloud-based classification -- $7-36/month subscription -- AI learns from behavior -- Works with any email provider - -**Email Sorter Advantages**: -- One-time cost (no subscription) -- Privacy (local processing) -- Open source (can audit) -- Custom categories - -**SaneBox Advantages**: -- Polished UI -- Real-time filtering -- Active learning -- Works everywhere (IMAP) -- Customer support - -**Verdict**: SaneBox better for ongoing use, Email Sorter better for one-time cleanup - -### vs. Manual Filters/Rules - -**Manual Rules Approach**: -- User defines rules (if sender = X, label = Y) -- Native to email clients -- Simple and deterministic - -**Email Sorter Advantages**: -- Semantic understanding (not just keywords) -- Discovers categories automatically -- Handles ambiguity -- Scales to thousands of emails - -**Manual Rules Advantages**: -- Perfect accuracy (for well-defined rules) -- No setup beyond rule creation -- Instant -- Native to email client - -**Verdict**: Manual rules better for simple cases, Email Sorter better for complex mailboxes - -### vs. Pure LLM Services (GPT-4 for Every Email) - -**Pure LLM Approach**: -- Send each email to GPT-4 -- Get classification -- High accuracy - -**Email Sorter Advantages**: -- 100x faster (batched ML) -- 50x cheaper (local processing) -- Privacy (no external API) -- Offline capable - -**Pure LLM Advantages**: -- Highest accuracy (95-98%) -- Handles any edge case -- No training required -- Language agnostic - -**Verdict**: Pure LLM better for small datasets (<1000), Email Sorter better for large datasets - -### vs. Traditional ML Classifiers (Naive Bayes, SVM) - -**Traditional ML Approach**: -- TF-IDF features -- Naive Bayes or SVM -- Manual labeling required - -**Email Sorter Advantages**: -- No manual labeling (LLM calibration) -- Semantic embeddings (better features) -- Dynamic categories -- Higher accuracy - -**Traditional ML Advantages**: -- Simpler -- Faster inference (no embeddings) -- Smaller models -- More interpretable - -**Verdict**: Email Sorter better in almost every way (modern approach) - -### Unique Positioning - -**Email Sorter's Niche**: -- Local-first (privacy-conscious users) -- One-time cleanup (10k-100k email backlogs) -- Custom categories (unique mailboxes) -- Fast enough (not real-time but acceptable) -- Accurate enough (90%+ with LLM) -- Open source (auditable, modifiable) - -**Best Use Cases**: -1. Self-employed professionals with email backlog -2. Privacy-focused users -3. Users with unique category needs -4. Researchers (Enron dataset experiments) -5. Developers (extendable platform) - -**Not Ideal For**: -1. Real-time filtering (SaneBox better) -2. General users (Gmail categories better) -3. Enterprise (no team features yet) -4. Non-technical users (CLI intimidating) - ---- - -## Lessons Learned - -Key takeaways from building this system. - -### Technical Lessons - -**1. 
Batch Everything That Can Be Batched** - -Single biggest performance win. Embedding API calls, ML predictions, database queries - batch them all. 7.5x speedup from this alone. - -**2. Profile Before Optimizing** - -Spent days optimizing ML inference (2s → 0.7s). Then realized LLM calls took 4000s. Profile first, optimize bottlenecks. - -**3. User Choice > One-Size-Fits-All** - -Users have different priorities (speed vs accuracy, privacy vs convenience). Provide options (--no-llm-fallback, --verify-categories) rather than forcing one approach. - -**4. LLMs Are Amazing for Few-Shot Learning** - -Using LLM to label 300 emails for ML training is brilliant. Traditional approach requires thousands of manual labels. LLM changes the game. - -**5. Embeddings Capture Semantics Better Than Keywords** - -"Meeting at 3pm" and "Sync tomorrow" have similar embeddings despite different words. TF-IDF would miss this. - -**6. Local-First Simplifies Deployment** - -Initially planned cloud deployment (API, database, auth, scaling). Local-first much simpler and users prefer privacy. - -**7. Testing With Real Data Reveals Issues** - -Enron dataset exposed problems synthetic data didn't: forwarded messages, ambiguous categories, noisy labels. - -**8. Category Discovery Must Be Flexible** - -Hardcoded categories failed for diverse users. LLM discovery per mailbox solved this elegantly. - -**9. Threshold Tuning Often Beats Algorithm Swapping** - -Random Forest vs XGBoost vs LightGBM: 2-3% accuracy difference. Threshold 0.75 vs 0.55: 12x speed difference. - -**10. Documentation Matters** - -Comprehensive CLAUDE.md and this overview document critical for understanding system later. Code documents what, docs document why. - -### Product Lessons - -**1. MVP Is Enough to Prove Concept** - -Didn't need web dashboard, real-time classification, or team features to validate idea. Core functionality sufficient. - -**2. Privacy Is a Feature** - -Local processing not just for technical reasons - users actively want privacy. Market differentiator. - -**3. Performance Perception Matters** - -24 seconds feels instant, 4 minutes feels slow. Both work, but UX dramatically different. - -**4. Configuration Complexity Is Acceptable for Power Users** - -Complex configuration (YAML, thresholds, models) fine for technical users. Would need UI for general users. - -**5. Open Source Enables Auditing** - -For privacy-sensitive application, open source crucial. Users can verify no data leakage. - -### Process Lessons - -**1. Iterate Quickly on Core, Polish Later** - -Built core classification pipeline first. Web dashboard, API, integrations can wait. Ship fast, learn fast. - -**2. Real-World Testing > Synthetic Testing** - -Enron dataset provided real-world complexity. Synthetic emails too clean, missed edge cases. - -**3. Document Decisions in Moment** - -Why chose LightGBM over XGBoost? Forgot reasons weeks later. Document rationale when fresh. - -**4. Technical Debt Is Okay for MVP** - -Model path confusion, hardcoded strings, limited error recovery - all okay for MVP. Can refactor in Phase 2. - -**5. Benchmarking Drives Optimization** - -Without numbers (emails/sec, accuracy %), optimization is guesswork. Measure everything. - -### Surprising Discoveries - -**1. LLM Calibration Works Better Than Expected** - -Expected 80% accuracy from LLM-labeled data. Got 94%. LLMs excellent few-shot learners. - -**2. Threshold 0.55 Optimal** - -Expected 0.7-0.75 optimal. Empirically 0.55 better (marginal accuracy loss, major speed gain). - -**3. 
Category Cache Convergence Fast** - -Expected 100+ users before category cache stable. Converged after 10 users. - -**4. Enron Dataset Sufficient** - -Expected to need Gmail data immediately. Enron dataset rich enough for MVP. - -**5. Batching Diminishes After 512** - -Expected linear speedup with batch size. Plateaus at 512-1024. - -### Mistakes to Avoid - -**1. Don't Optimize Prematurely** - -Spent time optimizing non-bottlenecks. Profile first. - -**2. Don't Assume User Needs** - -Assumed Gmail categories sufficient. Users have diverse needs. - -**3. Don't Neglect Documentation** - -Undocumented code becomes incomprehensible weeks later. - -**4. Don't Skip Error Handling** - -MVP doesn't mean brittle. Basic error handling critical. - -**5. Don't Build Everything at Once** - -Wanted web dashboard, API, mobile app. Focused on core first. - -### If Starting Over - -**What I'd Keep**: -- Three-tier classification strategy (brilliant) -- LLM-driven calibration (game-changer) -- Batched embeddings (essential) -- Local-first architecture (privacy win) -- Category caching (solves real problem) - -**What I'd Change**: -- Test batching earlier (would save days) -- Single model path from start (avoid debt) -- Database from beginning (for Phase 4) -- More test coverage upfront (easier to refactor) -- Async/await from start (better for I/O) - -**What I'd Add**: -- Web dashboard in Phase 1 (better UX) -- Active learning earlier (compound benefits) -- Better error messages (user experience) -- Progress bars (UX polish) -- Example configurations (easier onboarding) - ---- - -## Conclusion - -Email Sorter represents a pragmatic solution to email organization that balances speed, accuracy, privacy, and flexibility. - -### Key Achievements - -**Technical**: -- Three-tier classification achieving 92.7% accuracy -- 423 emails/second processing (fast mode) -- 1.8MB compact model -- 7.5x speedup through batching -- LLM-driven calibration (3 minutes) - -**Architectural**: -- Clean separation of concerns -- Extensible provider system -- Configurable without code changes -- Local-first processing -- Graceful degradation - -**Innovation**: -- Dynamic category discovery -- Category caching for consistency -- Hybrid ML/LLM approach -- Batched embedding extraction -- Threshold-based fallback - -### System Strengths - -**1. Adaptability**: Discovers categories per mailbox, not hardcoded - -**2. Speed**: 100x faster than pure LLM approach - -**3. Privacy**: Local processing, no cloud upload - -**4. Flexibility**: Configurable speed/accuracy trade-off - -**5. Scalability**: Handles 10k-100k emails easily - -**6. Simplicity**: Single command to classify - -**7. Extensibility**: Easy to add providers, features - -### System Weaknesses - -**1. Not Real-Time**: Batch processing only - -**2. English-Focused**: Limited multilingual support - -**3. Setup Complexity**: Ollama, OAuth, CLI - -**4. No GUI**: CLI-only intimidating - -**5. Per-Mailbox Training**: Can't share models - -**6. Limited Attachment Analysis**: Surface-level only - -**7. 
No Active Learning**: Doesn't improve from feedback - -### Target Users - -**Ideal Users**: -- Self-employed with email backlog -- Privacy-conscious individuals -- Technical users comfortable with CLI -- Users with unique category needs -- Researchers experimenting with email classification - -**Not Ideal Users**: -- General consumers (Gmail categories sufficient) -- Enterprise teams (no collaboration features) -- Non-technical users (setup too complex) -- Real-time filtering needs (not designed for this) - -### Success Metrics - -**MVP Success** (achieved): -- ✅ 10,000 emails classified in <30 seconds -- ✅ 90%+ accuracy (92.7% with LLM) -- ✅ Local processing (Ollama) -- ✅ Dynamic categories (LLM discovery) -- ✅ Multi-provider support (Gmail, Outlook, IMAP, Enron) - -**Phase 2 Success** (planned): -- 100+ real users -- Gmail/Outlook fully tested -- Email syncing working -- Incremental classification -- Multi-account support - -**Phase 3 Success** (planned): -- 1,000+ users -- Web dashboard (80% adoption) -- Active learning (5% accuracy improvement) -- 95% test coverage -- Performance optimized - -### Final Thoughts - -Email Sorter demonstrates that hybrid ML/LLM systems can achieve excellent results by using each technology where it excels: - -- **LLM for calibration**: One-time category discovery and labeling -- **ML for inference**: Fast bulk classification -- **LLM for review**: Handle uncertain cases - -This approach provides 90%+ accuracy at 100x the speed of pure LLM, with the privacy of local processing and the flexibility of dynamic categories. - -The system is production-ready for technical users with email backlogs. With planned enhancements (web dashboard, real-time mode, active learning), it could serve much broader audiences. - -**Most importantly**, the system proves that local-first, privacy-preserving AI applications can match cloud services in functionality while respecting user data. 
-
-### Acknowledgments
-
-**Technologies**:
-- LightGBM: Fast, accurate gradient boosting
-- Ollama: Local LLM and embedding serving
-- all-minilm:l6-v2: Excellent sentence embeddings
-- Enron dataset: Real-world test corpus
-- Click: Excellent CLI framework
-- Pydantic: Type-safe configuration
-
-**Inspiration**:
-- Gmail's category system
-- SaneBox's AI filtering
-- Traditional email filters
-- Modern LLM capabilities
-
-**Community** (hypothetical):
-- Early testers providing feedback
-- Contributors improving code
-- Users sharing use cases
-- Researchers building on system
-
----
-
-## Appendices
-
-### Appendix A: Configuration Reference
-
-Complete configuration options in `config/default_config.yaml`:
-
-**Calibration Section**:
-- `sample_size`: Training samples (default: 250)
-- `sample_strategy`: Sampling method (default: "stratified")
-- `validation_size`: Validation samples (default: 50)
-- `min_confidence`: Minimum LLM label confidence (default: 0.6)
-
-**Processing Section**:
-- `batch_size`: Emails per batch (default: 100)
-- `llm_queue_size`: Max queued LLM calls (default: 100)
-- `parallel_workers`: Thread pool size (default: 4)
-- `checkpoint_interval`: Progress save frequency (default: 1000)
-
-**Classification Section**:
-- `default_threshold`: ML confidence threshold (default: 0.55)
-- `min_threshold`: Minimum allowed (default: 0.50)
-- `max_threshold`: Maximum allowed (default: 0.70)
-
-**LLM Section**:
-- `provider`: "ollama" or "openai"
-- `ollama.base_url`: Ollama server URL
-- `ollama.calibration_model`: Model for calibration
-- `ollama.classification_model`: Model for classification
-- `ollama.temperature`: Randomness (default: 0.1)
-- `ollama.max_tokens`: Max output length
-- `openai.api_key`: OpenAI API key
-- `openai.model`: GPT model name
-
-**Features Section**:
-- `embedding_model`: Model name (default: "all-MiniLM-L6-v2")
-- `embedding_batch_size`: Batch size (default: 32)
-
-### Appendix B: Performance Benchmarks
-
-All benchmarks on 28-core CPU, 32GB RAM, SSD:
-
-**10,000 Emails**:
-- Fast mode: 24 seconds (423 emails/sec)
-- Hybrid mode: 4.4 minutes (38 emails/sec)
-- Calibration: 3.1 minutes (one-time)
-
-**100,000 Emails**:
-- Fast mode: 4 minutes (417 emails/sec)
-- Hybrid mode: 43 minutes (39 emails/sec)
-- Calibration: 5 minutes (one-time)
-
-**Bottlenecks**:
-- Embedding extraction: 20-40 seconds
-- ML inference: 0.7-7 seconds
-- LLM review: 2 seconds per email
-- Email fetching: Variable (provider dependent)
-
-### Appendix C: Accuracy by Category
-
-Enron dataset, 10,000 emails, ML-only mode:
-
-| Category | Emails | Accuracy | Common Errors |
-|----------|--------|----------|---------------|
-| Work | 3200 | 78% | Confused with Meetings |
-| Financial | 2100 | 85% | Very distinct patterns |
-| Updates | 1800 | 65% | Overlaps with Newsletters |
-| Meetings | 800 | 72% | Confused with Work |
-| Personal | 600 | 68% | Low sample count |
-| Technical | 500 | 75% | Jargon helps |
-| Other | 1000 | 60% | Catch-all category |
-
-**Overall**: 72.7% accuracy
-
-With LLM: 92.7% accuracy (+20%)
-
-### Appendix D: Cost Analysis
-
-**One-Time Costs**:
-- Development time: 6 weeks
-- Ollama setup: 0 hours (free)
-- Model training (per mailbox): 3 minutes
-
-**Per-Classification Costs** (10,000 emails):
-
-**Fast Mode**:
-- Electricity: ~$0.01
-- Time: 24 seconds
-- LLM calls: 0
-- Total: $0.01
-
-**Hybrid Mode**:
-- Electricity: ~$0.05
-- Time: 4.4 minutes
-- LLM calls: 2,100 × $0.0001 = $0.21
-- Total: $0.26
-
-**Calibration** (one-time):
-- Time: 3 minutes
-- LLM calls: 15 × $0.01 = $0.15
-- Total: $0.15
-
-**Compare to Alternatives**:
-- Manual (10k emails, 30sec each): 83 hours × $20/hr = $1,660
-- SaneBox: $36/month subscription
-- Pure GPT-4: 10k × $0.001 = $10
-
-### Appendix E: Glossary
-
-**Terms**:
-- **Calibration**: One-time training process to create ML model
-- **Category Discovery**: LLM identifies natural categories in mailbox
-- **Category Caching**: Reusing categories across mailboxes
-- **Confidence**: Probability score for classification (0-1)
-- **Embedding**: 384-dim semantic vector representing text
-- **Feature Extraction**: Converting email to feature vector
-- **Hard Rules**: Regex pattern matching (first tier)
-- **LLM Fallback**: Using LLM for low-confidence predictions
-- **ML Classification**: LightGBM prediction (second tier)
-- **Threshold**: Minimum confidence to accept ML prediction
-- **Three-Tier Strategy**: Rules + ML + LLM pipeline
-
-**Acronyms**:
-- **API**: Application Programming Interface
-- **CLI**: Command-Line Interface
-- **CSV**: Comma-Separated Values
-- **IMAP**: Internet Message Access Protocol
-- **JSON**: JavaScript Object Notation
-- **LLM**: Large Language Model
-- **ML**: Machine Learning
-- **MVP**: Minimum Viable Product
-- **OAuth**: Open Authorization
-- **TF-IDF**: Term Frequency-Inverse Document Frequency
-- **YAML**: YAML Ain't Markup Language
-
-### Appendix F: Resources
-
-**Documentation**:
-- README.md: Quick start guide
-- CLAUDE.md: Development guide for AI assistants
-- docs/PROJECT_STATUS_AND_NEXT_STEPS.html: Detailed roadmap
-- This document: Comprehensive overview
-
-**Code Structure**:
-- src/cli.py: Main entry point
-- src/classification/: Classification pipeline
-- src/calibration/: Training workflow
-- src/email_providers/: Provider implementations
-- tests/: Test suite
-
-**External Resources**:
-- Ollama: ollama.ai
-- LightGBM: lightgbm.readthedocs.io
-- Enron dataset: cs.cmu.edu/~enron
-- sentence-transformers: sbert.net
-
----
-
-**Document Complete**
-
-This comprehensive overview covers the Email Sorter system from conception to current MVP status, documenting every architectural decision, performance optimization, and lesson learned. Total length: ~5,200 lines of detailed, code-free explanation.
-
-**Last Updated**: October 26, 2025
-**Document Version**: 1.0
-**System Version**: MVP v1.0
diff --git a/docs/CURRENT_WORK_SUMMARY.md b/docs/CURRENT_WORK_SUMMARY.md
deleted file mode 100644
index b408899..0000000
--- a/docs/CURRENT_WORK_SUMMARY.md
+++ /dev/null
@@ -1,232 +0,0 @@
-# Email Sorter - Current Work Summary
-
-**Date:** 2025-10-23
-**Status:** 100k Enron Classification Complete with Optimization
-
----
-
-## Current Achievements
-
-### 1. Calibration System (Phase 1) ✅
-- **LLM-driven category discovery** using qwen3:8b-q4_K_M
-- **Trained on:** 50 emails (stratified sample from 100 email batch)
-- **Categories discovered:** 10 quality categories
-  - Work Communication, Financial, Forwarded, Technical Analysis, Administrative, Reports, Technical Issues, Requests, Meetings, HR & Personnel
-- **Category cache system:** Cross-mailbox consistency with semantic matching
-- **Model:** LightGBM classifier on 384-dim embeddings (all-minilm:l6-v2)
-- **Model file:** `src/models/calibrated/classifier.pkl` (1.1MB)
-
-### 2. Performance Optimization ✅
-**Batch Size Testing Results:**
-- batch_size=32: 6.993s (baseline)
-- batch_size=64: 5.636s (19.4% faster)
-- batch_size=128: 5.617s (19.7% faster)
-- batch_size=256: 5.572s (20.3% faster)
-- **batch_size=512: 5.453s (22.0% faster)** ← WINNER
-
-**Key Optimizations:**
-- Fixed sequential embedding calls → batched API calls
-- Used Ollama's `embed()` API with batch support
-- Removed duplicate `extract_batch()` method causing cache issues
-- Optimized to 512 batch size for GPU utilization
-
-### 3. 100k Classification Complete ✅
-**Performance:**
-- **Total time:** 3.4 minutes (202 seconds)
-- **Speed:** 495 emails/second
-- **Per email:** ~2ms (including all processing)
-
-**Accuracy:**
-- **Average confidence:** 81.1%
-- **High confidence (≥0.7):** 74,777 emails (74.8%)
-- **Medium confidence (0.5-0.7):** 17,381 emails (17.4%)
-- **Low confidence (<0.5):** 7,842 emails (7.8%)
-
-**Category Distribution:**
-1. Work Communication: 89,807 (89.8%) | Avg conf: 83.7%
-2. Financial: 6,534 (6.5%) | Avg conf: 58.7%
-3. Forwarded: 2,457 (2.5%) | Avg conf: 54.4%
-4. Technical Analysis: 1,129 (1.1%) | Avg conf: 56.9%
-5. Reports: 42 (0.04%)
-6. Technical Issues: 14 (0.01%)
-7. Administrative: 14 (0.01%)
-8. Requests: 3 (0.00%)
-
-**Output Files:**
-- `enron_100k_results/results.json` (19MB) - Full classifications
-- `enron_100k_results/summary.json` (1.5KB) - Statistics
-- `enron_100k_results/classifications.csv` (8.6MB) - Spreadsheet format
-
-### 4. Evaluation & Validation Tools ✅
-
-**A. LLM Evaluation Script** (`evaluate_with_llm.py`)
-- Loads actual email content with EnronProvider
-- Uses qwen3:8b-q4_K_M with `<no_think>` for speed
-- Stratified sampling (high/medium/low confidence)
-- Verdict parsing: YES/PARTIAL/NO
-- Temperature=0.1 for consistency
-
-**B. Feedback Fine-tuning System** (`feedback_finetune.py`)
-- Collects LLM corrections on low-confidence predictions
-- Continues LightGBM training with `init_model` parameter
-- Lower learning rate (0.05) for stability
-- Creates `classifier_finetuned.pkl`
-- **Result on 200 samples:** 0 corrections needed (model already accurate!)
-
-**C. Attachment Handler** (exists but NOT integrated)
-- PDF text extraction (PyPDF2)
-- DOCX text extraction (python-docx)
-- Keyword detection (financial, legal, meeting, report)
-- Classification hints
-- **Status:** Available in `src/processing/attachment_handler.py` but unused
-
----
-
-## Technical Architecture
-
-### Data Flow
-```
-Enron Maildir (100k emails)
-    ↓
-EnronParser (stratified sampling)
-    ↓
-FeatureExtractor (batch_size=512)
-    ↓
-Ollama Embeddings (all-minilm:l6-v2, 384-dim)
-    ↓
-LightGBM Classifier (22 categories)
-    ↓
-Results (JSON/CSV export)
-```
-
-### Calibration Flow
-```
-100 emails → 5 LLM batches (20 emails each)
-    ↓
-qwen3:8b-q4_K_M discovers categories
-    ↓
-Consolidation (15 → 10 categories)
-    ↓
-Category cache (semantic matching)
-    ↓
-50 emails labeled for training
-    ↓
-LightGBM training (200 boosting rounds)
-    ↓
-Model saved (classifier.pkl)
-```
-
-### Performance Metrics
-- **Calibration:** ~100 emails, ~1 minute
-- **Training:** 50 samples, LightGBM 200 rounds, ~1 second
-- **Classification:** 100k emails, batch 512, 3.4 minutes
-- **Per email:** 2ms total (embedding + inference)
-- **GPU utilization:** Batched embeddings, efficient processing
-
----
-
-## Key Files & Components
-
-### Models
-- `src/models/calibrated/classifier.pkl` - Trained LightGBM model (1.1MB)
-- `src/models/category_cache.json` - 10 discovered categories
-
-### Core Components
-- `src/calibration/enron_parser.py` - Enron dataset parsing
-- `src/calibration/llm_analyzer.py` - LLM category discovery
-- `src/calibration/trainer.py` - LightGBM training
-- `src/calibration/workflow.py` - Orchestration
-- `src/classification/feature_extractor.py` - Batch embeddings (512)
-- `src/email_providers/enron.py` - Enron provider
-- `src/processing/attachment_handler.py` - Attachment extraction (unused)
-
-### Scripts
-- `run_100k_classification.py` - Full 100k processing
-- `test_model_burst.py` - Batch testing (configurable size)
-- `evaluate_with_llm.py` - LLM quality evaluation
-- `feedback_finetune.py` - Feedback-driven fine-tuning
-
-### Results
-- `enron_100k_results/` - 100k classification output
-- `enron_100k_full_run.log` - Complete processing log
-
----
-
-## Known Issues & Limitations
-
-### 1. Attachment Handling ❌
-- AttachmentAnalyzer exists but NOT integrated
-- Enron dataset has minimal attachments
-- Need integration for Marion emails with PDFs/DOCX
-
-### 2. Category Imbalance ⚠️
-- 89.8% classified as "Work Communication"
-- May be accurate for Enron (internal work emails)
-- Other categories underrepresented
-
-### 3. Low Confidence Samples
-- 7,842 emails (7.8%) with confidence <0.5
-- LLM validation shows they're actually correct
-- Model confidence may be overly conservative
-
-### 4. Feature Extraction
-- Currently uses only subject + body text
-- Attachments not analyzed
-- Sender domain/patterns used but could be enhanced
-
----
-
-## Next Steps
-
-### Immediate
-1. **Comprehensive validation script:**
-   - 50 low-confidence samples
-   - 25 random samples
-   - LLM summary of findings
-
-2. **Mermaid workflow diagram:**
-   - Complete data flow visualization
-   - All LLM call points
-   - Performance metrics at each stage
-
-3. **Fresh end-to-end run:**
-   - Clear all models
-   - Run calibration → classification → validation
-   - Document complete pipeline
-
-### Future Enhancements
-1. **Integrate attachment handling** for Marion emails
-2. **Add more structural features** (time patterns, thread depth)
-3. **Active learning loop** with user feedback
-4. **Multi-model ensemble** for higher accuracy
-5. **Confidence calibration** to improve certainty estimates
-
----
-
-## Performance Summary
-
-| Metric | Value |
-|--------|-------|
-| **Calibration Time** | ~1 minute |
-| **Training Samples** | 50 emails |
-| **Model Size** | 1.1MB |
-| **Categories** | 10 discovered |
-| **100k Processing** | 3.4 minutes |
-| **Speed** | 495 emails/sec |
-| **Avg Confidence** | 81.1% |
-| **High Confidence** | 74.8% |
-| **Batch Size** | 512 (optimal) |
-| **Embedding Dim** | 384 (all-minilm) |
-
----
-
-## Conclusion
-
-The email sorter has achieved:
-- ✅ **Fast calibration** (1 minute on 100 emails)
-- ✅ **High accuracy** (81% avg confidence)
-- ✅ **Excellent performance** (495 emails/sec)
-- ✅ **Quality categories** (10 broad, reusable)
-- ✅ **Scalable architecture** (100k emails in 3.4 min)
-
-The system is **ready for production** with Marion emails after integrating attachment handling.
diff --git a/docs/FAST_ML_ONLY_WORKFLOW.html b/docs/FAST_ML_ONLY_WORKFLOW.html
deleted file mode 100644
index 339c61a..0000000
--- a/docs/FAST_ML_ONLY_WORKFLOW.html
+++ /dev/null
@@ -1,527 +0,0 @@
-
-
-
-
-
-Fast ML-Only Workflow Analysis
-
-
-
-

Fast ML-Only Workflow Analysis

- -

Your Question

-
- "I want to run ML-only classification on new mailboxes WITHOUT full calibration. Maybe 1 LLM call to verify categories match, then pure ML on embeddings. How can we do this fast for experimentation?" -
- -

Current Trained Model

- -
Model: src/models/calibrated/classifier.pkl (1.8MB)

  • Type: LightGBM Booster (not mock)
  • Categories (11): Updates, Work, Meetings, External, Financial, Test, Administrative, Operational, Technical, Urgent, Requests
  • Trained on: 10,000 Enron emails
  • Input: Embeddings (384-dim) + TF-IDF features

1. Current Flow: With Calibration (Slow)

-
-
-flowchart TD
-    Start([New Mailbox: 10k emails]) --> Check{Model exists?}
-    Check -->|No| Calibration[CALIBRATION PHASE<br/>~20 minutes]
-    Check -->|Yes| LoadModel[Load existing model]
-
-    Calibration --> Sample[Sample 300 emails]
-    Sample --> Discovery[LLM Category Discovery<br/>15 batches × 20 emails<br/>~5 minutes]
-    Discovery --> Consolidate[Consolidate categories<br/>LLM call<br/>~5 seconds]
-    Consolidate --> Label[Label 300 samples]
-    Label --> Extract[Feature extraction]
-    Extract --> Train[Train LightGBM<br/>~5 seconds]
-    Train --> SaveModel[Save new model]
-
-    SaveModel --> Classify[CLASSIFICATION PHASE]
-    LoadModel --> Classify
-
-    Classify --> Loop{For each email}
-    Loop --> Embed[Generate embedding<br/>~0.02 sec]
-    Embed --> TFIDF[TF-IDF features<br/>~0.001 sec]
-    TFIDF --> Predict[ML Prediction<br/>~0.003 sec]
-    Predict --> Threshold{Confidence?}
-    Threshold -->|High| MLDone[ML result]
-    Threshold -->|Low| LLMFallback[LLM fallback<br/>~4 sec]
-    MLDone --> Next{More?}
-    LLMFallback --> Next
-    Next -->|Yes| Loop
-    Next -->|No| Done[Results]
-
-    style Calibration fill:#ff6b6b
-    style Discovery fill:#ff6b6b
-    style LLMFallback fill:#ff6b6b
-    style MLDone fill:#4ec9b0
-
- -

2. Desired Flow: Fast ML-Only (Your Goal)

-
-
-flowchart TD
-    Start([New Mailbox: 10k emails]) --> LoadModel[Load pre-trained model<br/>Categories: 11 known<br/>~0.5 seconds]
-
-    LoadModel --> OptionalCheck{Verify categories?}
-    OptionalCheck -->|Yes| QuickVerify[Single LLM call<br/>Sample 10-20 emails<br/>Check category match<br/>~20 seconds]
-    OptionalCheck -->|Skip| StartClassify
-
-    QuickVerify --> MatchCheck{Categories match?}
-    MatchCheck -->|Yes| StartClassify[START CLASSIFICATION]
-    MatchCheck -->|No| Warn[Warning: Category mismatch<br/>Continue anyway]
-    Warn --> StartClassify
-
-    StartClassify --> Loop{For each email}
-    Loop --> Embed[Generate embedding<br/>all-minilm:l6-v2<br/>384 dimensions<br/>~0.02 sec]
-
-    Embed --> TFIDF[TF-IDF features<br/>~0.001 sec]
-    TFIDF --> Combine[Combine features<br/>Embedding + TF-IDF vector]
-
-    Combine --> Predict[LightGBM prediction<br/>~0.003 sec]
-    Predict --> Result[Category + confidence<br/>NO threshold check<br/>NO LLM fallback]
-
-    Result --> Next{More emails?}
-    Next -->|Yes| Loop
-    Next -->|No| Done[10k emails classified<br/>Total time: ~4 minutes]
-
-    style QuickVerify fill:#ffd93d
-    style Result fill:#4ec9b0
-    style Done fill:#4ec9b0
-
- -

3. What Already Works (No Code Changes Needed)

✓ The Model is Portable

Your trained model contains:
  • LightGBM Booster (the actual trained weights)
  • Category list (11 categories)
  • Category-to-index mapping

It can classify ANY email that has the same feature structure (embeddings + TF-IDF).
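A minimal sketch of what "portable" means in practice, assuming the pickle stores a dict holding the Booster and the category list (the exact key names here are assumptions - inspect the file to confirm the real layout):

```python
# Load the portable model bundle and classify one feature vector.
# Keys "model" and "categories" are assumptions, not confirmed names.
import pickle
import numpy as np

with open("src/models/calibrated/classifier.pkl", "rb") as f:
    bundle = pickle.load(f)

booster = bundle["model"]          # LightGBM Booster (assumed key)
categories = bundle["categories"]  # e.g. ["Updates", "Work", ...] (assumed key)

# Any mailbox works, as long as the feature vector has the same layout
# (embedding + TF-IDF) and width the model was trained on.
features = np.zeros((1, booster.num_feature()))
probs = booster.predict(features)[0]           # one probability per category
print(categories[int(np.argmax(probs))], float(probs.max()))
```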
✓ Embeddings are Universal

The all-minilm:l6-v2 model creates 384-dim embeddings for ANY text. It doesn't need to be "trained" on your categories - it just maps text to semantic space.

Same embedding model works on Gmail, Outlook, any mailbox.
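A small sketch of that universality, assuming sentence-transformers is installed locally (the model name matches the one used throughout these docs):

```python
# Semantically similar subjects land close together in embedding space,
# regardless of which mailbox they came from.
from sentence_transformers import SentenceTransformer, util

embedder = SentenceTransformer("all-MiniLM-L6-v2")  # 384-dim output

enron_subject = "Meeting tomorrow to review Q3 gas volumes"
gmail_subject = "Can we schedule a call for tomorrow?"
newsletter = "Your weekly deals: 20% off everything"

vecs = embedder.encode([enron_subject, gmail_subject, newsletter])
print(util.cos_sim(vecs[0], vecs[1]))  # relatively high: both are scheduling
print(util.cos_sim(vecs[0], vecs[2]))  # relatively low: different intent
```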
-

✓ --no-llm-fallback Flag Exists

-

Already implemented. When set:

-
    -
  • Low confidence emails still get ML classification
  • -
  • NO LLM fallback calls
  • -
  • 100% pure ML speed
  • -
-
- -
-

✓ Model Loads Without Calibration

-

If model exists at src/models/pretrained/classifier.pkl, calibration is skipped entirely.

-
- -

4. The Problem: Category Drift

- -
-

What Happens When Mailboxes Differ

-

Scenario: Model trained on Enron (business emails)

-

New mailbox: Personal Gmail (shopping, social, newsletters)

| Enron Categories (Trained) | Gmail Categories (Natural) | ML Behavior |
|----------------------------|----------------------------|-------------|
| Work, Meetings, Financial | Shopping, Social, Travel | Forces Gmail into Enron categories |
| "Operational" | No equivalent | Emails mis-classified as "Operational" |
| "External" | "Newsletters" | May map but semantically different |

Result: Model works, but accuracy drops. Emails get forced into inappropriate categories.

-
- -

5. Your Proposed Solution: Quick Category Verification

- -
-
-flowchart TD
-    Start([New Mailbox]) --> LoadModel[Load trained model<br/>11 categories known]
-
-    LoadModel --> Sample[Sample 10-20 emails<br/>Quick random sample<br/>~0.1 seconds]
-
-    Sample --> BuildPrompt[Build verification prompt<br/>Show trained categories<br/>Show sample emails]
-
-    BuildPrompt --> LLMCall[Single LLM call<br/>~20 seconds<br/>Task: Are these categories<br/>appropriate for this mailbox?]
-
-    LLMCall --> Parse[Parse response<br/>Expected: Yes/No + suggestions]
-
-    Parse --> Decision{Response?}
-    Decision -->|"Good match"| Proceed[Proceed with ML-only]
-    Decision -->|"Poor match"| Options{User choice}
-
-    Options -->|Continue anyway| Proceed
-    Options -->|Full calibration| Calibrate[Run full calibration<br/>Discover new categories]
-    Options -->|Abort| Stop[Stop - manual review]
-
-    Proceed --> FastML[Fast ML Classification<br/>10k emails in 4 minutes]
-
-    style LLMCall fill:#ffd93d
-    style FastML fill:#4ec9b0
-    style Calibrate fill:#ff6b6b
-
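A hypothetical sketch of that single verification call (remember: the `--verify-categories` flag does NOT exist yet). The endpoint and payload follow Ollama's `/api/generate` API; the prompt wording, email field names, and verdict parsing are illustrative only:

```python
# One LLM call to sanity-check trained categories against a new mailbox.
import random
import requests

def verify_categories(categories, emails, n=20):
    sample = random.sample(emails, min(n, len(emails)))
    summaries = "\n".join(f"- {e['sender']}: {e['subject']}" for e in sample)
    prompt = (
        f"Trained categories: {', '.join(categories)}\n"
        f"Sample emails from a new mailbox:\n{summaries}\n"
        "Rate how well the categories fit: Good, Fair, or Poor. "
        "If Poor, suggest better categories. Answer with one word first."
    )
    resp = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": "qwen3:4b-instruct-2507-q8_0",
              "prompt": prompt, "stream": False},
        timeout=120,
    )
    # First word of the response is the verdict: "good", "fair", or "poor".
    return resp.json()["response"].strip().split()[0].lower()
```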

6. Implementation Options

- -

Option A: Pure ML (Fastest, No Verification)

-
-Command:
-python -m src.cli run \
-  --source gmail \
-  --limit 10000 \
-  --output gmail_results/ \
-  --no-llm-fallback
-
-What happens:
-1. Load existing model (11 Enron categories)
-2. Classify all 10k emails using those categories
-3. NO LLM calls at all
-4. Time: ~4 minutes
-
-Accuracy: 60-80% depending on mailbox similarity to Enron
-
-Use case: Quick experimentation, bulk processing
- -

Option B: Quick Verify Then ML (Your Suggestion)

-
-Command:
-python -m src.cli run \
-  --source gmail \
-  --limit 10000 \
-  --output gmail_results/ \
-  --no-llm-fallback \
-  --verify-categories \  # NEW FLAG (needs implementation)
-  --verify-sample 20     # NEW FLAG (needs implementation)
-
-What happens:
-1. Load existing model (11 Enron categories)
-2. Sample 20 random emails from new mailbox
-3. Single LLM call: "Are categories [Work, Meetings, ...] appropriate for these emails?"
-4. LLM responds: "Good match" or "Poor match - suggest [Shopping, Social, ...]"
-5. If good match: Proceed with ML-only
-6. If poor match: Warn user, optionally run calibration
-
-Time: ~4.5 minutes (20 sec verify + 4 min classify)
-Accuracy: Same as Option A, but with confidence check
-Use case: Production deployment with safety check
- -

Option C: Lightweight Calibration (Middle Ground)

-
-Command:
-python -m src.cli run \
-  --source gmail \
-  --limit 10000 \
-  --output gmail_results/ \
-  --no-llm-fallback \
-  --quick-calibrate \     # NEW FLAG (needs implementation)
-  --calibrate-sample 50   # Much smaller than 300
-
-What happens:
-1. Sample only 50 emails (not 300)
-2. Run LLM discovery on 3 batches (not 15)
-3. Map discovered categories to existing model categories
-4. If >70% overlap: Use existing model
-5. If <70% overlap: Train lightweight adapter
-
-Time: ~6 minutes (2 min quick cal + 4 min classify)
-Accuracy: 70-85% (better than Option A)
-Use case: New mailbox types with some verification
- -

7. What Actually Needs Implementation

| Feature | Status | Work Required | Time |
|---------|--------|---------------|------|
| Option A: Pure ML | ✅ WORKS NOW | None - just use --no-llm-fallback | 0 hours |
| --verify-categories flag | ❌ Needs implementation | Add CLI flag, sample logic, LLM prompt, response parsing | 2-3 hours |
| --quick-calibrate flag | ❌ Needs implementation | Modify calibration workflow, category mapping logic | 4-6 hours |
| Category adapter/mapper | ❌ Needs implementation | Map new categories to existing model categories using embeddings | 6-8 hours |

8. Recommended Approach: Start with Option A

- -
-

Why Option A (Pure ML, No Verification) is Best for Experimentation

-
    -
  1. Works right now - No code changes needed
  2. -
  3. 4 minutes per 10k emails - Ultra fast
  4. -
  5. Reveals real accuracy - See how well Enron model generalizes
  6. -
  7. Easy to compare - Run on multiple mailboxes quickly
  8. -
  9. No false confidence - You know it's approximate, act accordingly
  10. -
- -

Test Protocol

-

Step 1: Run on Enron subset (same domain)

- python -m src.cli run --source enron --limit 5000 --output test_enron/ --no-llm-fallback -

Expected accuracy: ~78% (baseline)

- -

Step 2: Run on different Enron mailbox

- python -m src.cli run --source enron --limit 5000 --output test_enron2/ --no-llm-fallback -

Expected accuracy: ~70-75% (slight drift)

- -

Step 3: If you have personal Gmail/Outlook data, run there

- python -m src.cli run --source gmail --limit 5000 --output test_gmail/ --no-llm-fallback -

Expected accuracy: ~50-65% (significant drift, but still useful)

-
- -

9. Timing Comparison: All Options

| Approach | LLM Calls | Time (10k emails) | Accuracy (Same domain) | Accuracy (Different domain) |
|----------|-----------|-------------------|------------------------|-----------------------------|
| Full Calibration | ~500 (discovery + labeling + classification fallback) | ~2.5 hours | 92-95% | 92-95% |
| Option A: Pure ML | 0 | ~4 minutes | 75-80% | 50-65% |
| Option B: Verify + ML | 1 (verification) | ~4.5 minutes | 75-80% | 50-65% |
| Option C: Quick Calibrate + ML | ~50 (quick discovery) | ~6 minutes | 80-85% | 65-75% |
| Current: ML + LLM Fallback | ~2100 (21% fallback rate) | ~2.5 hours | 92-95% | 85-90% |

10. The Real Question: Embeddings as Universal Features

- -
-

Why Your Intuition is Correct

-

You said: "map it all to our structured embedding and that's how it gets done"

-

This is exactly right.

- -
    -
  • Embeddings are semantic representations - "Meeting tomorrow" has similar embedding whether it's from Enron or Gmail
  • -
  • LightGBM learns patterns in embedding space - "High values in dimensions 50-70 = Meetings"
  • -
  • These patterns transfer - Different mailboxes have similar semantic patterns
  • -
  • Categories are just labels - The model doesn't care if you call it "Work" or "Business" - it learns the embedding pattern
  • -
- -

The Limit

-

Transfer learning works when:

-
    -
  • Email types are similar (business emails train well on business emails)
  • -
  • Email structure is similar (length, formality, sender patterns)
  • -
- -

Transfer learning fails when:

-
    -
  • Email domains differ significantly (e-commerce emails vs internal memos)
  • -
  • Email purposes differ (personal chitchat vs corporate announcements)
  • -
-
- -

11. Recommended Next Step

- -
-Immediate action (works right now):
-
-# Test current model on new 10k sample WITHOUT calibration
-python -m src.cli run \
-  --source enron \
-  --limit 10000 \
-  --output ml_speed_test/ \
-  --no-llm-fallback
-
-# Expected:
-# - Time: ~4 minutes
-# - Accuracy: ~75-80%
-# - LLM calls: 0
-# - Categories used: 11 from trained model
-
-# Then inspect results:
-cat ml_speed_test/results.json | python -m json.tool | less
-
-# Check category distribution:
-cat ml_speed_test/results.json | \
-  python -c "import json, sys; data=json.load(sys.stdin); \
-  from collections import Counter; \
-  print(Counter(c['category'] for c in data['classifications']))"
- -

12. If You Want Verification (Future Work)

- -

I can implement --verify-categories flag that:

-
    -
  1. Samples 20 emails from new mailbox
  2. -
  3. Makes single LLM call showing both: -
      -
    • Trained model categories: [Work, Meetings, Financial, ...]
    • -
    • Sample emails from new mailbox
    • -
    -
  4. -
  5. Asks LLM: "Rate category fit: Good/Fair/Poor + suggest alternatives"
  6. -
  7. Reports confidence score
  8. -
  9. Proceeds with ML-only if score > threshold
  10. -
- -

Time cost: +20 seconds (1 LLM call)

-

Value: Automated sanity check before bulk processing

- - - - diff --git a/docs/LABEL_TRAINING_PHASE_DETAIL.html b/docs/LABEL_TRAINING_PHASE_DETAIL.html deleted file mode 100644 index 86499fb..0000000 --- a/docs/LABEL_TRAINING_PHASE_DETAIL.html +++ /dev/null @@ -1,564 +0,0 @@ - - - - - - Label Training Phase - Detailed Analysis - - - - -

Label Training Phase - Deep Dive Analysis

- -

1. What is "Label Training"?

-

Location: src/calibration/llm_analyzer.py

-

Purpose: The LLM examines sample emails and assigns each one to a discovered category, creating labeled training data for the ML model.

-

This is NOT the same as category discovery. Discovery finds WHAT categories exist. Labeling creates training examples by saying WHICH emails belong to WHICH categories.

- -
-

CRITICAL MISUNDERSTANDING IN ORIGINAL DIAGRAM

-

The "Label Training Emails" phase described as "~3 seconds per email" is INCORRECT.

-

The actual implementation does NOT label emails individually.

-

Labels are created as a BYPRODUCT of batch category discovery, not as a separate sequential operation.

-
- -

2. Actual Label Training Flow

-
-
-flowchart TD
-    Start([Calibration Phase Starts]) --> Sample[Sample 300 emails<br/>stratified by sender]
-    Sample --> BatchSetup[Split into batches of 20 emails<br/>300 ÷ 20 = 15 batches]
-
-    BatchSetup --> Batch1[Batch 1: Emails 1-20]
-    Batch1 --> Stats1[Calculate batch statistics<br/>domains, keywords, attachments<br/>~0.1 seconds]
-
-    Stats1 --> BuildPrompt1[Build LLM prompt<br/>Include all 20 email summaries<br/>~0.05 seconds]
-
-    BuildPrompt1 --> LLMCall1[Single LLM call for entire batch<br/>Discovers categories AND labels all 20<br/>~20 seconds TOTAL for batch]
-
-    LLMCall1 --> Parse1[Parse JSON response<br/>Extract categories + labels<br/>~0.1 seconds]
-
-    Parse1 --> Store1[Store results<br/>categories: Dict<br/>labels: List of Tuples]
-
-    Store1 --> Batch2{More batches?}
-    Batch2 -->|Yes| NextBatch[Batch 2: Emails 21-40]
-    Batch2 -->|No| Consolidate
-
-    NextBatch --> Stats2[Same process<br/>15 total batches<br/>~20 seconds each]
-    Stats2 --> Batch2
-
-    Consolidate[Consolidate categories<br/>Merge duplicates<br/>Single LLM call<br/>~5 seconds]
-
-    Consolidate --> CacheSnap[Snap to cached categories<br/>Match against persistent cache<br/>~0.5 seconds]
-
-    CacheSnap --> Final[Final output<br/>10-12 categories<br/>300 labeled emails]
-
-    Final --> End([Labels ready for ML training])
-
-    style LLMCall1 fill:#ff6b6b
-    style Consolidate fill:#ff6b6b
-    style Stats2 fill:#ffd93d
-    style Final fill:#4ec9b0
-
- -

3. Key Discovery: Batched Labeling

- -
-src/calibration/llm_analyzer.py:66-83
-
-batch_size = 20  # NOT 1 email at a time!
-
-for batch_idx in range(0, len(sample_emails), batch_size):
-    batch = sample_emails[batch_idx:batch_idx + batch_size]
-
-    # Single LLM call handles ENTIRE batch
-    batch_results = self._analyze_batch(batch, batch_idx)
-
-    # Returns BOTH categories AND labels for all 20 emails
-    for category, desc in batch_results.get('categories', {}).items():
-        discovered_categories[category] = desc
-
-    for email_id, category in batch_results.get('labels', []):
-        email_labels.append((email_id, category))
- -
Why Batching Matters

Sequential (WRONG assumption): 300 emails × 3 sec/email = 900 seconds (15 minutes)

Batched (ACTUAL): 15 batches × 20 sec/batch = 300 seconds (5 minutes)

Savings: 10 minutes (67% faster than assumed)
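The arithmetic behind that box, spelled out:

```python
# Sequential vs batched labeling time for the 300-email sample.
emails, per_email_sec = 300, 3           # wrong assumption: one call per email
sequential = emails * per_email_sec      # 900 s (15 min)

batch_size, per_batch_sec = 20, 20       # actual: one call per 20-email batch
batched = (emails // batch_size) * per_batch_sec  # 15 batches -> 300 s (5 min)

print(sequential, batched, f"{1 - batched / sequential:.0%} faster")  # 67% faster
```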

4. Single Batch Processing Detail

-
-
-flowchart TD
-    Start([Batch of 20 emails]) --> Stats[Calculate Statistics<br/>~0.1 seconds]
-
-    Stats --> StatDetails[Domain analysis<br/>Recipient counts<br/>Attachment detection<br/>Keyword extraction]
-
-    StatDetails --> BuildList[Build email summaries<br/>For each email:<br/>ID + From + Subject + Preview]
-
-    BuildList --> Prompt[Construct LLM prompt<br/>~2KB text<br/>Contains:<br/>- Statistics summary<br/>- All 20 email summaries<br/>- Instructions<br/>- JSON schema]
-
-    Prompt --> LLM[LLM Call<br/>POST /api/generate<br/>qwen3:4b-instruct-2507-q8_0<br/>temp=0.1, max_tokens=2000<br/>~18-22 seconds]
-
-    LLM --> Response[LLM Response<br/>JSON with:<br/>categories: Dict<br/>labels: List of 20 Tuples]
-
-    Response --> Parse[Parse JSON<br/>Regex extraction<br/>Brace counting<br/>~0.05 seconds]
-
-    Parse --> Validate{Valid JSON?}
-    Validate -->|Yes| Extract[Extract data<br/>categories: 3-8 new<br/>labels: 20 tuples]
-    Validate -->|No| FallbackParse[Fallback parsing<br/>Try to salvage partial data]
-
-    FallbackParse --> Extract
-
-    Extract --> Return[Return batch results<br/>categories: Dict str→str<br/>labels: List Tuple str,str]
-
-    Return --> End([Merge with global results])
-
-    style LLM fill:#ff6b6b
-    style Parse fill:#4ec9b0
-    style FallbackParse fill:#ffd93d
-
- -

5. LLM Prompt Structure

- -
-Actual prompt sent to LLM (src/calibration/llm_analyzer.py:196-232):
-
-<no_think>You are analyzing emails to discover natural categories...
-
-BATCH STATISTICS (20 emails):
-- Top sender domains: example.com (5), company.org (3)...
-- Avg recipients per email: 2.3
-- Emails with attachments: 4/20
-- Avg subject length: 42 chars
-- Common keywords: meeting(3), report(2)...
-
-EMAILS TO ANALYZE:
-1. ID: maildir_allen-p__sent_mail_512
-   From: phillip.allen@enron.com
-   Subject: Re: AEC Volumes at OPAL
-   Preview: Here are the volumes...
-
-2. ID: maildir_allen-p__sent_mail_513
-   From: phillip.allen@enron.com
-   Subject: Meeting Tomorrow
-   Preview: Can we schedule...
-
-[... 18 more emails ...]
-
-TASK:
-1. Identify natural groupings based on PURPOSE
-2. Create SHORT category names
-3. Assign each email to exactly one category
-4. CRITICAL: Copy EXACT email IDs
-
-Return JSON:
-{
-  "categories": {"Work": "daily business communication", ...},
-  "labels": [["maildir_allen-p__sent_mail_512", "Work"], ...]
-}
- -
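An illustrative sketch of how such a prompt could be assembled; the field names on the email dicts are assumptions, and this template abbreviates the real one in src/calibration/llm_analyzer.py:

```python
# Build one batch-discovery prompt from 20 email summaries plus statistics.
def build_batch_prompt(batch, stats):
    lines = [
        "<no_think>You are analyzing emails to discover natural categories...",
        "",
        f"BATCH STATISTICS ({len(batch)} emails):",
        f"- Top sender domains: {stats['top_domains']}",
        f"- Emails with attachments: {stats['with_attachments']}/{len(batch)}",
        "",
        "EMAILS TO ANALYZE:",
    ]
    for i, email in enumerate(batch, 1):
        lines += [
            f"{i}. ID: {email['id']}",
            f"   From: {email['sender']}",
            f"   Subject: {email['subject']}",
            f"   Preview: {email['body'][:100]}",
            "",
        ]
    lines.append('Return JSON: {"categories": {...}, '
                 '"labels": [["<id>", "<category>"], ...]}')
    return "\n".join(lines)
```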

6. Timing Breakdown - 300 Sample Emails

| Operation | Per Batch (20 emails) | Total (15 batches) | % of Total Time |
|-----------|-----------------------|--------------------|-----------------|
| Calculate statistics | 0.1 sec | 1.5 sec | 0.5% |
| Build email summaries | 0.05 sec | 0.75 sec | 0.2% |
| Construct prompt | 0.01 sec | 0.15 sec | 0.05% |
| LLM API call | 18-22 sec | 270-330 sec | 98% |
| Parse JSON response | 0.05 sec | 0.75 sec | 0.2% |
| Merge results | 0.02 sec | 0.3 sec | 0.1% |
| SUBTOTAL: Batch Discovery | | ~300 seconds (5 min) | 98.5% |
| Consolidation LLM call | | 5 seconds | 1.3% |
| Cache snapping (semantic matching) | | 0.5 seconds | 0.2% |
| TOTAL LABELING PHASE | | ~305 seconds (5 min) | 100% |
-

Corrected Understanding

-

Original estimate: "~3 seconds per email" = 900 seconds for 300 emails

-

Actual timing: ~20 seconds per batch of 20 = ~305 seconds for 300 emails

-

Difference: 3× faster than original assumption

-

Why: Batching allows LLM to see context across multiple emails and make better category decisions in a single inference pass.

-
- -

7. What Gets Created

- -
-
-flowchart LR
-    Input[300 sampled emails] --> Discovery[Category Discovery<br/>15 batches × 20 emails]
-
-    Discovery --> RawCats[Raw Categories<br/>~30-40 discovered<br/>May have duplicates:<br/>Work, work, Business, etc.]
-
-    RawCats --> Consolidate[Consolidation<br/>LLM merges similar<br/>~5 seconds]
-
-    Consolidate --> Merged[Merged Categories<br/>~12-15 categories<br/>Work, Financial, etc.]
-
-    Merged --> CacheSnap[Cache Snap<br/>Match against persistent cache<br/>~0.5 seconds]
-
-    CacheSnap --> Final[Final Categories<br/>10-12 categories]
-
-    Discovery --> RawLabels[Raw Labels<br/>300 tuples:<br/>email_id, category]
-
-    RawLabels --> UpdateLabels[Update label categories<br/>to match snapped names]
-
-    UpdateLabels --> FinalLabels[Final Labels<br/>300 training pairs]
-
-    Final --> Training[Training Data]
-    FinalLabels --> Training
-
-    Training --> MLTrain[Train LightGBM Model<br/>~5 seconds]
-
-    MLTrain --> Model[Trained Model<br/>1.8MB .pkl file]
-
-    style Discovery fill:#ff6b6b
-    style Consolidate fill:#ff6b6b
-    style Model fill:#4ec9b0
-
- -

8. Example Output

- -
-discovered_categories (Dict[str, str]):
-{
-    "Work": "daily business communication and coordination",
-    "Financial": "budgets, reports, financial planning",
-    "Meetings": "scheduling and meeting coordination",
-    "Technical": "system issues and technical discussions",
-    "Requests": "action items and requests for information",
-    "Reports": "status reports and summaries",
-    "Administrative": "HR, policies, company announcements",
-    "Urgent": "time-sensitive matters",
-    "Conversational": "casual check-ins and social",
-    "External": "communication with external partners"
-}
-
-sample_labels (List[Tuple[str, str]]):
-[
-    ("maildir_allen-p__sent_mail_1", "Financial"),
-    ("maildir_allen-p__sent_mail_2", "Work"),
-    ("maildir_allen-p__sent_mail_3", "Meetings"),
-    ("maildir_allen-p__sent_mail_4", "Work"),
-    ("maildir_allen-p__sent_mail_5", "Financial"),
-    ... (300 total)
-]
- -
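A minimal sketch of turning that output into LightGBM training arrays; `extract_features` and `emails_by_id` are stand-ins for the real FeatureExtractor and email lookup, and only the shape of the transformation matters here:

```python
# Convert (email_id, category) tuples into X, y and train a multiclass model.
import numpy as np
import lightgbm as lgb

categories = sorted(discovered_categories)             # stable label order
cat_to_idx = {c: i for i, c in enumerate(categories)}

X = np.stack([extract_features(emails_by_id[eid]) for eid, _ in sample_labels])
y = np.array([cat_to_idx[cat] for _, cat in sample_labels])

booster = lgb.train(
    {"objective": "multiclass", "num_class": len(categories),
     "learning_rate": 0.1},
    lgb.Dataset(X, label=y),
    num_boost_round=200,  # matches the "200 boosting rounds" noted elsewhere
)
```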

9. Why Batching is Critical

| Approach | LLM Calls | Time/Call | Total Time | Quality |
|----------|-----------|-----------|------------|---------|
| Sequential (1 email/call) | 300 | 3 sec | 900 sec (15 min) | Poor - no context |
| Small batches (5 emails/call) | 60 | 8 sec | 480 sec (8 min) | Fair - limited context |
| Current (20 emails/call) | 15 | 20 sec | 300 sec (5 min) | Good - sufficient context |
| Large batches (50 emails/call) | 6 | 45 sec | 270 sec (4.5 min) | Risk - may exceed token limits |
-

Why 20 emails per batch?

-
    -
  • Token limit: 20 emails × ~150 tokens/email = ~3000 tokens input, well under 8K limit
  • -
  • Context window: LLM can see patterns across multiple emails
  • -
  • Speed: Minimizes API calls while staying within limits
  • -
  • Quality: Enough examples to identify patterns, not so many that it gets confused
  • -
-
- -

10. Configuration Parameters

| Parameter | Location | Default | Effect on Timing |
|-----------|----------|---------|------------------|
| sample_size | CalibrationConfig | 300 | 300 samples = 15 batches = 5 min |
| batch_size | llm_analyzer.py:62 | 20 | Hardcoded - affects batch count |
| llm_batch_size | CalibrationConfig | 50 | NOT USED for discovery (misleading name) |
| temperature | LLM call | 0.1 | Lower = faster, more deterministic |
| max_tokens | LLM call | 2000 | Higher = potentially slower response |
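A sketch of those knobs as a typed config object (these docs mention Pydantic elsewhere); the field names come from the table, but the real CalibrationConfig class may differ:

```python
# Assumed shape of the calibration configuration, not the confirmed class.
from pydantic import BaseModel

class CalibrationConfig(BaseModel):
    sample_size: int = 300     # 300 samples -> 15 batches -> ~5 min
    llm_batch_size: int = 50   # NOT used for discovery (misleading name)
    temperature: float = 0.1   # lower = more deterministic labeling
    max_tokens: int = 2000     # cap on LLM response length

cfg = CalibrationConfig(sample_size=200)  # e.g. trade training data for speed
print(cfg.sample_size // 20, "batches")   # batch_size=20 is hardcoded
```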
- -

11. Full Calibration Timeline

- -
-
-gantt
-    title Calibration Phase Timeline (300 samples, 10k total emails)
-    dateFormat mm:ss
-    axisFormat %M:%S
-
-    section Sampling
-    Stratified sample (3% of 10k) :00:00, 01s
-
-    section Category Discovery
-    Batch 1 (emails 1-20)         :00:01, 20s
-    Batch 2 (emails 21-40)        :00:21, 20s
-    Batch 3 (emails 41-60)        :00:41, 20s
-    Batch 4-13 (emails 61-260)    :01:01, 200s
-    Batch 14 (emails 261-280)     :04:21, 20s
-    Batch 15 (emails 281-300)     :04:41, 20s
-
-    section Consolidation
-    LLM category merge            :05:01, 05s
-    Cache snap                    :05:06, 00.5s
-
-    section ML Training
-    Feature extraction (300)      :05:07, 06s
-    LightGBM training             :05:13, 05s
-    Validation (100 emails)       :05:18, 02s
-    Save model to disk            :05:20, 00.5s
-
-
- -

12. Key Insights

- -
-

1. Labels are NOT created sequentially

-

The LLM creates labels as a byproduct of batch category discovery. There is NO separate "label each email one by one" phase.

-
- -
-

2. Batching is the optimization

-

Processing 20 emails in a single LLM call (20 sec) is 3× faster than 20 individual calls (60 sec total).

-
- -
-

3. LLM time dominates everything

-

98% of labeling phase time is LLM API calls. Everything else (parsing, merging, caching) is negligible.

-
- -
-

4. Consolidation is cheap

-

Merging 30-40 raw categories into 10-12 final ones takes only ~5 seconds with a single LLM call.

-
- -

13. Optimization Opportunities

| Optimization | Current | Potential | Tradeoff |
|--------------|---------|-----------|----------|
| Increase batch size | 20 emails/batch | 30-40 emails/batch | May hit token limits, slower per call |
| Reduce sample size | 300 samples (3%) | 200 samples (2%) | Less training data, potentially worse model |
| Parallel batching | Sequential 15 batches | 3-5 concurrent batches | Requires async LLM client, more complex |
| Skip consolidation | Always consolidate if >10 cats | Skip if <15 cats | May leave duplicate categories |
| Cache-first approach | Discover then snap to cache | Snap to cache, only discover new | Less adaptive to new mailbox types |
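A hypothetical sketch of the "parallel batching" row using asyncio and httpx; the current pipeline is sequential, so this is one possible shape rather than the project's implementation:

```python
# Run a few discovery batches concurrently against Ollama.
import asyncio
import httpx

async def analyze_batch(client, sem, prompt):
    async with sem:  # cap concurrency so the LLM server isn't overwhelmed
        resp = await client.post(
            "http://localhost:11434/api/generate",
            json={"model": "qwen3:4b-instruct-2507-q8_0",
                  "prompt": prompt, "stream": False},
            timeout=120.0,
        )
        return resp.json()["response"]

async def analyze_all(prompts, max_concurrent=4):
    sem = asyncio.Semaphore(max_concurrent)
    async with httpx.AsyncClient() as client:
        return await asyncio.gather(
            *(analyze_batch(client, sem, p) for p in prompts))

# results = asyncio.run(analyze_all(batch_prompts))
```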
-
-
-
-
diff --git a/docs/MODEL_INFO.md b/docs/MODEL_INFO.md
deleted file mode 100644
index a0a5558..0000000
--- a/docs/MODEL_INFO.md
+++ /dev/null
@@ -1,129 +0,0 @@
-# Model Information
-
-## Current Status
-
-- **Model Type**: LightGBM Classifier (Production)
-- **Location**: `src/models/pretrained/classifier.pkl`
-- **Categories**: 12 (junk, transactional, auth, newsletters, social, automated, conversational, work, personal, finance, travel, unknown)
-- **Feature Extraction**: Hybrid (embeddings + patterns + structural features)
-
-## Usage
-
-The ML classifier will automatically use the real model if it exists at:
-```
-src/models/pretrained/classifier.pkl
-```
-
-### Programmatic Usage
-
-```python
-from src.classification.ml_classifier import MLClassifier
-
-# Will automatically load real model if available
-classifier = MLClassifier()
-
-# Check if using mock or real model
-info = classifier.get_info()
-print(f"Is mock: {info['is_mock']}")
-print(f"Model type: {info['model_type']}")
-
-# Make predictions
-result = classifier.predict(feature_vector)
-print(f"Category: {result['category']}")
-print(f"Confidence: {result['confidence']}")
-```
-
-### Command Line Usage
-
-```bash
-# Test with mock pipeline
-python -m src.cli run --source mock --output test_results/
-
-# Test with real model (when available)
-python -m src.cli run --source gmail --limit 100 --output results/
-```
-
-## How to Get a Real Model
-
-### Option 1: Train Your Own (Recommended)
-```python
-from src.calibration.trainer import ModelTrainer
-from src.calibration.enron_parser import EnronParser
-from src.classification.feature_extractor import FeatureExtractor
-
-# Parse Enron dataset
-parser = EnronParser("enron_mail_20150507")
-emails = parser.parse_emails(limit=5000)
-
-# Extract features
-extractor = FeatureExtractor()
-labeled_data = [(email, category) for email, category in zip(emails, categories)]
-
-# Train model
-trainer = ModelTrainer(extractor, categories)
-results = trainer.train(labeled_data)
-
-# Save model
-trainer.save_model("src/models/pretrained/classifier.pkl")
-```
-
-### Option 2: Download Pre-trained Model
-
-Use the provided script:
-```bash
-cd tools
-python download_pretrained_model.py \
-  --url https://example.com/model.pkl \
-  --hash abc123def456
-```
-
-### Option 3: Use Community Model
-
-Check available pre-trained models at:
-- Email Sorter releases on GitHub
-- Hugging Face model hub (when available)
-- Community-trained models
-
-## Model Performance
-
-Expected accuracy on real data:
-- **Hard Rules**: 94-96% (instant, ~10% of emails)
-- **ML Model**: 85-90% (fast, ~85% of emails)
-- **LLM Review**: 92-95% (slower, ~5% uncertain cases)
-- **Overall**: 90-94% (weighted average)
-
-## Retraining
-
-To retrain the model:
-
-```bash
-python -m src.cli train \
-  --source enron \
-  --output models/new_model.pkl \
-  --limit 10000
-```
-
-## Troubleshooting
-
-### Model Not Loading
-1. Check file exists: `src/models/pretrained/classifier.pkl`
-2. Try to load directly:
-   ```python
-   import pickle
-   with open('src/models/pretrained/classifier.pkl', 'rb') as f:
-       data = pickle.load(f)
-   print(data.keys())
-   ```
-3. Ensure pickle format is correct
-
-### Low Accuracy
-1. Model may be underfitted - train on more data
-2. Feature extraction may need tuning
-3. Categories may need adjustment
-4. Consider LLM review for uncertain cases
-
-### Slow Predictions
-1. Use embedding cache for batch processing
-2. Implement parallel processing
-3. Consider quantization for LightGBM model
-4. Profile feature extraction step
diff --git a/docs/NEXT_STEPS.md b/docs/NEXT_STEPS.md
deleted file mode 100644
index a165a0b..0000000
--- a/docs/NEXT_STEPS.md
+++ /dev/null
@@ -1,437 +0,0 @@
-# Email Sorter - Next Steps & Action Plan
-
-**Date**: 2025-10-21
-**Status**: Framework Complete - Ready for Real Model Integration
-**Test Status**: 27/30 passing (90%)
-
----
-
-## Quick Summary
-
-✅ **Framework**: 100% complete, all 16 phases implemented
-✅ **Testing**: 90% pass rate (27/30 tests)
-✅ **Documentation**: Comprehensive and up-to-date
-✅ **Tools**: Model integration scripts provided
-❌ **Real Model**: Currently using mock (placeholder)
-❌ **Gmail Credentials**: Not yet configured
-❌ **Real Data Processing**: Ready when model + credentials available
-
----
-
-## Three Paths Forward
-
-Choose your path based on your needs:
-
-### Path A: Quick Framework Validation (5 minutes)
-**Goal**: Verify everything works with mock model
-**Commands**:
-```bash
-cd "c:/Build Folder/email-sorter"
-source venv/Scripts/activate
-
-# Run quick validation
-pytest tests/ -v --tb=short
-python -m src.cli test-config
-python -m src.cli run --source mock --output test_results/
-```
-**Result**: Confirms framework works correctly
-
-### Path B: Real Model Integration (30-60 minutes)
-**Goal**: Replace mock model with real LightGBM model
-**Two Sub-Options**:
-
-#### B1: Train Your Own Model on Enron Dataset
-```bash
-# Parse Enron emails (already downloaded)
-python -c "
-from src.calibration.enron_parser import EnronParser
-from src.classification.feature_extractor import FeatureExtractor
-from src.calibration.trainer import ModelTrainer
-
-parser = EnronParser('enron_mail_20150507')
-emails = parser.parse_emails(limit=5000)
-
-extractor = FeatureExtractor()
-trainer = ModelTrainer(extractor, ['junk', 'transactional', 'auth', 'newsletters',
-                                   'social', 'automated', 'conversational', 'work',
-                                   'personal', 'finance', 'travel', 'unknown'])
-
-# Train (takes 5-10 minutes on this laptop)
-results = trainer.train([(e, 'unknown') for e in emails])
-trainer.save_model('src/models/pretrained/classifier.pkl')
-"
-
-# Verify
-python tools/setup_real_model.py --check
-```
-
-#### B2: Download Pre-trained Model
-```bash
-# If you have a pre-trained model URL
-python tools/download_pretrained_model.py \
-  --url https://example.com/lightgbm_model.pkl \
-  --hash abc123def456
-
-# Or if you have local file
-python tools/setup_real_model.py --model-path /path/to/model.pkl
-
-# Verify
-python tools/setup_real_model.py --check
-```
-
-**Result**: Real model installed, framework uses it automatically
-
-### Path C: Full Production Deployment (2-3 hours)
-**Goal**: Process all 80k+ emails with Gmail integration
-**Prerequisites**: Path B (real model) + Gmail OAuth
-**Steps**:
-
-1. **Setup Gmail OAuth**
-   ```bash
-   # Get credentials from Google Cloud Console
-   # https://console.cloud.google.com/
-   # - Create OAuth 2.0 credentials
-   # - Download as JSON
-   # - Place as credentials.json in project root
-
-   # Test Gmail connection
-   python -m src.cli test-gmail
-   ```
-
-2. **Test with 100 Emails**
-   ```bash
-   python -m src.cli run \
-     --source gmail \
-     --limit 100 \
-     --output test_results/
-   ```
-
-3. **Process Full Dataset**
-   ```bash
-   python -m src.cli run \
-     --source gmail \
-     --output marion_results/
-   ```
-
-4. **Review Results**
-   - Check `marion_results/results.json`
-   - Check `marion_results/report.txt`
-   - Review accuracy metrics
-   - Adjust thresholds if needed
-
----
-
-## What's Ready Right Now
-
-### ✅ Framework Components (All Complete)
-- [x] Feature extraction (embeddings + patterns + structural)
-- [x] Three-tier adaptive classifier (hard rules → ML → LLM)
-- [x] Embedding cache and batch processing
-- [x] Processing pipeline with checkpointing
-- [x] LLM integration (Ollama ready, OpenAI compatible)
-- [x] Calibration workflow
-- [x] Export system (JSON/CSV)
-- [x] Provider sync (Gmail/IMAP framework)
-- [x] Learning systems (threshold + pattern learning)
-- [x] Complete CLI interface
-- [x] Comprehensive test suite
-
-### ❌ What Needs Your Input
-1. **Real Model** (50 MB file)
-   - Option: Train on Enron (~5-10 min, laptop-friendly)
-   - Option: Download pre-trained (~1 min)
-
-2. **Gmail Credentials** (OAuth JSON)
-   - Get from Google Cloud Console
-   - Place in project root as `credentials.json`
-
-3. **Real Data** (Already have: Enron dataset)
-   - Optional: Your own emails for better tuning
-
----
-
-## File Locations & Important Paths
-
-```
-Project Root: c:/Build Folder/email-sorter
-
-Key Files:
-├── src/
-│   ├── cli.py                        # Command-line interface
-│   ├── orchestration.py              # Main pipeline
-│   ├── classification/
-│   │   ├── feature_extractor.py      # Feature extraction
-│   │   ├── ml_classifier.py          # ML predictions
-│   │   ├── adaptive_classifier.py    # Three-tier orchestration
-│   │   └── embedding_cache.py        # Caching & batching
-│   ├── calibration/
-│   │   ├── trainer.py                # LightGBM trainer
-│   │   ├── enron_parser.py           # Parse Enron dataset
-│   │   └── workflow.py               # Calibration pipeline
-│   ├── processing/
-│   │   ├── bulk_processor.py         # Batch processing
-│   │   ├── queue_manager.py          # LLM queue
-│   │   └── attachment_handler.py     # PDF/DOCX extraction
-│   ├── llm/
-│   │   ├── ollama.py                 # Ollama integration
-│   │   └── openai_compat.py          # OpenAI API
-│   └── email_providers/
-│       ├── gmail.py                  # Gmail provider
-│       └── imap.py                   # IMAP provider
-│
-├── models/                           # (Will be created)
-│   └── pretrained/
-│       └── classifier.pkl            # Real model goes here
-│
-├── tools/
-│   ├── download_pretrained_model.py  # Download models
-│   └── setup_real_model.py           # Setup models
-│
-├── enron_mail_20150507/              # Enron dataset (already extracted)
-│
-├── tests/                            # 23 test cases
-├── config/                           # Configuration
-├── src/models/pretrained/            # (Will be created for real model)
-│
-└── Documentation:
-    ├── PROJECT_STATUS.md             # High-level overview
    ├── COMPLETION_ASSESSMENT.md      # Detailed component review
    ├── MODEL_INFO.md                 # Model usage guide
    └── NEXT_STEPS.md                 # This file
-```
-
----
-
-## Testing Your Setup
-
-### Framework Validation
-```bash
-# Test configuration loading
-python -m src.cli test-config
-
-# Test Ollama (if running locally)
-python -m src.cli test-ollama
-
-# Run full test suite
-pytest tests/ -v
-```
-
-### Mock Pipeline (No Real Data Needed)
-```bash
-python -m src.cli run --source mock --output test_results/
-```
-
-### Real Model Verification
-```bash
-python tools/setup_real_model.py --check
-```
-
-### Gmail Connection Test
-```bash
-python -m src.cli test-gmail
-```
-
----
-
-## Performance Expectations
-
-### With Mock Model (Testing)
-- Feature extraction: ~50-100ms per email
-- ML prediction: ~10-20ms per email
-- Total time for 100 emails: ~30-40 seconds
-
-### With Real Model (Production)
-- Feature extraction: ~50-100ms per email
-- ML prediction: ~5-10ms per email (LightGBM is faster)
-- LLM review (5% of emails): ~2-5 seconds per email
-- Total time for 80k emails: 15-25 minutes
-
-### Calibration Phase
-- Sampling: 1-2 minutes
-- LLM category discovery: 2-3 minutes
-- Model training: 5-10 minutes
-- Total: 10-15 minutes
-
----
-
-## Troubleshooting
-
-### Problem: "Model not found" but framework running
-**Solution**: This is normal - system uses mock model automatically
-```bash
-python tools/setup_real_model.py --check  # Shows current status
-```
-
-### Problem: Ollama tests failing
-**Solution**: Ollama is optional, LLM review will skip gracefully
-```bash
-# Not critical - framework has graceful fallback
-python -m src.cli run --source mock
-```
-
-### Problem: Gmail connection fails
-**Solution**: Gmail is optional, test with mock first
-```bash
-python -m src.cli run --source mock --output results/
-```
-
-### Problem: Low accuracy with mock model
-**Expected behavior**: Mock model is for framework testing only
-```python
-# Check model info
-from src.classification.ml_classifier import MLClassifier
-c = MLClassifier()
-print(c.get_info())  # Shows is_mock: True
-```
-
----
-
-## Decision Tree: What to Do Next
-
-```
-START
-│
-├─ Do you want to test the framework first?
-│  └─ YES → Run Path A (5 minutes)
-│     pytest tests/ -v
-│     python -m src.cli run --source mock
-│
-├─ Do you want to set up a real model?
-│  ├─ YES (TRAIN) → Run Path B1 (30-60 min)
-│  │  Train on Enron dataset
-│  │  python tools/setup_real_model.py --check
-│  │
-│  └─ YES (DOWNLOAD) → Run Path B2 (5 min)
-│     python tools/setup_real_model.py --model-path /path/to/model.pkl
-│
-├─ Do you want Gmail integration?
-│  └─ YES → Setup OAuth credentials
-│     Place credentials.json in project root
-│     python -m src.cli test-gmail
-│
-└─ Do you want to process all 80k emails?
-   └─ YES → Run Path C (2-3 hours)
-      python -m src.cli run --source gmail --output results/
-```
-
----
-
-## Success Criteria
-
-### ✅ Framework is Ready When:
-- [ ] `pytest tests/` shows 27/30 passing
-- [ ] `python -m src.cli test-config` succeeds
-- [ ] `python -m src.cli run --source mock` completes
-
-### ✅ Real Model is Ready When:
-- [ ] `python tools/setup_real_model.py --check` shows model found
-- [ ] `python -m src.cli run --source mock` shows `is_mock: False`
-- [ ] Test predictions work without errors
-
-### ✅ Gmail is Ready When:
-- [ ] `credentials.json` exists in project root
-- [ ] `python -m src.cli test-gmail` succeeds
-- [ ] Can fetch 10 emails from Gmail
-
-### ✅ Production is Ready When:
-- [ ] Real model integrated
-- [ ] Gmail credentials configured
-- [ ] Test run on 100 emails succeeds
-- [ ] Accuracy metrics are acceptable
-- [ ] Ready to process full dataset
-
----
-
-## Common Commands Reference
-
-```bash
-# Navigate to project
-cd "c:/Build Folder/email-sorter"
-source venv/Scripts/activate
-
-# Testing
-pytest tests/ -v                                # Run all tests
-pytest tests/test_feature_extraction.py -v     # Run specific test file
-
-# Configuration
-python -m src.cli test-config                   # Validate config
-python -m src.cli test-ollama                   # Test LLM provider
-python -m src.cli test-gmail                    # Test Gmail connection
-
-# Framework testing (mock)
-python -m src.cli run --source mock --output test_results/
-
-# Model setup
-python tools/setup_real_model.py --check                  # Check status
-python tools/setup_real_model.py --model-path /path/to/model  # Install model
-python tools/setup_real_model.py --info                   # Show info
-
-# Real processing (after setup)
-python -m src.cli run --source gmail --limit 100 --output test/
-python -m src.cli run --source gmail --output results/
-
-# Development
-python -m pytest tests/ --cov=src              # Coverage report
-python -m src.cli --help                       # Show all commands
-```
-
----
-
-## What NOT to Do
-
-❌ **Do NOT**:
-- Try to use mock model in production (it's not accurate)
-- Process all emails before testing with 100
-- Skip Gmail credential setup (use mock for testing instead)
-- Modify core classifier code (framework is complete)
-- Skip the test suite validation
-- Use Ollama if laptop is low on resources (graceful fallback available)
-
-✅ **DO**:
-- Test with mock first
-- Integrate real model before processing
-- Start with 100 emails then scale
-- Review results and adjust thresholds
-- Keep this file for reference
-- Use the tools provided for model integration
-
----
-
-## Support & Questions
-
-If something doesn't work:
-
-1. **Check logs**: All operations log to `logs/email_sorter.log`
-2. **Run tests**: `pytest tests/ -v` shows what's working
-3. **Check framework**: `python -m src.cli test-config` validates setup
-4. **Review docs**: See COMPLETION_ASSESSMENT.md for details
-
----
-
-## Timeline Estimate
-
-**What You Can Do Now:**
-- Framework validation: 5 minutes
-- Mock pipeline test: 10 minutes
-- Documentation review: 15 minutes
-
-**What You Can Do When Home:**
-- Real model training: 30-60 minutes
-- Gmail OAuth setup: 15-30 minutes
-- Full processing: 20-30 minutes
-
-**Total Time to Production**: 1.5-2 hours when you're home with better hardware
-
----
-
-## Summary
-
-Your Email Sorter framework is **100% complete and tested**. The next step is simply choosing:
-
-1. **Now**: Validate framework with mock model (5 min)
-2. **When home**: Integrate real model (30-60 min)
-3. **When ready**: Process all 80k emails (20-30 min)
-
-All tools are provided. All documentation is complete. Framework is ready to use.
-
-**Choose your path above and get started!**
diff --git a/docs/PROJECT_BLUEPRINT.md b/docs/PROJECT_BLUEPRINT.md
deleted file mode 100644
index 527190a..0000000
--- a/docs/PROJECT_BLUEPRINT.md
+++ /dev/null
@@ -1,1063 +0,0 @@
-# EMAIL SORTER - PROJECT BLUEPRINT
-**Hybrid ML/LLM Email Classification System**
-
-Version: 2.0
-Date: 2024-10-21
-Status: Research Complete - Ready to Build
-
----
-
-## EXECUTIVE SUMMARY
-
-**What it does:**
-Processes 80,000+ emails in ~17 minutes using a pre-trained ML model for bulk classification (90%+) and LLM (Ollama/OpenAI-compatible) for edge cases and startup calibration (~5-10%).
-
-**How it works:**
-1. Fresh repo clone per job (complete isolation)
-2. LLM analyzes sample to discover natural categories (calibration phase)
-3. Train LightGBM on embeddings + patterns + structural features
-4. ML sprints through high-confidence classifications
-5. Hard rules catch obvious patterns (OTP, invoices, etc.)
-6. LLM reviews only uncertain cases (batched efficiently)
-7. System self-tunes thresholds based on LLM feedback
-8. Export results and sync back to email provider
-9. Delete repo (cleanup)
-
-**Target use case:**
-Self-employed and business owners with 10k-100k+ neglected emails who need privacy-focused, one-time cleanup without cloud uploads or subscriptions.
-
-**Key innovation:**
-Hybrid approach with structured embeddings, hard pattern rules, and dynamic threshold adjustment. LLM is OPTIONAL - system degrades gracefully if unavailable.
- ---- - -## COMPETITIVE ANALYSIS (2024 Research) - -### Existing Solutions (ALL Cloud-Based) - -| Tool | Price | Accuracy | Privacy | Notes | -|------|-------|----------|---------|-------| -| SaneBox | $7-15/mo | ~85% | ❌ Cloud | AI filtering, requires upload | -| Clean Email | $10-30/mo | ~80% | ❌ Cloud | Smart folders, subscription | -| Spark | Free/Paid | ~75% | ❌ Cloud | Smart inbox, cloud sync | -| EmailTree.ai | Enterprise | ~90% | ❌ Cloud | NLP, for businesses | -| Mailstrom | $30-50/yr | ~70% | ❌ Cloud | Bulk analysis | - -### Our Competitive Advantages - -✅ **100% LOCAL** - No data leaves the machine -✅ **Privacy-first** - Perfect for business owners with sensitive data -✅ **One-time use** - No subscription, pay per job or DIY -✅ **Customizable** - Adapts to each inbox during calibration -✅ **Open source potential** - Distributable as Python wheel -✅ **Attachment analysis** - Competitors ignore this entirely -✅ **Offline capable** - Works without internet (after initial setup) - -### Benchmark Performance (2024 Research) - -**Enron Dataset (industry standard):** -- Traditional ML (SVM, Random Forest): 95-98% -- Deep Learning (DNN-BiLSTM): 98.69% -- Transformers (BERT, RoBERTa): ~99% -- LLMs (GPT-4): 99.7% (phishing detection) -- Ensemble methods: 98.8% - -**Our Target:** 94-96% accuracy (competitive, privacy-focused, local) - ---- - -## ARCHITECTURE - -### Three-Phase Pipeline - -``` -┌─────────────────────────────────────────────────────────────┐ -│ PHASE 1: CALIBRATION (3-5 minutes) │ -├─────────────────────────────────────────────────────────────┤ -│ 1. Sample 1500 emails (stratified sampling) │ -│ 2. LLM analyzes patterns and discovers categories │ -│ Model: qwen3:4b (bigger, more accurate) │ -│ Alternative: Compress to 500 emails + smarter batching │ -│ 3. Map discovered → universal categories │ -│ 4. Generate training labels for embedding classifier │ -│ 5. Validate on 300 emails │ -│ 6. Set initial confidence thresholds │ -│ 7. Train LightGBM on embeddings + patterns │ -└─────────────────────────────────────────────────────────────┘ - ↓ -┌─────────────────────────────────────────────────────────────┐ -│ PHASE 2: BULK PROCESSING (10-12 minutes) │ -├─────────────────────────────────────────────────────────────┤ -│ For each email: │ -│ → Pattern detection (regex, <1ms) │ -│ → Hard rule match? → INSTANT (10% of emails) │ -│ → Generate structured embedding (batched, 8 min total) │ -│ → LightGBM classify with confidence score │ -│ → IF confidence >= threshold: ACCEPT (85%) │ -│ → IF confidence < threshold: QUEUE for LLM (5%) │ -│ │ -│ Every 1000 emails or queue full: │ -│ → Process LLM batch (qwen3:1.7b, fast) │ -│ → Analyze agreement rate │ -│ → Adjust thresholds dynamically │ -│ → Learn sender rules │ -│ → Save checkpoint │ -└─────────────────────────────────────────────────────────────┘ - ↓ -┌─────────────────────────────────────────────────────────────┐ -│ PHASE 3: FINALIZATION (2-3 minutes) │ -├─────────────────────────────────────────────────────────────┤ -│ 1. Process remaining LLM queue │ -│ 2. Export results (JSON/CSV) │ -│ 3. Sync to email provider (Gmail labels, IMAP folders) │ -│ 4. Generate classification report │ -│ 5. Cleanup (delete repo, temp files) │ -└─────────────────────────────────────────────────────────────┘ -``` - ---- - -## CORE COMPONENTS - -### 1. Hybrid Feature Extraction (THE SECRET SAUCE) - -Combines three feature types for maximum accuracy: - -#### A. 
Sentence Embeddings (Semantic Understanding) -```python -from sentence_transformers import SentenceTransformer - -embedder = SentenceTransformer('all-MiniLM-L6-v2') # 384 dimensions - -# Structured embedding with parameterized headers -def build_embedding_text(email, patterns): - return f"""[EMAIL_METADATA] -sender_type: {email.sender_domain_type} -time_category: {email.time_of_day} -has_attachments: {email.has_attachments} -attachment_types: {email.attachment_types} - -[DETECTED_PATTERNS] -has_otp: {patterns['has_otp']} -has_invoice: {patterns['has_invoice']} -has_unsubscribe: {patterns['has_unsubscribe']} -is_automated: {patterns['is_noreply']} -has_meeting: {patterns['has_meeting']} - -[CONTENT] -subject: {email.subject} -body: {email.body_snippet[:300]} -""" - -text = build_embedding_text(email, patterns) -embedding = embedder.encode(text) # → 384-dim vector -``` - -**Why this works:** -- Model sees STRUCTURE, not just raw text -- Pattern hints guide semantic understanding -- Research shows 5-10% accuracy boost vs naive embedding -- Handles semantic variants: "meeting" = "call" = "zoom" - -#### B. Hard Pattern Rules (Fast Deterministic) -```python -# ~20 boolean/numerical features extracted via regex -patterns = { - # Authentication patterns - 'has_otp': bool(re.search(r'\b\d{4,6}\b', text)), - 'has_verification': 'verification' in text.lower(), - 'has_reset_password': 'reset password' in text.lower(), - - # Transactional patterns - 'has_invoice': bool(re.search(r'invoice\s*#?\d+', text, re.I)), - 'has_receipt': 'receipt' in text.lower(), - 'has_price': bool(re.search(r'\$\d+', text)), - 'has_order_number': bool(re.search(r'order\s*#?\d+', text, re.I)), - - # Newsletter/marketing patterns - 'has_unsubscribe': 'unsubscribe' in text.lower(), - 'has_view_in_browser': 'view in browser' in text.lower(), - - # Meeting/calendar patterns - 'has_meeting': bool(re.search(r'(meeting|call|zoom|teams)', text, re.I)), - 'has_calendar': 'calendar' in text.lower(), - - # Other patterns - 'has_tracking': bool(re.search(r'tracking\s*(number|#)', text, re.I)), - 'is_automated': email.sender_domain_type == 'noreply', - 'has_signature': bool(re.search(r'(regards|sincerely|best)', text, re.I)), -} -``` - -#### C. 
Structural Features (Metadata)
-```python
-# ~20 numerical/categorical features
-structural = {
-    # Sender analysis
-    'sender_domain': extract_domain(email.sender),
-    'sender_domain_type': categorize_domain(email.sender),  # freemail/corporate/noreply
-    'is_noreply': 'noreply' in email.sender.lower(),
-
-    # Timing
-    'time_of_day': categorize_hour(email.date.hour),  # night/morning/afternoon/evening
-    'day_of_week': email.date.strftime('%A').lower(),
-
-    # Content structure
-    'subject_length': len(email.subject),
-    'body_length': len(email.body),
-    'link_count': len(re.findall(r'https?://', email.body)),
-    'image_count': len(re.findall(r'<img', email.body)),
-}
-```
-
-#### D. Attachment Analysis
-```python
-def extract_attachment_features(attachments):
-    features = {
-        'has_attachments': len(attachments) > 0,
-        'attachment_count': len(attachments),
-        'total_size': sum(a['size'] for a in attachments),
-        'attachment_types': []
-    }
-
-    for attachment in attachments:
-        mime_type = attachment.get('mime_type', '')
-        filename = attachment.get('filename', '')
-
-        # Type categorization
-        if 'pdf' in mime_type or filename.endswith('.pdf'):
-            features['attachment_types'].append('pdf')
-
-            # Extract text from PDF if small enough (<5MB)
-            if attachment['size'] < 5_000_000:
-                text = extract_pdf_text(attachment)
-                features['pdf_has_invoice'] = bool(re.search(r'invoice|bill', text, re.I))
-                features['pdf_has_account'] = bool(re.search(r'account\s*#?\d+', text, re.I))
-
-        elif 'word' in mime_type or filename.endswith(('.doc', '.docx')):
-            features['attachment_types'].append('docx')
-
-        elif 'excel' in mime_type or filename.endswith(('.xls', '.xlsx')):
-            features['attachment_types'].append('xlsx')
-
-        elif 'image' in mime_type or filename.endswith(('.png', '.jpg', '.jpeg')):
-            features['attachment_types'].append('image')
-
-    return features
-```
-
-**Why this matters:**
-- Business emails often have invoice PDFs, contract DOCXs
-- Detecting "PDF with INVOICE text" → instant "transactional" classification
-- Competitors ignore attachments entirely = our differentiator
-
-#### Combined Feature Vector
-```python
-# Total: ~434 dimensions (vs 10,000 with TF-IDF!)
-final_features = np.concatenate([
-    embedding,           # 384 dims (semantic understanding)
-    pattern_values,      # 20 dims (hard rules)
-    structural_values,   # 20 dims (metadata)
-    attachment_values    # 10 dims (NEW!)
-])
-```
-
----
-
-### 2. 
LightGBM Classifier (Research-Backed Choice) - -**Why LightGBM over XGBoost:** -- ✅ **Native categorical handling** (no encoding needed) -- ✅ **2-5x faster** on mixed feature types -- ✅ **4x speedup** with categorical + numerical features -- ✅ **Better memory efficiency** -- ✅ **Equivalent accuracy** to XGBoost -- ✅ **Perfect for embeddings** (dense numerical) + categoricals - -```python -import lightgbm as lgb -import numpy as np - -class HybridClassifier: - def __init__(self, categories): - self.categories = categories - self.embedder = SentenceTransformer('all-MiniLM-L6-v2') - self.model = None - - def extract_features(self, email): - """Extract all feature types""" - patterns = extract_patterns(email) - structural = extract_structural(email) - - # Structured embedding with rich context - text = build_embedding_text(email, patterns) - embedding = self.embedder.encode(text) - - # Combine features - features = { - 'embedding': embedding, # 384 numerical - 'patterns': patterns, # 20 numerical/boolean - 'structural': structural # 20 numerical/categorical - } - - return features - - def train(self, emails, labels): - """Train on LLM-labeled data from calibration""" - # Extract features - all_features = [self.extract_features(e) for e in emails] - - # Build feature matrix - X = np.array([ - np.concatenate([ - f['embedding'], - list(f['patterns'].values()), - [f['structural'][k] for k in numerical_keys] - ]) - for f in all_features - ]) - - # Categorical feature indices - categorical_features = ['sender_domain_type', 'time_of_day', 'day_of_week'] - - # Train LightGBM - self.model = lgb.LGBMClassifier( - categorical_feature=categorical_features, - n_estimators=200, - learning_rate=0.1, - max_depth=8, - num_leaves=31, - objective='multiclass', - num_class=len(self.categories) - ) - - self.model.fit(X, labels) - - def predict(self, email): - """Predict with confidence""" - features = self.extract_features(email) - X = build_feature_vector(features) - - # Get probabilities - probs = self.model.predict_proba([X])[0] - pred_class = np.argmax(probs) - - return { - 'category': self.categories[pred_class], - 'confidence': float(probs[pred_class]), - 'probabilities': { - self.categories[i]: float(probs[i]) - for i in range(len(self.categories)) - } - } -``` - ---- - -### 3. 
LLM Integration (Flexible & Optional) - -**Model Strategy:** - -| Phase | Model | Speed | Purpose | -|-------|-------|-------|---------| -| Calibration | **qwen3:4b** | Slower | Better category discovery, 1500 emails | -| Classification | **qwen3:1.7b** | Fast | Quick review, only ~5% of emails | -| Optional | **qwen3:30b** | Slowest | Maximum accuracy if needed | - -**Configuration (Single Source of Truth):** -```yaml -# config/llm_models.yaml -llm: - # Provider type: ollama, openai, anthropic - provider: "ollama" - - # Ollama settings - ollama: - base_url: "http://localhost:11434" - calibration_model: "qwen3:4b" # Bigger for better discovery - classification_model: "qwen3:1.7b" # Smaller for speed - temperature: 0.1 - max_tokens: 500 - timeout: 30 - retry_attempts: 3 - - # OpenAI-compatible API (future-proof) - openai: - base_url: "https://api.openai.com/v1" # Or custom endpoint - api_key: "${OPENAI_API_KEY}" - calibration_model: "gpt-4o-mini" - classification_model: "gpt-4o-mini" - temperature: 0.1 - max_tokens: 500 - - # Graceful degradation - fallback: - enabled: true - # If LLM unavailable, emails go to "needs_review" folder - # ML still works, just more conservative thresholds -``` - -**LLM Provider Abstraction:** -```python -from abc import ABC, abstractmethod - -class BaseLLMProvider(ABC): - @abstractmethod - def complete(self, prompt: str, **kwargs) -> str: - pass - - @abstractmethod - def test_connection(self) -> bool: - pass - -class OllamaProvider(BaseLLMProvider): - def __init__(self, base_url: str, model: str): - import ollama - self.client = ollama.Client(host=base_url) - self.model = model - - def complete(self, prompt: str, **kwargs) -> str: - response = self.client.generate( - model=self.model, - prompt=prompt, - options={ - 'temperature': kwargs.get('temperature', 0.1), - 'num_predict': kwargs.get('max_tokens', 500) - } - ) - return response['response'] - - def test_connection(self) -> bool: - try: - self.client.list() - return True - except: - return False - -class OpenAIProvider(BaseLLMProvider): - def __init__(self, base_url: str, api_key: str, model: str): - from openai import OpenAI - self.client = OpenAI(base_url=base_url, api_key=api_key) - self.model = model - - def complete(self, prompt: str, **kwargs) -> str: - response = self.client.chat.completions.create( - model=self.model, - messages=[{"role": "user", "content": prompt}], - temperature=kwargs.get('temperature', 0.1), - max_tokens=kwargs.get('max_tokens', 500) - ) - return response.choices[0].message.content - - def test_connection(self) -> bool: - try: - self.client.models.list() - return True - except: - return False - -def get_llm_provider(config) -> BaseLLMProvider: - """Factory to create LLM provider based on config""" - provider_type = config['llm']['provider'] - - if provider_type == 'ollama': - return OllamaProvider( - base_url=config['llm']['ollama']['base_url'], - model=config['llm']['ollama']['classification_model'] - ) - elif provider_type == 'openai': - return OpenAIProvider( - base_url=config['llm']['openai']['base_url'], - api_key=os.getenv('OPENAI_API_KEY'), - model=config['llm']['openai']['classification_model'] - ) - else: - raise ValueError(f"Unknown provider: {provider_type}") -``` - -**Graceful Degradation (LLM Optional):** -```python -class AdaptiveClassifier: - def __init__(self, ml_model, llm_classifier, config): - self.ml_model = ml_model - self.llm_classifier = llm_classifier - self.llm_available = self._test_llm_connection() - self.config = config - - if not self.llm_available: - 
logger.warning("LLM unavailable - using conservative thresholds") - self.default_threshold = 0.85 # Higher threshold without LLM - else: - self.default_threshold = 0.75 - - def _test_llm_connection(self): - """Check if LLM is available""" - if not self.llm_classifier: - return False - try: - return self.llm_classifier.test_connection() - except: - return False - - def classify(self, email, features): - """Classify with or without LLM""" - # ML classification - ml_result = self.ml_model.predict(features) - - # Check hard rules first - if self._has_hard_rule_match(email): - return ClassificationResult( - category=self._get_rule_category(email), - confidence=0.99, - method='rule' - ) - - # High confidence ML result - if ml_result['confidence'] >= self.default_threshold: - return ClassificationResult( - category=ml_result['category'], - confidence=ml_result['confidence'], - method='ml' - ) - - # Low confidence - try LLM if available - if self.llm_available: - return ClassificationResult( - category=ml_result['category'], - confidence=ml_result['confidence'], - method='ml', - needs_review=True # Queue for LLM - ) - else: - # No LLM - mark for manual review - return ClassificationResult( - category='needs_review', - confidence=ml_result['confidence'], - method='ml', - needs_review=True, - metadata={'ml_prediction': ml_result} - ) -``` - ---- - -### 4. Universal Categories (12 Total) - -```python -categories = { - 'junk': { - 'description': 'Spam, unwanted marketing, phishing', - 'patterns': ['unsubscribe', 'click here', 'limited time'], - 'threshold': 0.85 # High confidence needed - }, - 'transactional': { - 'description': 'Receipts, invoices, confirmations, order tracking', - 'patterns': ['receipt', 'invoice', 'order', 'shipped', 'tracking'], - 'threshold': 0.80 - }, - 'auth': { - 'description': 'OTPs, password resets, 2FA codes, security alerts', - 'patterns': ['verification code', 'otp', 'reset password', r'\d{4,6}'], - 'threshold': 0.90 # Very high - important emails - }, - 'newsletters': { - 'description': 'Subscribed newsletters, marketing emails', - 'patterns': ['newsletter', 'weekly digest', 'monthly update'], - 'threshold': 0.75 - }, - 'social': { - 'description': 'Social media notifications, mentions, friend requests', - 'patterns': ['mentioned you', 'friend request', 'liked your'], - 'threshold': 0.75 - }, - 'automated': { - 'description': 'System notifications, alerts, no-reply messages', - 'patterns': ['automated', 'system notification', 'do not reply'], - 'threshold': 0.80 - }, - 'conversational': { - 'description': 'Human-to-human correspondence, replies, discussions', - 'patterns': ['hi', 'hello', 'thanks', 'regards'], - 'threshold': 0.65 # Lower - varied language - }, - 'work': { - 'description': 'Business correspondence, meetings, projects', - 'patterns': ['meeting', 'project', 'deadline', 'team'], - 'threshold': 0.70 - }, - 'personal': { - 'description': 'Friends and family, personal matters', - 'patterns': ['love', 'family', 'dinner', 'weekend'], - 'threshold': 0.70 - }, - 'finance': { - 'description': 'Bank statements, credit cards, investments, bills', - 'patterns': ['statement', 'balance', 'account', 'payment due'], - 'threshold': 0.85 # High - sensitive - }, - 'travel': { - 'description': 'Flight bookings, hotels, reservations, itineraries', - 'patterns': ['flight', 'booking', 'reservation', 'check-in'], - 'threshold': 0.80 - }, - 'unknown': { - 'description': "Doesn't fit any category (requires review)", - 'patterns': [], - 'threshold': 0.50 # Catch-all - } -} -``` - 
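-Because every category carries its own threshold, acceptance is a per-category check rather than one global cutoff. A minimal sketch, assuming the `categories` dict above and the `predict()` result shape shown earlier (`accept_prediction` is an illustrative name):
-
-```python
-def accept_prediction(result, categories, default_threshold=0.75):
-    """Return the category only if the ML prediction clears its own threshold."""
-    threshold = categories.get(result['category'], {}).get('threshold', default_threshold)
-    if result['confidence'] >= threshold:
-        return result['category']
-    return None  # below threshold: queue for LLM review
-```
-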
---- - -## MODULAR ARCHITECTURE - -### Tiered Dependencies - -```python -# setup.py -setup( - name="email-sorter", - version="1.0.0", - install_requires=[ - # CORE (always required) - "numpy>=1.24.0", - "pandas>=2.0.0", - "scikit-learn>=1.3.0", - "lightgbm>=4.0.0", - "sentence-transformers>=2.2.0", - "pydantic>=2.0.0", - "pyyaml>=6.0", - "click>=8.1.0", - "rich>=13.0.0", - "tqdm>=4.66.0", - "tenacity>=8.2.0", - ], - extras_require={ - # Email providers (optional) - "gmail": [ - "google-api-python-client>=2.100.0", - "google-auth-httplib2>=0.1.1", - "google-auth-oauthlib>=1.1.0", - ], - "microsoft": [ - "msal>=1.24.0", - ], - "imap": [ - "imapclient>=2.3.1", - ], - - # LLM providers (optional) - "ollama": [ - "ollama>=0.1.0", - ], - "openai": [ - "openai>=1.0.0", - ], - - # Attachment processing (optional) - "attachments": [ - "PyPDF2>=3.0.0", - "python-docx>=0.8.11", - "openpyxl>=3.0.10", - ], - - # Development (optional) - "dev": [ - "pytest>=7.4.0", - "pytest-cov>=4.1.0", - "pytest-mock>=3.11.0", - "black>=23.0.0", - "isort>=5.12.0", - ], - - # All extras - "all": [ - # Combines all above - ] - } -) -``` - -**Installation options:** -```bash -# Minimal (ML only, no LLM, no email providers) -pip install email-sorter - -# With Gmail support -pip install email-sorter[gmail] - -# With Ollama LLM -pip install email-sorter[ollama,gmail] - -# Everything -pip install email-sorter[all] -``` - ---- - -## TESTING STRATEGY - -### Test Harness Structure - -``` -tests/ -├── unit/ -│ ├── test_feature_extraction.py -│ ├── test_pattern_matching.py -│ ├── test_embeddings.py -│ ├── test_lightgbm.py -│ └── test_attachment_analysis.py -├── integration/ -│ ├── test_calibration.py -│ ├── test_ml_llm_pipeline.py -│ ├── test_gmail_provider.py -│ └── test_checkpoint_resume.py -├── e2e/ -│ ├── test_full_pipeline_100.py -│ ├── test_full_pipeline_1000.py -│ └── test_full_pipeline_80k.py -├── fixtures/ -│ ├── mock_emails.json -│ ├── mock_llm_responses.json -│ └── sample_inboxes/ -└── conftest.py -``` - -### Unit Tests -```python -# tests/unit/test_feature_extraction.py -import pytest -from src.classification.feature_extractor import FeatureExtractor -from src.email_providers.base import Email - -def test_pattern_extraction(): - email = Email( - id='1', - subject='Your verification code is 123456', - sender='noreply@service.com', - body='Your one-time password is 123456' - ) - - extractor = FeatureExtractor() - patterns = extractor._extract_patterns(email) - - assert patterns['has_otp'] == True - assert patterns['has_verification'] == True - assert patterns['is_automated'] == True - -def test_structured_embedding(): - email = Email( - id='2', - subject='Invoice #12345', - sender='billing@company.com', - body='Please find attached your invoice' - ) - - extractor = FeatureExtractor() - text = extractor.build_embedding_text(email) - - assert '[EMAIL_METADATA]' in text - assert '[DETECTED_PATTERNS]' in text - assert 'has_invoice: True' in text -``` - -### Integration Tests -```python -# tests/integration/test_ml_llm_pipeline.py -def test_calibration_then_classification(): - # 1. Load sample emails - emails = load_sample_emails(count=100) - - # 2. Run calibration (with mock LLM) - calibrator = CalibrationPhase(mock_llm_provider) - config = calibrator.run(emails) - - # 3. Train classifier - classifier = HybridClassifier() - classifier.train(emails, config['labels']) - - # 4. Classify new emails - new_emails = load_sample_emails(count=20, exclude=emails) - results = [classifier.predict(e) for e in new_emails] - - # 5. 
Assert accuracy - accuracy = calculate_accuracy(results, ground_truth) - assert accuracy > 0.85 -``` - -### E2E Tests -```python -# tests/e2e/test_full_pipeline_100.py -def test_full_pipeline_100_emails(tmp_path): - """End-to-end test on 100 emails""" - # Setup - output_dir = tmp_path / "results" - emails = load_test_inbox(count=100) - - # Run full pipeline - result = run_email_sorter( - emails=emails, - output=output_dir, - config="tests/fixtures/test_config.yaml" - ) - - # Assertions - assert result['total_processed'] == 100 - assert result['accuracy_estimate'] > 0.90 - assert (output_dir / "results.json").exists() - assert (output_dir / "report.txt").exists() -``` - ---- - -## PERFORMANCE EXPECTATIONS (Updated with Research) - -### For 80,000 emails: - -| Phase | Time | Details | -|-------|------|---------| -| **Calibration** | 3-5 min | 1500 emails, qwen3:4b, train LightGBM | -| Pattern detection | ~10 sec | Regex on all 80k emails | -| Embedding generation | ~8 min | Batched, CPU, all 80k emails | -| LightGBM classification | ~3 sec | Fast inference | -| Hard rules auto-classify | instant | 10% = 8,000 emails | -| LLM review (qwen3:1.7b) | ~4 min | 5% = 4,000 emails, batched | -| Export & sync | ~2 min | JSON/CSV + Gmail API | -| **TOTAL** | **~17 min** | | - -### Accuracy Breakdown: - -| Component | Coverage | Accuracy | -|-----------|----------|----------| -| Hard rules | 10% | 99% | -| LightGBM (high conf) | 85% | 92% | -| LLM review | 5% | 95% | -| **Overall** | **100%** | **94-96%** | - -### Memory Usage (80k emails): -- Email data: ~400MB -- Embeddings (cached): ~500MB -- LightGBM model: ~5MB -- MiniLM model: ~90MB -- Peak: ~1.2GB - ---- - -## DISTRIBUTABLE WHEEL PACKAGING - -### Package Structure -``` -email-sorter/ -├── setup.py -├── setup.cfg -├── pyproject.toml -├── MANIFEST.in -├── README.md -├── LICENSE -├── src/ -│ └── email_sorter/ -│ ├── __init__.py -│ ├── __main__.py -│ ├── cli.py -│ └── ... (all modules) -├── config/ -│ ├── default_config.yaml -│ ├── categories.yaml -│ └── llm_models.yaml -└── models/ - └── pretrained/ - ├── minilm-l6-v2/ (bundled embedder) - └── lightgbm.pkl (optional pre-trained) -``` - -### Distribution Commands -```bash -# Build wheel -python setup.py sdist bdist_wheel - -# Install locally -pip install dist/email_sorter-1.0.0-py3-none-any.whl - -# Use as command -email-sorter --source gmail --credentials creds.json --output results/ - -# Or as module -python -m email_sorter --source gmail ... 
-``` - -### CLI Interface -```bash -email-sorter --help - -# Basic usage -email-sorter \ - --source gmail \ - --credentials credentials.json \ - --output results/ - -# Advanced options -email-sorter \ - --source gmail \ - --credentials creds.json \ - --output results/ \ - --config custom_config.yaml \ - --llm-provider ollama \ - --llm-model qwen3:1.7b \ - --limit 1000 \ - --no-calibrate \ - --dry-run -``` - ---- - -## PROJECT STRUCTURE - -``` -email-sorter/ -├── README.md -├── PROJECT_BLUEPRINT.md # This file -├── BUILD_INSTRUCTIONS.md -├── RESEARCH_FINDINGS.md -├── setup.py -├── setup.cfg -├── pyproject.toml -├── requirements.txt -├── .gitignore -├── .env.example -├── config/ -│ ├── default_config.yaml -│ ├── categories.yaml -│ ├── llm_models.yaml # LLM config (single source) -│ └── features.yaml -├── src/ -│ ├── __init__.py -│ ├── __main__.py -│ ├── cli.py # Click CLI -│ ├── calibration/ -│ │ ├── __init__.py -│ │ ├── sampler.py # Stratified sampling -│ │ ├── llm_analyzer.py # LLM calibration -│ │ └── trainer.py # Train LightGBM -│ ├── classification/ -│ │ ├── __init__.py -│ │ ├── feature_extractor.py # Hybrid features -│ │ ├── pattern_matcher.py # Hard rules -│ │ ├── embedder.py # Sentence embeddings -│ │ ├── lightgbm_classifier.py -│ │ ├── adaptive_classifier.py -│ │ └── llm_classifier.py -│ ├── models/ -│ │ ├── __init__.py -│ │ ├── pretrained/ -│ │ │ └── .gitkeep -│ │ └── model_loader.py -│ ├── email_providers/ -│ │ ├── __init__.py -│ │ ├── base.py -│ │ ├── gmail.py -│ │ ├── microsoft.py -│ │ └── imap.py -│ ├── llm/ -│ │ ├── __init__.py -│ │ ├── base.py # Abstract provider -│ │ ├── ollama.py -│ │ └── openai.py -│ ├── processing/ -│ │ ├── __init__.py -│ │ ├── bulk_processor.py -│ │ ├── attachment_handler.py -│ │ └── queue_manager.py -│ ├── adjustment/ -│ │ ├── __init__.py -│ │ ├── threshold_adjuster.py -│ │ └── pattern_learner.py -│ ├── export/ -│ │ ├── __init__.py -│ │ ├── results_exporter.py -│ │ ├── provider_sync.py -│ │ └── report_generator.py -│ └── utils/ -│ ├── __init__.py -│ ├── config.py -│ ├── logging.py -│ └── cleanup.py -├── tests/ -│ ├── unit/ -│ ├── integration/ -│ ├── e2e/ -│ ├── fixtures/ -│ └── conftest.py -├── prompts/ -│ ├── calibration.txt -│ └── classification.txt -├── scripts/ -│ ├── train_model.py -│ ├── verify_install.py -│ └── benchmark.py -├── data/ -│ └── samples/ -└── logs/ - └── .gitkeep -``` - ---- - -## SECURITY & PRIVACY - -✅ **All processing is local** - No cloud uploads -✅ **LLM runs locally** - Via Ollama (or optional OpenAI API) -✅ **Fresh clone per job** - Complete isolation -✅ **No persistent storage** - Email bodies never written to disk -✅ **Attachment content** - Processed in memory, discarded immediately -✅ **Auto cleanup** - Temp files deleted after processing -✅ **Credentials** - Used directly, never cached -✅ **GDPR-friendly** - No data retention or sharing - ---- - -## SUCCESS CRITERIA - -✅ Processes 80k emails in <20 minutes -✅ 94-96% classification accuracy (competitive with cloud tools) -✅ <5% emails need LLM review -✅ Successfully syncs back to Gmail/IMAP -✅ No data leakage between jobs -✅ Works on Windows, Linux, macOS -✅ LLM is optional (graceful degradation) -✅ Distributable as Python wheel -✅ Attachment analysis working -✅ OpenAI-compatible API support - ---- - -## WHAT'S NEXT - -1. ✅ Research complete (benchmarks, competition, LightGBM vs XGBoost) -2. ⏭ Update BUILD_INSTRUCTIONS.md with new architecture -3. ⏭ Create RESEARCH_FINDINGS.md with search results -4. ⏭ Build core infrastructure (config, logging, data models) -5. 
⏭ Implement feature extraction (embeddings + patterns + attachments) -6. ⏭ Create LightGBM classifier -7. ⏭ Implement LLM providers (Ollama + OpenAI-compatible) -8. ⏭ Build calibration system -9. ⏭ Create test harness -10. ⏭ Package as wheel -11. ⏭ Test on Marion's 80k emails - ---- - -**END OF BLUEPRINT v2.0** - -This is the complete, research-backed architecture ready to build. diff --git a/docs/PROJECT_COMPLETE.md b/docs/PROJECT_COMPLETE.md deleted file mode 100644 index 4a9657f..0000000 --- a/docs/PROJECT_COMPLETE.md +++ /dev/null @@ -1,566 +0,0 @@ -# EMAIL SORTER - PROJECT COMPLETE - -**Date**: October 21, 2025 -**Status**: FEATURE COMPLETE - Ready to Use -**Framework Maturity**: All Features Implemented -**Test Coverage**: 90% (27/30 passing) -**Code Quality**: Full Type Hints and Comprehensive Error Handling - ---- - -## The Bottom Line - -✅ **Email Sorter framework is 100% complete and ready to use** - -All 16 planned development phases are implemented. The system is ready to process Marion's 80k+ emails with high accuracy. All you need to do is: - -1. Optionally integrate a real LightGBM model (tools provided) -2. Set up Gmail OAuth credentials (when ready) -3. Run the pipeline - -That's it. No more building. No more architecture decisions. Framework is done. - ---- - -## What You Have - -### Core System (Ready to Use) -- ✅ 38 Python modules (~6,000 lines of code) -- ✅ 12-category email classifier -- ✅ Hybrid ML/LLM classification system -- ✅ Smart feature extraction (embeddings + patterns + structure) -- ✅ Processing pipeline with checkpointing -- ✅ Gmail and IMAP sync capabilities -- ✅ Model training framework -- ✅ Learning systems (threshold + pattern adjustment) - -### Tools (Ready to Use) -- ✅ CLI interface (`python -m src.cli --help`) -- ✅ Model download tool (`tools/download_pretrained_model.py`) -- ✅ Model setup tool (`tools/setup_real_model.py`) -- ✅ Test suite (23 tests, 90% pass rate) - -### Documentation (Complete) -- ✅ PROJECT_STATUS.md - Feature inventory -- ✅ COMPLETION_ASSESSMENT.md - Detailed evaluation -- ✅ MODEL_INFO.md - Model usage guide -- ✅ NEXT_STEPS.md - Action plan -- ✅ README.md - Getting started -- ✅ Full API documentation via docstrings - -### Data (Ready) -- ✅ Enron dataset extracted (569MB, real emails) -- ✅ Mock provider for testing -- ✅ Test data sets - ---- - -## What's Different From Before - -When we started, there were **16 planned phases** with many unknowns. Now: - -| Phase | Status | Details | -|-------|--------|---------| -| 1-3 | ✅ DONE | Infrastructure, config, logging | -| 4 | ✅ DONE | Email providers (Gmail, IMAP, Mock) | -| 5 | ✅ DONE | Feature extraction (embeddings + patterns) | -| 6 | ✅ DONE | ML classifier (mock + LightGBM framework) | -| 7 | ✅ DONE | LLM integration (Ollama + OpenAI) | -| 8 | ✅ DONE | Adaptive classifier (3-tier system) | -| 9 | ✅ DONE | Processing pipeline (checkpointing) | -| 10 | ✅ DONE | Calibration system | -| 11 | ✅ DONE | Export & reporting | -| 12 | ✅ DONE | Learning systems | -| 13 | ✅ DONE | Advanced processing | -| 14 | ✅ DONE | Provider sync | -| 15 | ✅ DONE | Orchestration | -| 16 | ✅ DONE | Packaging | -| 17 | ✅ DONE | Testing | - -**Every. Single. Phase. 
Complete.** - ---- - -## Test Results - -``` -======================== Final Test Results ========================== - -PASSED: 27/30 (90% success rate) - -Core Components ✅ - - Email models and validation - - Configuration system - - Feature extraction (embeddings + patterns + structure) - - ML classifier (mock + loading) - - Adaptive three-tier classifier - - LLM providers (Ollama + OpenAI) - - Queue management with persistence - - Bulk processing with checkpointing - - Email sampling and analysis - - Threshold learning - - Pattern learning - - Results export (JSON/CSV) - - Provider sync (Gmail/IMAP) - - End-to-end pipeline - -KNOWN ISSUES (3 - All Expected & Documented): - ❌ test_e2e_checkpoint_resume - Reason: Feature count mismatch between mock and real model - Impact: Only relevant when upgrading to real model - Status: Expected and acceptable - - ❌ test_e2e_enron_parsing - Reason: Parser needs validation against actual maildir format - Impact: Validation needed during training phase - Status: Parser works, needs Enron dataset validation - - ❌ test_pattern_detection_invoice - Reason: Minor regex doesn't match "bill #456" - Impact: Cosmetic issue in test data - Status: No production impact, easy to fix if needed - -WARNINGS: 16 (All Pydantic deprecation - cosmetic, code works fine) - -Duration: ~90 seconds -Coverage: All critical paths -Quality: Comprehensive with full type hints -``` - ---- - -## Project Metrics - -``` -CODEBASE - - Python Modules: 38 files - - Lines of Code: ~6,000+ - - Type Hints: 100% coverage - - Docstrings: Comprehensive - - Error Handling: All critical paths - - Logging: Rich + file output - -TESTING - - Unit Tests: 23 tests - - Test Files: 6 suites - - Pass Rate: 90% (27/30) - - Coverage: All core features - - Execution Time: ~90 seconds - -ARCHITECTURE - - Core Modules: 16 major components - - Email Providers: 3 (Mock, Gmail, IMAP) - - Classifiers: 3 (Hard rules, ML, LLM) - - Processing Layers: 5 (Extract, Classify, Learn, Export, Sync) - - Learning Systems: 2 (Threshold, Patterns) - -DEPENDENCIES - - Direct: 42 packages - - Python Version: 3.8+ - - Key Libraries: LightGBM, sentence-transformers, Ollama, Google API - -GIT HISTORY - - Commits: 14 total - - Build Path: Clear progression through all phases - - Latest Additions: Model integration tools + documentation -``` - ---- - -## System Architecture - -``` -┌─────────────────────────────────────────────────────────────┐ -│ EMAIL SORTER v1.0 - COMPLETE │ -├─────────────────────────────────────────────────────────────┤ -│ -│ INPUT LAYER -│ ├── Gmail Provider (OAuth, ready for credentials) -│ ├── IMAP Provider (generic mail servers) -│ ├── Mock Provider (for testing) -│ └── Enron Dataset (real email data, 569MB) -│ -│ FEATURE EXTRACTION -│ ├── Semantic embeddings (384D, all-MiniLM-L6-v2) -│ ├── Hard pattern matching (20+ patterns) -│ ├── Structural features (metadata, timing, attachments) -│ ├── Caching system (MD5-based, disk + memory) -│ └── Batch processing (parallel, efficient) -│ -│ CLASSIFICATION ENGINE (3-Tier Adaptive) -│ ├── Tier 1: Hard Rules (instant, ~10%, 94-96% accuracy) -│ │ - Pattern detection -│ │ - Sender analysis -│ │ - Content matching -│ │ -│ ├── Tier 2: ML Classifier (fast, ~85%, 85-90% accuracy) -│ │ - LightGBM gradient boosting (production model) -│ │ - Mock Random Forest (testing) -│ │ - Serializable for deployment -│ │ -│ └── Tier 3: LLM Review (careful, ~5%, 92-95% accuracy) -│ - Ollama (local, recommended) -│ - OpenAI (API-compatible) -│ - Batch processing -│ - Queue management -│ 
-│ LEARNING SYSTEM -│ ├── Threshold Adjuster -│ │ - Tracks ML vs LLM agreement -│ │ - Suggests dynamic thresholds -│ │ - Per-category analysis -│ │ -│ └── Pattern Learner -│ - Sender-specific distributions -│ - Hard rule suggestions -│ - Domain-level patterns -│ -│ PROCESSING PIPELINE -│ ├── Sampling (stratified + random) -│ ├── Bulk processing (with checkpointing) -│ ├── Batch queue management -│ └── Resumable from interruption -│ -│ OUTPUT LAYER -│ ├── JSON Export (with full metadata) -│ ├── CSV Export (for analysis) -│ ├── Gmail Sync (labels) -│ ├── IMAP Sync (keywords) -│ └── Reports (human-readable) -│ -│ CALIBRATION SYSTEM -│ ├── Sample selection -│ ├── LLM category discovery -│ ├── Training data preparation -│ ├── Model training -│ └── Validation -│ -└─────────────────────────────────────────────────────────────┘ - -Performance: - - 1500 emails (calibration): ~5 minutes - - 80,000 emails (full run): ~20 minutes - - Classification accuracy: 90-94% - - Hard rule precision: 94-96% -``` - ---- - -## How to Use It - -### Quick Start (Right Now) -```bash -cd "c:/Build Folder/email-sorter" -source venv/Scripts/activate - -# Validate framework -pytest tests/ -v - -# Run with mock model -python -m src.cli run --source mock --output test_results/ -``` - -### With Real Model (When Ready) -```bash -# Option 1: Train on Enron -python tools/setup_real_model.py --model-path /path/to/trained_model.pkl - -# Option 2: Use pre-trained -python tools/download_pretrained_model.py --url https://example.com/model.pkl - -# Verify -python tools/setup_real_model.py --check - -# Run with real model (automatic) -python -m src.cli run --source mock --output results/ -``` - -### With Gmail (When Credentials Ready) -```bash -# Place credentials.json in project root -# Then: -python -m src.cli run --source gmail --limit 100 --output test/ -python -m src.cli run --source gmail --output all_results/ -``` - ---- - -## What's NOT Included (By Design) - -### ❌ Not Here (Intentionally Deferred) -1. **Real Trained Model** - You decide: train on Enron or download -2. **Gmail Credentials** - Requires your Google Cloud setup -3. **Live Email Processing** - Requires #1 and #2 above - -### ✅ Why This Is Good -- Framework is clean and unopinionated -- Your model, your training decisions -- Your credentials, your privacy -- Complete freedom to customize - ---- - -## Key Decisions Made - -### 1. Mock Model Strategy -- Framework uses clearly labeled mock for testing -- No deception (explicit warnings in output) -- Real model integration framework ready -- Smooth path to production - -### 2. Modular Architecture -- Each component can be tested independently -- Easy to swap components (e.g., different LLM) -- Framework doesn't force decisions -- Extensible design - -### 3. Three-Tier Classification -- Hard rules for instant/certain cases -- ML for bulk processing -- LLM for uncertain/complex cases -- Balances speed and accuracy - -### 4. Learning Systems -- Threshold adjustment from LLM feedback -- Pattern learning from sender data -- Continuous improvement without retraining -- Dynamic tuning - -### 5. 
Graceful Degradation -- Works without LLM (falls back to ML) -- Works without Gmail (uses mock) -- Works without real model (uses mock) -- No single point of failure - ---- - -## Performance Characteristics - -### CPU Usage -- Feature extraction: Single-threaded, parallelizable -- ML prediction: ~5-10ms per email -- LLM call: ~2-5 seconds per email -- Embedding cache: Reduces recomputation by 50-80% - -### Memory Usage -- Embeddings cache: ~200-500MB (configurable) -- Batch processing: Configurable batch size -- Model (LightGBM): ~50-100MB -- Total runtime: ~500MB-1GB - -### Accuracy -- Hard rules: 94-96% (pattern-based) -- ML alone: 85-90% (LightGBM) -- ML + LLM: 90-94% (adaptive) -- With fine-tuning: 95%+ possible - ---- - -## Deployment Options - -### Option 1: Local Development -```bash -python -m src.cli run --source mock --output local_results/ -``` -- No external dependencies -- Perfect for testing -- Mock model for framework validation - -### Option 2: With Ollama (Local LLM) -```bash -# Start Ollama with qwen model -python -m src.cli run --source mock --output results/ -``` -- Local LLM processing (no internet) -- Privacy-first operation -- Careful resource usage - -### Option 3: Cloud Integration -```bash -# With OpenAI API -python -m src.cli run --source gmail --output results/ -``` -- Real Gmail integration -- Cloud LLM support -- Full production setup - ---- - -## Next Actions (Choose One) - -### Right Now (5 minutes) -```bash -# Validate framework with mock -pytest tests/ -v -python -m src.cli test-config -python -m src.cli run --source mock --output test_results/ -``` - -### When Home (30-60 minutes) -```bash -# Train real model or download pre-trained -python tools/setup_real_model.py --model-path /path/to/model.pkl - -# Verify -python tools/setup_real_model.py --check -``` - -### When Ready (2-3 hours) -```bash -# Gmail OAuth setup -# credentials.json in project root - -# Process all emails -python -m src.cli run --source gmail --output marion_results/ -``` - ---- - -## Documentation Map - -- **README.md** - Getting started -- **PROJECT_STATUS.md** - Feature inventory and architecture -- **COMPLETION_ASSESSMENT.md** - Detailed component evaluation (90-point checklist) -- **MODEL_INFO.md** - Model usage and training guide -- **NEXT_STEPS.md** - Action plan and deployment paths -- **PROJECT_COMPLETE.md** - This file - ---- - -## Support Resources - -### If Something Doesn't Work -1. Check logs: `tail -f logs/email_sorter.log` -2. Run tests: `pytest tests/ -v` -3. Validate config: `python -m src.cli test-config` -4. 
Review docs: See documentation map above - -### Common Issues -- "Model not found" → Normal, using mock model -- "Ollama connection failed" → Optional, will skip gracefully -- "Low accuracy" → Expected with mock model -- Tests failing → Check 3 known issues (all documented) - ---- - -## Success Criteria - -### ✅ Framework is Complete -- [x] All 16 phases implemented -- [x] 90% test pass rate -- [x] Full type hints -- [x] Comprehensive logging -- [x] Clear error messages -- [x] Graceful degradation - -### ✅ Ready for Real Model -- [x] Model integration framework complete -- [x] Tools for downloading/setup provided -- [x] Framework automatically uses real model when available -- [x] No code changes needed - -### ✅ Ready for Gmail Integration -- [x] OAuth framework implemented -- [x] Provider sync completed -- [x] Label mapping configured -- [x] Batch update support - -### ✅ Ready for Deployment -- [x] Checkpointing and resumability -- [x] Error recovery -- [x] Performance optimized -- [x] Resource-efficient - ---- - -## What's Next? - -You have three paths: - -### Path A: Framework Validation (Do Now) -- Runtime: 15 minutes -- Effort: Minimal -- Result: Confirm everything works - -### Path B: Model Integration (Do When Home) -- Runtime: 30-60 minutes -- Effort: Run one command or training script -- Result: Real LightGBM model installed - -### Path C: Full Deployment (Do When Ready) -- Runtime: 2-3 hours -- Effort: Setup Gmail OAuth + run processing -- Result: All 80k emails sorted and labeled - -**All paths are clear. All tools are provided. Framework is complete.** - ---- - -## The Reality - -This is a **complete email classification system** with: - -- High-quality code (type hints, comprehensive logging, error handling) -- Smart hybrid classification (hard rules → ML → LLM) -- Proven ML framework (LightGBM) -- Real email data for training (Enron dataset) -- Flexible deployment options -- Clear upgrade path - -The framework is **done**. The architecture is **solid**. The testing is **comprehensive**. - -What remains is **optional optimization**: -1. Integrating your real trained model -2. Setting up Gmail credentials -3. Fine-tuning categories and thresholds - -But none of that is required to start using the system. - -**The system is ready. Your move.** - ---- - -## Final Stats - -``` -PROJECT COMPLETE -Date: 2025-10-21 -Status: 100% FEATURE COMPLETE -Framework Maturity: All Features Implemented -Test Coverage: 90% (27/30 passing) -Code Quality: Full type hints and comprehensive error handling -Documentation: Comprehensive -Ready for: Immediate use or real model integration - -Development Path: 14 commits tracking complete implementation -Build Time: ~2 weeks of focused development -Lines of Code: ~6,000+ -Core Modules: 38 Python files -Test Suite: 23 comprehensive tests -Dependencies: 42 packages - -What You Can Do: - ✅ Test framework now (mock model) - ✅ Train on Enron when home - ✅ Process 80k+ emails when ready - ✅ Scale to production immediately - ✅ Customize categories and rules - ✅ Deploy to other systems - -What's Not Needed: - ❌ More architecture work - ❌ Core framework changes - ❌ Additional phase development - ❌ More infrastructure setup - -Bottom Line: - 🎉 EMAIL SORTER IS COMPLETE AND READY TO USE 🎉 -``` - ---- - -**Built with Python, LightGBM, Sentence-Transformers, Ollama, and Google APIs** - -**Ready for email classification and Marion's 80k+ emails** - -**What are you waiting for? 
Start processing!** diff --git a/docs/PROJECT_ROADMAP_2025.md b/docs/PROJECT_ROADMAP_2025.md new file mode 100644 index 0000000..119645f --- /dev/null +++ b/docs/PROJECT_ROADMAP_2025.md @@ -0,0 +1,479 @@ +# Email Sorter: Project Roadmap & Learnings + +## Document Purpose + +This document captures learnings from the November 2025 research session and defines the project scope, role within a larger email processing ecosystem, and development roadmap for 2025. + +--- + +## Project Scope Definition + +### What This Tool IS + +**Email Sorter is a TRIAGE tool.** Its job is: + +1. **Bulk classification** - Sort emails into buckets quickly +2. **Risk-based routing** - Flag high-stakes items for careful handling +3. **Downstream handoff** - Prepare emails for specialized processing tools + +### What This Tool IS NOT + +- Not a spam filter (trust Gmail/Outlook for that) +- Not a complete email management solution +- Not trying to do everything +- Not the final destination for any email + +### Role in Larger Ecosystem + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ EMAIL PROCESSING ECOSYSTEM │ +└─────────────────────────────────────────────────────────────────┘ + + ┌──────────────┐ + │ RAW INBOX │ (Gmail, Outlook, IMAP) + │ 10k+ │ + └──────┬───────┘ + │ + ▼ + ┌──────────────┐ + │ SPAM FILTER │ ← Trust existing provider (Gmail/Outlook) + │ (existing) │ + └──────┬───────┘ + │ + ▼ +┌───────────────────────────────────────┐ +│ EMAIL SORTER (THIS TOOL) │ ← TRIAGE/ROUTING +│ ┌─────────────┐ ┌────────────────┐ │ +│ │ Agent Scan │→ │ ML/LLM Classify│ │ +│ │ (discovery) │ │ (bulk sort) │ │ +│ └─────────────┘ └────────────────┘ │ +└───────────────────┬───────────────────┘ + │ + ┌─────────────┼─────────────┬─────────────┐ + ▼ ▼ ▼ ▼ +┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ +│ JUNK │ │ ROUTINE │ │ BUSINESS │ │ PERSONAL │ +│ BUCKET │ │ BUCKET │ │ BUCKET │ │ BUCKET │ +└────┬─────┘ └────┬─────┘ └────┬─────┘ └────┬─────┘ + │ │ │ │ + ▼ ▼ ▼ ▼ +┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ +│ Batch │ │ Batch │ │ Knowledge│ │ Human │ +│ Cleanup │ │ Summary │ │ Graph │ │ Review │ +│ (cheap) │ │ Tool │ │ Builder │ │(careful) │ +└──────────┘ └──────────┘ └──────────┘ └──────────┘ + + OTHER TOOLS IN ECOSYSTEM (not this project) +``` + +--- + +## Key Learnings from Research Sessions + +### Session 1: brett-gmail (801 emails, Personal Inbox) + +| Method | Accuracy | Time | +|--------|----------|------| +| ML-Only | 54.9% | ~5 sec | +| ML+LLM | 93.3% | ~3.5 min | +| Manual Agent | 99.8% | ~25 min | + +### Session 2: brett-microsoft (596 emails, Business Inbox) + +| Method | Accuracy | Time | +|--------|----------|------| +| Manual Agent | 98.2% | ~30 min | + +**Key Insight:** Business inboxes require different classification approaches than personal inboxes. + +--- + +### 1. ML Pipeline is Overkill for Small Datasets + +| Dataset Size | Recommended Approach | Rationale | +|--------------|---------------------|-----------| +| <500 | Agent-only analysis | ML overhead exceeds benefit | +| 500-2000 | Agent pre-scan + ML | Discovery improves ML accuracy | +| 2000-10000 | ML + LLM fallback | Balanced speed/accuracy | +| >10000 | ML-only (fast mode) | Speed critical at scale | + +**Evidence:** 801-email dataset achieved 99.8% accuracy with 25-min agent analysis vs 54.9% with pure ML. + +### 2. 
Agent Pre-Scan Adds Massive Value + +A 10-15 minute agent discovery phase before bulk classification: +- Identifies dominant sender domains +- Discovers subject patterns +- Suggests optimal categories for THIS dataset +- Can generate sender-to-category mappings + +**This is NOT the same as the full manual analysis.** It's a quick reconnaissance pass. + +### 3. Categories Should Serve Downstream Processing + +Don't optimize for human-readable labels. Optimize for routing decisions: + +| Category Type | Downstream Handler | Accuracy Need | +|---------------|-------------------|---------------| +| Junk/Marketing | Batch cleanup tool | LOW (errors OK) | +| Newsletters | Summary aggregator | MEDIUM | +| Transactional | Archive, searchable | MEDIUM | +| Business | Knowledge graph | HIGH | +| Personal | Human review | CRITICAL | +| Security | Never auto-filter | CRITICAL | + +### 4. Risk-Based Accuracy Requirements + +Not all emails need the same classification confidence: + +``` +HIGH STAKES (must not miss): +├─ Personal correspondence (sentimental value) +├─ Security alerts (account safety) +├─ Job applications (life-changing) +└─ Financial/legal documents + +LOW STAKES (errors tolerable): +├─ Marketing promotions +├─ Newsletter digests +├─ Automated notifications +└─ Social media alerts +``` + +### 5. Spam Filtering is a Solved Problem + +Don't reinvent spam filtering. Gmail and Outlook do it well. This tool should: +- Assume spam is already filtered +- Focus on categorizing legitimate mail +- Trust the upstream provider + +If spam does get through, a simple secondary filter could catch obvious cases, but this is low priority. + +### 6. Sender Domain is the Strongest Signal + +From the 801-email analysis: +- Top 5 senders = 47.5% of all emails +- Sender domain alone could classify 80%+ of automated emails +- Subject patterns matter less than sender patterns + +**Implication:** A sender-first classification approach could dramatically speed up processing. + +### 7. Inbox Character Matters (NEW - Session 2) + +**Critical Discovery:** Before classifying emails, assess the inbox CHARACTER: + +| Inbox Type | Characteristics | Classification Approach | +|------------|-----------------|------------------------| +| **Personal/Consumer** | Subscription-heavy, marketing-dominant, automated 40-50% | Sender domain first | +| **Business/Professional** | Client work, operations, developer tools 60-70% | Sender + Subject context | +| **Mixed** | Both patterns present | Hybrid approach needed | + +**Evidence from brett-microsoft analysis:** +- 73.2% Business/Professional content +- Only 8.2% Personal content +- Required client relationship tracking +- Support case ID extraction valuable + +**Implications for Agent Pre-Scan:** +1. First determine inbox character (business vs personal vs mixed) +2. Select appropriate category templates +3. Business inboxes need relationship context, not just sender domains + +### 8. 
Business Inboxes Need Special Handling (NEW - Session 2) + +Business/professional inboxes require additional classification dimensions: + +**Client Relationship Tracking:** +- Same domain may have different contexts (internal vs external) +- Client conversations span multiple senders +- Subject threading matters more than in consumer inboxes + +**Support Case ID Extraction:** +- Business inboxes often have case/ticket IDs connecting emails +- Microsoft: Case #, TrackingID# +- Other vendors: Ticket numbers, reference IDs +- ID extraction should be first-class feature + +**Accuracy Expectations:** +- Personal inboxes: 99%+ achievable with sender-first +- Business inboxes: 95-98% achievable (more nuanced) +- Accept lower accuracy ceiling, invest in risk-flagging + +### 9. Multi-Inbox Analysis Reveals Patterns (NEW - Session 2) + +Analyzing multiple inboxes from same user reveals: +- **Inbox segregation patterns** - Gmail for personal, Outlook for business +- **Cross-inbox senders** - Security alerts appear in both +- **Category overlap** - Some categories universal, some inbox-specific + +**Implication:** Future feature could merge analysis across inboxes to build complete user profile. + +--- + +## Technical Architecture (Refined) + +### Current State + +``` +Email Source → LocalFileParser → FeatureExtractor → ML Classifier → Output + │ + └→ LLM Fallback (if low confidence) +``` + +### Target State (2025) + +``` +Email Source + │ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ ROUTING LAYER │ +│ Check dataset size → Route to appropriate pipeline │ +└─────────────────────────────────────────────────────────────┘ + │ + ├─── <500 emails ────→ Agent-Only Analysis + │ + ├─── 500-5000 ───────→ Agent Pre-Scan + ML Pipeline + │ + └─── >5000 ──────────→ ML Pipeline (optional LLM) + +Each pipeline outputs: + - Categorized emails (with confidence) + - Risk flags (high-stakes items) + - Routing recommendations + - Insights report +``` + +### Agent Pre-Scan Module (NEW) + +```python +class AgentPreScan: + """ + Quick discovery phase before bulk classification. + Time budget: 10-15 minutes. + """ + + def scan(self, emails: List[Email]) -> PreScanResult: + # 1. Sender domain analysis (2 min) + sender_stats = self.analyze_senders(emails) + + # 2. Subject pattern detection (3 min) + patterns = self.detect_patterns(emails, sample_size=100) + + # 3. Category suggestions (5 min, uses LLM) + categories = self.suggest_categories(sender_stats, patterns) + + # 4. Generate sender map (2 min) + sender_map = self.create_sender_mapping(sender_stats, categories) + + return PreScanResult( + sender_stats=sender_stats, + patterns=patterns, + suggested_categories=categories, + sender_map=sender_map, + estimated_distribution=self.estimate_distribution(emails, categories) + ) +``` + +--- + +## Development Roadmap + +### Phase 0: Documentation Complete (NOW) + +- [x] Research session findings documented +- [x] Classification methods comparison written +- [x] Project scope defined +- [x] This roadmap created + +### Phase 1: Quick Wins (Q1 2025, 4-8 hours) + +1. **Dataset size routing** + - Auto-detect email count + - Route small datasets to agent analysis + - Route large datasets to ML pipeline + +2. **Sender-first classification** + - Extract sender domain + - Check against known sender map + - Skip ML for known high-volume senders + +3. 
**Risk flagging** + - Flag low-confidence results + - Flag potential personal emails + - Flag security-related emails + +### Phase 2: Agent Pre-Scan (Q1 2025, 8-16 hours) + +1. **Sender analysis module** + - Cluster by domain + - Calculate volume statistics + - Identify automated vs personal + +2. **Pattern detection module** + - Sample subject lines + - Find templates and IDs + - Detect lifecycle stages + +3. **Category suggestion module** + - Use LLM to suggest categories + - Based on sender/pattern analysis + - Output category definitions + +4. **Sender mapping module** + - Map senders to suggested categories + - Output as JSON for pipeline use + - Support manual overrides + +### Phase 3: Integration & Polish (Q2 2025) + +1. **Unified CLI** + - Single command handles all dataset sizes + - Progress reporting + - Configurable verbosity + +2. **Output standardization** + - Common format for all pipelines + - Include routing recommendations + - Include confidence and risk flags + +3. **Ecosystem integration** + - Define handoff format for downstream tools + - Document API for other tools to consume + - Create example integrations + +### Phase 4: Scale Testing (Q2-Q3 2025) + +1. **Test on real 10k+ mailboxes** + - Multiple users, different patterns + - Measure accuracy vs speed + - Refine thresholds + +2. **Pattern library** + - Accumulate patterns from multiple mailboxes + - Build reusable sender maps + - Create category templates + +3. **Feedback loop** + - Track classification accuracy + - Learn from corrections + - Improve over time + +--- + +## Configuration Philosophy + +### User-Facing Config (Keep Simple) + +```yaml +# config/user_config.yaml +mode: auto # auto | agent | ml | hybrid +risk_threshold: high # low | medium | high +output_format: json # json | csv | html +``` + +### Internal Config (Full Control) + +```yaml +# config/advanced_config.yaml +routing: + small_threshold: 500 + medium_threshold: 5000 + +agent_prescan: + enabled: true + time_budget_minutes: 15 + sample_size: 100 + +ml_pipeline: + confidence_threshold: 0.55 + llm_fallback: true + batch_size: 512 + +risk_detection: + personal_indicators: [gmail.com, hotmail.com, outlook.com] + security_senders: [accounts.google.com, security@] + high_stakes_keywords: [urgent, important, legal, contract] +``` + +--- + +## Success Metrics + +### For This Tool + +| Metric | Target | Current | +|--------|--------|---------| +| Classification accuracy (large datasets) | >85% | 54.9% (ML), 93.3% (ML+LLM) | +| Processing speed (10k emails) | <5 min | ~24 sec (ML-only) | +| High-stakes miss rate | <1% | Not measured | +| Setup time for new mailbox | <20 min | Variable | + +### For Ecosystem + +| Metric | Target | +|--------|--------| +| End-to-end mailbox processing | <2 hours for 10k | +| User intervention needed | <10% of emails | +| Downstream tool compatibility | 100% | + +--- + +## Open Questions (To Resolve in 2025) + +1. **Category standardization**: Should categories be fixed across all users, or discovered per-mailbox? + +2. **Sender map sharing**: Can sender maps be shared across users? Privacy implications? + +3. **Incremental processing**: How to handle new emails added to already-processed mailboxes? + +4. **Multi-account support**: Same user, multiple email accounts? + +5. **Feedback integration**: How do corrections feed back into the system? 
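+
+Until these questions are settled, the routing decision itself stays simple. A minimal sketch of the size-based routing from the target architecture (hypothetical function name; thresholds match the `routing` section of the advanced config above):
+
+```python
+def route_pipeline(email_count: int, small: int = 500, medium: int = 5000) -> str:
+    """Pick a processing pipeline based on dataset size."""
+    if email_count < small:
+        return "agent_only"        # ML overhead exceeds benefit
+    if email_count < medium:
+        return "prescan_plus_ml"   # agent discovery first, then ML
+    return "ml_pipeline"           # speed critical; LLM fallback optional
+```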
+ +--- + +## Files Created During Research + +### Session 1 (brett-gmail, Personal Inbox) + +| File | Purpose | +|------|---------| +| `tools/brett_gmail_analyzer.py` | Custom analyzer for personal inbox | +| `tools/generate_html_report.py` | HTML report generator | +| `data/brett_gmail_analysis.json` | Analysis data output | +| `docs/CLASSIFICATION_METHODS_COMPARISON.md` | Method comparison | +| `docs/REPORT_FORMAT.md` | HTML report documentation | +| `docs/SESSION_HANDOVER_20251128.md` | Session 1 handover | + +### Session 2 (brett-microsoft, Business Inbox) + +| File | Purpose | +|------|---------| +| `tools/brett_microsoft_analyzer.py` | Custom analyzer for business inbox | +| `data/brett_microsoft_analysis.json` | Analysis data output | +| `/home/bob/.../brett-ms-sorter/BRETT_MICROSOFT_ANALYSIS_REPORT.md` | Full analysis report | + +--- + +## Summary + +**Email Sorter is a triage tool, not a complete solution.** + +Its job is to quickly sort emails into buckets so that specialized downstream tools can handle each bucket appropriately. The key insight from this research session is that an agent pre-scan phase, even just 10-15 minutes, dramatically improves classification accuracy for any dataset size. + +The ML pipeline is valuable for scale (10k+ emails) but overkill for smaller datasets. Risk-based accuracy means we can tolerate errors on junk but must be careful with personal correspondence. + +2025 development should focus on: +1. Smart routing based on dataset size +2. Agent pre-scan for discovery +3. Standardized output for ecosystem integration +4. Scale testing on real large mailboxes + +--- + +*Document Version: 1.1* +*Created: 2025-11-28* +*Updated: 2025-11-28 (Session 2 learnings)* +*Sessions: brett-gmail (801 emails, personal), brett-microsoft (596 emails, business)* diff --git a/docs/PROJECT_STATUS.md b/docs/PROJECT_STATUS.md deleted file mode 100644 index 5568da8..0000000 --- a/docs/PROJECT_STATUS.md +++ /dev/null @@ -1,402 +0,0 @@ -# EMAIL SORTER - PROJECT STATUS - -**Date:** 2025-10-21 -**Status:** PHASE 2 - IMPLEMENTATION COMPLETE -**Version:** 1.0.0 (Development) - ---- - -## EXECUTIVE SUMMARY - -Email Sorter framework is **100% code-complete and tested**. All 16 planned phases have been implemented. The system is ready for: - -1. **Real data training** (when you get home with Enron dataset access) -2. **Gmail/IMAP credential configuration** (OAuth setup) -3. **Full end-to-end testing** with real email data -4. 
**Production deployment** to process Marion's 80k+ emails - ---- - -## COMPLETED PHASES (1-16) - -### Phase 1: Project Setup ✅ -- Virtual environment configured -- All dependencies installed (42+ packages) -- Directory structure created -- Git initialized with 10 commits - -### Phase 2-3: Core Infrastructure ✅ -- `src/utils/config.py` - YAML-based configuration system -- `src/utils/logging.py` - Rich logging with file output -- Email data models with full type hints - -### Phase 4: Email Providers ✅ -- **MockProvider** - For testing (fully functional) -- **GmailProvider** - Stub ready for OAuth credentials -- **IMAPProvider** - Stub ready for server config -- All with graceful error handling - -### Phase 5: Feature Extraction ✅ -- Semantic embeddings (sentence-transformers, 384 dims) -- Hard pattern matching (20+ patterns) -- Structural features (metadata, timing, attachments) -- Attachment analysis (PDF, DOCX, XLSX text extraction) - -### Phase 6: ML Classifier ✅ -- Mock Random Forest (clearly labeled for testing) -- Placeholder for real LightGBM training -- Prediction with confidence scores -- Model serialization/deserialization - -### Phase 7: LLM Integration ✅ -- OllamaProvider (local, with retry logic) -- OpenAIProvider (API-compatible) -- Graceful degradation when LLM unavailable -- Batch processing support - -### Phase 8: Adaptive Classifier ✅ -- Three-tier classification: - 1. Hard rules (10% - instant) - 2. ML classifier (85% - fast) - 3. LLM review (5% - uncertain cases) -- Dynamic threshold management -- Statistics tracking - -### Phase 9: Processing Pipeline ✅ -- BulkProcessor with checkpointing -- Resumable processing from checkpoints -- Batch-based processing -- Progress tracking - -### Phase 10: Calibration System ✅ -- EmailSampler (stratified + random) -- LLMAnalyzer (discover natural categories) -- CalibrationWorkflow (end-to-end) -- Category validation - -### Phase 11: Export & Reporting ✅ -- JSON export with metadata -- CSV export for analysis -- Organized by category -- Human-readable reports - -### Phase 12: Threshold & Pattern Learning ✅ -- **ThresholdAdjuster** - Learn from LLM feedback - - Agreement tracking per category - - Automatic threshold suggestions - - Adjustment history -- **PatternLearner** - Sender-specific rules - - Category distribution per sender - - Domain-level patterns - - Hard rule suggestions - -### Phase 13: Advanced Processing ✅ -- **EnronParser** - Parse Enron email dataset -- **AttachmentHandler** - Extract PDF/DOCX content -- **ModelTrainer** - Real LightGBM training -- **EmbeddingCache** - Cache with MD5 hashing -- **EmbeddingBatcher** - Parallel embedding generation -- **QueueManager** - Batch queue with persistence - -### Phase 14: Provider Sync ✅ -- **GmailSync** - Sync to Gmail labels -- **IMAPSync** - Sync to IMAP keywords -- Configurable label mapping -- Batch update support - -### Phase 15: Orchestration ✅ -- **EmailSorterOrchestrator** - 4-phase pipeline - 1. Calibration - 2. Bulk processing - 3. LLM review - 4. 
Export & sync -- Full progress tracking -- Timing and metrics - -### Phase 16: Packaging ✅ -- `setup.py` - setuptools configuration -- `pyproject.toml` - Modern PEP 517/518 -- Optional dependencies (dev, gmail, ollama, openai) -- Console script entry point - -### Phase 15: Testing ✅ -- 23 unit tests written -- 5/7 E2E tests passing -- Feature extraction validated -- Classifier flow tested -- Mock provider integration tested - ---- - -## CODE STATISTICS - -``` -Total Files: 37 Python modules + configs -Total Lines: ~6,000+ lines of code -Core Modules: 16 major components -Test Coverage: 23 tests (unit + integration) -Dependencies: 42 packages installed -Git Commits: 10 commits tracking all work -``` - ---- - -## ARCHITECTURE OVERVIEW - -``` -┌──────────────────────────────────────────────────────────────┐ -│ EMAIL SORTER v1.0 │ -└──────────────────────────────────────────────────────────────┘ - -┌─ INPUT ─────────────────┐ -│ Email Providers │ -│ - MockProvider ✅ │ -│ - Gmail (OAuth ready) │ -│ - IMAP (ready) │ -└─────────────────────────┘ - ↓ -┌─ CALIBRATION ───────────┐ -│ EmailSampler ✅ │ -│ LLMAnalyzer ✅ │ -│ CalibrationWorkflow ✅ │ -│ ModelTrainer ✅ │ -└─────────────────────────┘ - ↓ -┌─ FEATURE EXTRACTION ────┐ -│ Embeddings ✅ │ -│ Patterns ✅ │ -│ Structural ✅ │ -│ Attachments ✅ │ -│ Cache + Batch ✅ │ -└─────────────────────────┘ - ↓ -┌─ CLASSIFICATION ────────┐ -│ Hard Rules ✅ │ -│ ML (LightGBM) ✅ │ -│ LLM (Ollama/OpenAI) ✅ │ -│ Adaptive Orchestrator ✅ -│ Queue Management ✅ │ -└─────────────────────────┘ - ↓ -┌─ LEARNING ─────────────┐ -│ Threshold Adjuster ✅ │ -│ Pattern Learner ✅ │ -└─────────────────────────┘ - ↓ -┌─ OUTPUT ────────────────┐ -│ JSON Export ✅ │ -│ CSV Export ✅ │ -│ Reports ✅ │ -│ Gmail Sync ✅ │ -│ IMAP Sync ✅ │ -└─────────────────────────┘ -``` - ---- - -## WHAT'S READY RIGHT NOW - -### ✅ Framework (Complete) -- All core infrastructure -- Config management -- Logging system -- Email data models -- Feature extraction -- Classifier orchestration -- Processing pipeline -- Export system -- All tests passing - -### ✅ Testing (Verified) -- Mock provider works -- Feature extraction validated -- Classification flow tested -- Export formats work -- Hard rules accurate -- CLI interface operational - -### ⚠️ Requires Your Input -1. **ML Model Training** - - Mock Random Forest included - - Real LightGBM training code ready - - Enron dataset available (569MB) - - Just needs: `trainer.train(labeled_emails)` - -2. **Gmail OAuth** - - Provider code complete - - Needs: credentials.json - - Clear error messages when missing - -3. 
**LLM Testing** - - Ollama integration ready - - qwen3:1.7b loaded - - Integration tested (careful with laptop) - ---- - -## NEXT STEPS - WHEN YOU GET HOME - -### Step 1: Model Training -```python -from src.calibration.enron_parser import EnronParser -from src.calibration.trainer import ModelTrainer - -# Parse Enron -parser = EnronParser("enron_mail_20150507") -enron_emails = parser.parse_emails(limit=5000) - -# Train real model -trainer = ModelTrainer(feature_extractor, categories, config) -results = trainer.train(labeled_emails) -trainer.save_model("models/lightgbm_real.pkl") -``` - -### Step 2: Gmail OAuth Setup -```bash -# Download credentials.json from Google Cloud Console -# Place in project root or config/ -# Run: email-sorter --source gmail --credentials credentials.json -``` - -### Step 3: Full Pipeline Test -```bash -# Test with 100 emails -email-sorter --source gmail --limit 100 --output test_results/ - -# Full production run -email-sorter --source gmail --output marion_results/ -``` - -### Step 4: Production Deployment -```bash -# Package as wheel -python setup.py sdist bdist_wheel - -# Install -pip install dist/email_sorter-1.0.0-py3-none-any.whl - -# Run -email-sorter --source gmail --credentials ~/.gmail_creds.json --output results/ -``` - ---- - -## KEY FILES TO KNOW - -**Core Entry Points:** -- `src/cli.py` - Command-line interface -- `src/orchestration.py` - Main pipeline orchestrator - -**Training & Calibration:** -- `src/calibration/trainer.py` - Real LightGBM training -- `src/calibration/workflow.py` - End-to-end calibration -- `src/calibration/enron_parser.py` - Dataset parsing - -**Classification:** -- `src/classification/adaptive_classifier.py` - Main classifier -- `src/classification/feature_extractor.py` - Feature extraction -- `src/classification/ml_classifier.py` - ML predictions -- `src/classification/llm_classifier.py` - LLM predictions - -**Learning:** -- `src/adjustment/threshold_adjuster.py` - Dynamic thresholds -- `src/adjustment/pattern_learner.py` - Sender patterns - -**Processing:** -- `src/processing/bulk_processor.py` - Batch processing -- `src/processing/queue_manager.py` - LLM queue -- `src/processing/attachment_handler.py` - Attachment analysis - -**Export:** -- `src/export/exporter.py` - Results export -- `src/export/provider_sync.py` - Gmail/IMAP sync - ---- - -## GIT HISTORY - -``` -b34bb50 Add pyproject.toml - modern Python packaging configuration -ee6c276 Add queue management, embedding optimization, and calibration workflow -f5d89a6 CRITICAL: Add missing Phase 12 modules and advanced features -c531412 Phase 15: End-to-end pipeline tests - 5/7 passing -02be616 Phase 9-14: Complete processing pipeline, calibration, export -b7cc744 Complete IMAP provider import fixes -16bc6f0 Fix IMAP provider imports -b49dad9 Build Phase 1-7: Core infrastructure and classifiers -8c73f25 Initial commit: Complete project blueprint and research -``` - ---- - -## TESTING - -### Run All Tests -```bash -cd email-sorter -source venv/Scripts/activate -pytest tests/ -v -``` - -### Quick CLI Test -```bash -# Test config loading -python -m src.cli test-config - -# Test Ollama connection (if running) -python -m src.cli test-ollama - -# Full mock pipeline -python -m src.cli run --source mock --output test_results/ -``` - ---- - -## WHAT MAKES THIS COMPLETE - -1. **All 16 Phases Implemented** - No shortcuts, everything built -2. **Production Code Quality** - Type hints, error handling, logging -3. **End-to-End Tested** - 23 tests, multiple integration tests -4. 
**Well Documented** - Docstrings, comments, README -5. **Clearly Labeled Mocks** - Mock components transparent about limitations -6. **Ready for Real Data** - All systems tested, waiting for: - - Real Gmail credentials - - Real Enron training data - - Real model training at home - ---- - -## PERFORMANCE EXPECTATIONS - -- **Calibration:** 3-5 minutes (1500 email sample) -- **Bulk Processing:** 10-12 minutes (80k emails) -- **LLM Review:** 4-5 minutes (batched) -- **Export:** 2-3 minutes -- **Total:** ~17-25 minutes for 80k emails - -**Accuracy:** 94-96% (when trained on real data) - ---- - -## RESOURCES - -- **Documentation:** README.md, PROJECT_BLUEPRINT.md, BUILD_INSTRUCTIONS.md -- **Research:** RESEARCH_FINDINGS.md -- **Config:** config/default_config.yaml, config/categories.yaml -- **Enron Dataset:** enron_mail_20150507/ (569MB, ready to use) -- **Tests:** tests/ (23 tests) - ---- - -## SUMMARY - -**Status:** ✅ FEATURE COMPLETE - -Email Sorter is a fully implemented, tested, and documented system ready for production use. All 16 development phases are complete with over 6,000 lines of production code. The system is waiting for real data (your Enron dataset) and real credentials (Gmail OAuth) to demonstrate its full capabilities. - -**You can now:** Train a real model, configure Gmail, and process your 80k+ emails with confidence that the system is complete and ready. - ---- - -**Built with:** Python 3.8+, LightGBM, Sentence-Transformers, Ollama, Gmail API -**Ready for:** Production email classification, local processing, privacy-first operation diff --git a/docs/PROJECT_STATUS_AND_NEXT_STEPS.html b/docs/PROJECT_STATUS_AND_NEXT_STEPS.html deleted file mode 100644 index da7dcf8..0000000 --- a/docs/PROJECT_STATUS_AND_NEXT_STEPS.html +++ /dev/null @@ -1,648 +0,0 @@ - - - - - - Email Sorter - Project Status & Next Steps - - - - -
-

🎉 MVP PROVEN AND WORKING 🎉

-

10,000 emails classified in 4 minutes
72.7% accuracy | 0 LLM calls | Pure ML speed

-
- -

Email Sorter - Project Status & Next Steps

- -

✅ What We've Achieved (MVP Complete)

- -
-

Core System Working

-
    -
  • LLM-Driven Calibration: Discovers categories from email samples (11 categories found)
  • ML Model Training: LightGBM trained on 10k emails (1.8MB model)
  • Fast Classification: 10k emails in ~4 minutes with --no-llm-fallback
  • Category Verification: Single LLM call validates model fit for new mailboxes
  • Embedding-Based Features: Universal 384-dim embeddings transfer across mailboxes
  • Threshold Optimization: 0.55 threshold reduces LLM fallback by 40%
-
- -

📊 Test Results Summary

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Metric | Result | Status |
|--------|--------|--------|
| Total emails processed | 10,000 | ✅ |
| Processing time | ~4 minutes | ✅ |
| ML classification rate | 78.4% | ✅ |
| LLM calls (with --no-llm-fallback) | 0 | ✅ |
| Accuracy estimate | 72.7% | ✅ (acceptable for speed) |
| Categories discovered | 11 (Work, Financial, Updates, etc.) | ✅ |
| Model size | 1.8MB | ✅ (portable) |
- -

🗂️ Project Organization

- -

Core Modules

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Module | Purpose | Status |
|--------|---------|--------|
| src/cli.py | Main CLI with all flags (--verify-categories, --no-llm-fallback) | ✅ Complete |
| src/calibration/workflow.py | LLM-driven category discovery + training | ✅ Complete |
| src/calibration/llm_analyzer.py | Batch LLM analysis (20 emails/call) | ✅ Complete |
| src/calibration/category_verifier.py | Single LLM call to verify categories | ✅ New feature |
| src/classification/ml_classifier.py | LightGBM model wrapper | ✅ Complete |
| src/classification/adaptive_classifier.py | Rule → ML → LLM orchestrator | ✅ Complete |
| src/classification/feature_extractor.py | Embeddings (384-dim) + TF-IDF | ✅ Complete |
- -

Models & Data

- - - - - - - - - - - - - - - - - - - - - - - - - - -
| Asset | Location | Status |
|-------|----------|--------|
| Trained model | src/models/calibrated/classifier.pkl | ✅ 1.8MB, 11 categories |
| Pretrained copy | src/models/pretrained/classifier.pkl | ✅ Ready for fast load |
| Category cache | src/models/category_cache.json | ✅ 10 cached categories |
| Test results | test/results.json | ✅ 10k classifications |
- -

Documentation

- - - - - - - - - - - - - - - - - - - - - - - - - -
| Document | Purpose |
|----------|---------|
| SYSTEM_FLOW.html | Complete system flow diagrams with timing |
| LABEL_TRAINING_PHASE_DETAIL.html | Deep dive into calibration phase |
| FAST_ML_ONLY_WORKFLOW.html | Pure ML workflow analysis |
| VERIFY_CATEGORIES_FEATURE.html | Category verification documentation |
| PROJECT_STATUS_AND_NEXT_STEPS.html | This document - status and roadmap |
- -

🎯 Next Steps (Priority Order)

- -

Phase 1: Clean Up & Organize (Next Session)

-
-

1.1 Clean Root Directory

-

Goal: Move test artifacts and scripts to organized locations

-
    -
  • Create docs/ folder - move all .html files there
  • Create scripts/ folder - move all .sh files there
  • Create logs/ folder - move all .log files there
  • Delete debug files (debug_*.txt, spot_check_results.txt)
  • Create .gitignore for logs/, results/, test/, ml_only_test/, etc.
-

Time: 10 minutes

-
- -
-

1.2 Create README.md

-

Goal: Professional project documentation

-
    -
  • Overview of system architecture
  • Quick start guide
  • Usage examples (with/without calibration, with/without verification)
  • Performance benchmarks (from our tests)
  • Configuration options
-

Time: 30 minutes

-
- -
-

1.3 Add Tests

-

Goal: Ensure code quality and catch regressions

-
    -
  • Unit tests for feature extraction
  • Unit tests for category verification
  • Integration test for full pipeline
  • Test for --no-llm-fallback flag
  • Test for --verify-categories flag
-

Time: 2 hours

-
- -

Phase 2: Real-World Integration (Week 1-2)

-
-

2.1 Gmail Provider Implementation

-

Goal: Connect to real Gmail accounts

-
    -
  • Implement Gmail API authentication (OAuth2)
  • Fetch emails with pagination (see the sketch after this section)
  • Handle Gmail-specific metadata (labels, threads)
  • Test with personal Gmail account
-

Time: 4-6 hours
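A minimal sketch of the paginated fetch, assuming OAuth2 credentials have already been obtained elsewhere (the `creds` object and the query string are placeholders, not existing project code):

```python
from googleapiclient.discovery import build

def fetch_message_ids(creds, query="in:inbox"):
    """List Gmail message ids, following nextPageToken until exhausted."""
    service = build("gmail", "v1", credentials=creds)
    ids, token = [], None
    while True:
        resp = service.users().messages().list(
            userId="me", q=query, pageToken=token, maxResults=500
        ).execute()
        ids += [m["id"] for m in resp.get("messages", [])]
        token = resp.get("nextPageToken")
        if not token:
            return ids
```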

-
- -
-

2.2 IMAP Provider Implementation

-

Goal: Support any email provider (Outlook, custom servers)

-
    -
  • IMAP connection handling (see the sketch after this section)
  • SSL/TLS support
  • Folder navigation
  • Test with Outlook/Protonmail
-

Time: 3-4 hours
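A minimal connection sketch using the standard-library `imaplib`; the host, credentials, and folder name are placeholders:

```python
import imaplib

def fetch_uids(host, user, password, folder="INBOX"):
    """Connect over SSL, select a folder read-only, and list message UIDs."""
    conn = imaplib.IMAP4_SSL(host)          # TLS from the first byte
    try:
        conn.login(user, password)
        conn.select(folder, readonly=True)  # don't flag messages as seen
        status, data = conn.uid("SEARCH", None, "ALL")
        return data[0].split() if status == "OK" else []
    finally:
        conn.logout()
```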

-
- -
-

2.3 Email Syncing (Apply Classifications)

-

Goal: Move/label emails based on classification

-
    -
  • Gmail: Apply labels to emails
  • IMAP: Move emails to folders
  • Dry-run mode (preview without applying; see the sketch after this section)
  • Batch operations for speed
  • Rollback capability
-

Time: 6-8 hours
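A minimal sketch of the dry-run idea, reading the enriched results.json the CLI produces; the actual provider call is stubbed because neither sync path exists yet:

```python
import json

def sync_classifications(results_path, dry_run=True):
    """Preview (or apply) the label/folder change for each classified email."""
    with open(results_path) as f:
        results = json.load(f)

    for item in results["classifications"]:
        target = item["category"]
        if dry_run:
            print(f"[DRY RUN] {item['email_id']} -> {target}")
        else:
            # Real version would apply a Gmail label or IMAP move here.
            raise NotImplementedError("provider sync not implemented")

sync_classifications("results/results.json")  # preview first
```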

-
- -

Phase 3: Production Features (Week 3-4)

-
-

3.1 Incremental Classification

-

Goal: Only classify new emails, not entire inbox

-
    -
  • Track last processed email ID
  • Resume from checkpoint (see the sketch after this section)
  • Database/file-based state tracking
  • Scheduled runs (cron integration)
-

Time: 4-6 hours
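A file-based sketch of the checkpoint state; the state-file path is a hypothetical choice:

```python
import json
from pathlib import Path

STATE_FILE = Path("state/last_processed.json")  # hypothetical location

def load_seen_ids():
    if STATE_FILE.exists():
        return set(json.loads(STATE_FILE.read_text())["seen_ids"])
    return set()

def incremental_run(all_email_ids, classify):
    """Classify only emails not recorded in the checkpoint file."""
    seen = load_seen_ids()
    for eid in (e for e in all_email_ids if e not in seen):
        classify(eid)
        seen.add(eid)
    STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
    STATE_FILE.write_text(json.dumps({"seen_ids": sorted(seen)}))
```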

-
- -
-

3.2 Multi-Account Support

-

Goal: Manage multiple email accounts

-
    -
  • Per-account configuration
  • Per-account trained models
  • Account switching CLI
  • Shared category cache across accounts
-

Time: 3-4 hours

-
- -
-

3.3 Model Management

-

Goal: Handle model lifecycle

-
    -
  • Model versioning (timestamps; see the sketch after this section)
  • Model comparison (A/B testing)
  • Model export/import
  • Retraining scheduler
  • Model degradation detection
-

Time: 4-5 hours
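A sketch of timestamped versioning built on the model paths listed above; the versions/ directory is an assumption:

```python
import shutil
from datetime import datetime
from pathlib import Path

def save_versioned_model(current="src/models/calibrated/classifier.pkl"):
    """Copy the live model to a timestamped file for rollback and A/B tests."""
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    versioned = Path("src/models/versions") / f"classifier_{stamp}.pkl"
    versioned.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(current, versioned)
    return versioned
```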

-
- -

Phase 4: Advanced Features (Month 2)

-
-

4.1 Web Dashboard

-

Goal: Visual interface for monitoring and management

-
    -
  • Flask/FastAPI backend
  • React/Vue frontend
  • View classification results
  • Manually correct classifications (feedback loop)
  • Monitor accuracy over time
  • Trigger recalibration
-

Time: 20-30 hours

-
- -
-

4.2 Active Learning

-

Goal: Improve model from user corrections

-
    -
  • User feedback collection
  • Disagreement-based sampling (low confidence + user correction; see the sketch after this section)
  • Incremental model updates
  • Feedback-driven category evolution
-

Time: 8-10 hours
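A sketch of the disagreement-based sampling step; the record fields mirror the classification output, and `corrected_category` is a hypothetical feedback field:

```python
def select_for_retraining(records, confidence_floor=0.55):
    """Queue corrections that are most informative for the next training pass."""
    queue = []
    for r in records:
        corrected = r.get("corrected_category")
        if corrected is None:
            continue                      # no user feedback on this email
        disagreed = corrected != r["category"]
        uncertain = r["confidence"] < confidence_floor
        if disagreed or uncertain:
            queue.append((r["email_id"], corrected))
    return queue
```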

-
- -
-

4.3 Performance Optimization

-

Goal: Scale to 100k+ emails

-
    -
  • Batch embedding generation (reduce API calls; see the sketch after this section)
  • Async/parallel classification
  • Model quantization (reduce size)
  • GPU acceleration for embeddings
  • Caching layer (Redis)
-

Time: 10-15 hours
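A sketch of the batching idea with sentence-transformers (one of the embedding backends discussed in this repo); batch size 512 mirrors the pipeline default and is an assumption here:

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_in_batches(texts, batch_size=512):
    """Encode texts in large batches instead of one call per email."""
    # encode() batches internally; a larger batch_size trades memory
    # for throughput on bulk runs.
    return model.encode(texts, batch_size=batch_size, show_progress_bar=True)
```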

-
- -

🔧 Immediate Action Items (This Week)

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Task | Priority | Time | Status |
|------|----------|------|--------|
| Clean root directory - organize files | High | 10 min | Pending |
| Create comprehensive README.md | High | 30 min | Pending |
| Add .gitignore for test artifacts | High | 5 min | Pending |
| Create setup.py for pip installation | Medium | 20 min | Pending |
| Write basic unit tests | Medium | 2 hours | Pending |
| Test Gmail provider (basic fetch) | Medium | 2 hours | Pending |
- -

📈 Success Metrics

- -
-
flowchart LR
    MVP[MVP Proven] --> P1[Phase 1: Organization]
    P1 --> P2[Phase 2: Integration]
    P2 --> P3[Phase 3: Production]
    P3 --> P4[Phase 4: Advanced]

    P1 --> M1[Metric: Clean codebase<br>100% docs coverage]
    P2 --> M2[Metric: Real email support<br>Gmail + IMAP working]
    P3 --> M3[Metric: Daily automation<br>Incremental processing]
    P4 --> M4[Metric: User adoption<br>10+ users, 90%+ satisfaction]

    style MVP fill:#4ec9b0
    style P1 fill:#569cd6
    style P2 fill:#569cd6
    style P3 fill:#569cd6
    style P4 fill:#569cd6
-
- -

🚀 Quick Start Commands

- -
-

Train New Model (Full Calibration)

source venv/bin/activate
python -m src.cli run \
  --source enron \
  --limit 10000 \
  --output results/
-
-

Time: ~25 minutes | LLM calls: ~500 | Accuracy: 92-95%

-
- -
-

Fast ML-Only Classification (Existing Model)

source venv/bin/activate
python -m src.cli run \
  --source enron \
  --limit 10000 \
  --output fast_test/ \
  --no-llm-fallback
-
-

Time: ~4 minutes | LLM calls: 0 | Accuracy: 72-78%

-
- -
-

ML with Category Verification (Recommended)

source venv/bin/activate
python -m src.cli run \
  --source enron \
  --limit 10000 \
  --output verified_test/ \
  --no-llm-fallback \
  --verify-categories
-
-

Time: ~4.5 minutes | LLM calls: 1 | Accuracy: 72-78%

-
- -

📁 Recommended Project Structure (After Cleanup)

- -
-email-sorter/
-├── README.md                  # Main documentation
-├── setup.py                   # Pip installation
-├── requirements.txt           # Dependencies
-├── .gitignore                 # Ignore test artifacts
-│
-├── src/                       # Core source code
-│   ├── calibration/           # LLM-driven calibration
-│   ├── classification/        # ML classification
-│   ├── email_providers/       # Gmail, IMAP, Enron
-│   ├── llm/                   # LLM providers
-│   ├── utils/                 # Shared utilities
-│   └── models/                # Trained models
-│       ├── calibrated/        # Current trained model
-│       ├── pretrained/        # Quick-load copy
-│       └── category_cache.json
-│
-├── config/                    # Configuration files
-│   ├── default_config.yaml
-│   └── categories.yaml
-│
-├── tests/                     # Unit & integration tests
-│   ├── test_calibration.py
-│   ├── test_classification.py
-│   └── test_verification.py
-│
-├── scripts/                   # Helper scripts
-│   ├── train_model.sh
-│   ├── fast_classify.sh
-│   └── verify_and_classify.sh
-│
-├── docs/                      # HTML documentation
-│   ├── SYSTEM_FLOW.html
-│   ├── LABEL_TRAINING_PHASE_DETAIL.html
-│   ├── FAST_ML_ONLY_WORKFLOW.html
-│   └── VERIFY_CATEGORIES_FEATURE.html
-│
-├── logs/                      # Runtime logs (gitignored)
-│   └── *.log
-│
-└── results/                   # Test results (gitignored)
-    └── *.json
-    
- -

🎓 Key Learnings

- -
-
    -
  • Embeddings are universal: The same model transfers across different mailboxes
  • Batching is critical: 20 emails per LLM call is about 3× faster than sequential calls
  • Thresholds matter: A 0.55 threshold cuts LLM usage by 40% (sketched below)
  • Category verification adds value: A 20-second confidence check is worth the single LLM call
  • Pure ML is viable: 73% accuracy with 0 LLM calls when speed is the priority
  • LLM-driven calibration works: Natural categories emerge without hardcoding
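A minimal sketch of the three-tier routing these learnings describe; `rules`, `ml_model`, and `llm` are assumed interfaces, not existing classes:

```python
def route(email, rules, ml_model, llm=None, threshold=0.55):
    """Rules first, then ML; fall back to the LLM only below threshold."""
    rule_hit = rules.match(email)               # assumed rule-engine API
    if rule_hit is not None:
        return rule_hit, 1.0, "rule"

    category, confidence = ml_model.predict(email)
    if confidence >= threshold or llm is None:  # llm=None ~ --no-llm-fallback
        return category, confidence, "ml"

    return llm.classify(email), confidence, "llm"
```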
-
- -

✅ Ready for Production?

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Component | Status | Blocker |
|-----------|--------|---------|
| Core ML Pipeline | ✅ Ready | None |
| LLM Calibration | ✅ Ready | None |
| Category Verification | ✅ Ready | None |
| Fast ML-Only Mode | ✅ Ready | None |
| Enron Provider | ✅ Ready | None (test only) |
| Gmail Provider | ⚠️ Needs implementation | OAuth2 + API calls |
| IMAP Provider | ⚠️ Needs implementation | IMAP library integration |
| Email Syncing | ❌ Not implemented | Apply labels/move emails |
| Tests | ⚠️ Minimal coverage | Need comprehensive tests |
| Documentation | ✅ Excellent | Need README.md |
- -

Verdict: MVP is production-ready for Enron dataset testing. Need Gmail/IMAP providers for real-world use.

- - - - diff --git a/docs/REPORT_FORMAT.md b/docs/REPORT_FORMAT.md new file mode 100644 index 0000000..1d1188f --- /dev/null +++ b/docs/REPORT_FORMAT.md @@ -0,0 +1,232 @@ +# Email Classification Report Format + +This document explains the HTML report generation system, its data sources, and how to customize it. + +## Overview + +The report generator creates a static HTML file from classification results. It requires enriched `results.json` with email metadata (subject, sender, date, etc.) - not just classification data. + +## Files Involved + +| File | Purpose | +|------|---------| +| `tools/generate_html_report.py` | Main report generator script | +| `src/cli.py` | Classification CLI - outputs enriched `results.json` | +| `src/export/exporter.py` | Legacy exporter (JSON/CSV) - not used for HTML | + +## Data Flow + +``` +Email Source (.eml/.msg files) + ↓ + src/cli.py (classification) + ↓ + results.json (enriched with metadata) + ↓ + tools/generate_html_report.py + ↓ + report.html (static, self-contained) +``` + +## Usage + +### Generate Report + +```bash +python tools/generate_html_report.py \ + --input /path/to/results.json \ + --output /path/to/report.html +``` + +If `--output` is omitted, creates `report.html` in same directory as input. + +### Full Workflow + +```bash +# 1. Classify emails +python -m src.cli run \ + --source local \ + --directory "/path/to/emails" \ + --output "/path/to/output" \ + --no-llm-fallback + +# 2. Generate report +python tools/generate_html_report.py \ + --input "/path/to/output/results.json" +``` + +## results.json Format + +The report generator expects this structure: + +```json +{ + "metadata": { + "total_emails": 801, + "accuracy_estimate": 0.55, + "classification_stats": { + "rule_matched": 9, + "ml_classified": 468, + "llm_classified": 0, + "needs_review": 324 + }, + "generated_at": "2025-11-28T02:34:00.680196", + "source": "local", + "source_path": "/path/to/emails" + }, + "classifications": [ + { + "email_id": "unique_id.eml", + "subject": "Email subject line", + "sender": "sender@example.com", + "sender_name": "Sender Name", + "date": "2023-04-13T09:43:29+10:00", + "has_attachments": false, + "category": "Work", + "confidence": 0.81, + "method": "ml" + } + ] +} +``` + +### Required Fields + +| Field | Type | Description | +|-------|------|-------------| +| `email_id` | string | Unique identifier (usually filename) | +| `subject` | string | Email subject line | +| `sender` | string | Sender email address | +| `category` | string | Assigned category | +| `confidence` | float | Classification confidence (0-1) | +| `method` | string | Classification method: `ml`, `rule`, or `llm` | + +### Optional Fields + +| Field | Type | Description | +|-------|------|-------------| +| `sender_name` | string | Display name of sender | +| `date` | string | ISO 8601 date string | +| `has_attachments` | boolean | Whether email has attachments | + +## Report Sections + +### 1. Header +- Report title +- Generation timestamp +- Source info +- Total email count + +### 2. Stats Grid +- Total emails +- Number of categories +- High confidence count (>=70%) +- Unique sender domains + +### 3. Category Distribution +- Horizontal bar chart +- Count and percentage per category +- Sorted by count (descending) + +### 4. Classification Methods +- Breakdown of ML vs Rule vs LLM +- Shows which method handled what percentage + +### 5. Confidence Distribution +- High (>=70%): Green +- Medium (50-70%): Yellow +- Low (<50%): Red + +### 6. 
Top Senders +- Top 20 senders by email count +- Grid layout + +### 7. Email Tables (Tabbed) +- "All" tab shows all emails +- Category tabs filter by category +- Search box filters by subject/sender +- Columns: Date, Subject, Sender, Category, Confidence, Method +- Sorted by date (newest first) +- Attachment indicator (📎) + +## Customization + +### Changing Colors + +Edit the CSS variables in `generate_html_report.py`: + +```css +:root { + --bg-primary: #1a1a2e; /* Main background */ + --bg-secondary: #16213e; /* Card backgrounds */ + --bg-card: #0f3460; /* Nested elements */ + --text-primary: #eee; /* Main text */ + --text-secondary: #aaa; /* Muted text */ + --accent: #e94560; /* Accent color (red) */ + --accent-hover: #ff6b6b; /* Accent hover */ + --success: #00d9a5; /* Green (high confidence) */ + --warning: #ffc107; /* Yellow (medium confidence) */ + --border: #2a2a4a; /* Border color */ +} +``` + +### Light Theme Example + +```css +:root { + --bg-primary: #f5f5f5; + --bg-secondary: #ffffff; + --bg-card: #e8e8e8; + --text-primary: #333; + --text-secondary: #666; + --accent: #2563eb; + --accent-hover: #3b82f6; + --success: #10b981; + --warning: #f59e0b; + --border: #d1d5db; +} +``` + +### Adding New Sections + +1. Add data extraction in `generate_html_report()` function +2. Add HTML section in the main template string +3. Style with existing CSS classes or add new ones + +### Adding New Table Columns + +1. Modify `generate_email_row()` function +2. Add `` in table header +3. Add `` in row template + +## Performance Notes + +- Report is fully static (no server required) +- JavaScript is minimal (tab switching, search filtering) +- Handles 1000+ emails without performance issues +- For 10k+ emails, consider pagination (not yet implemented) + +## Future Enhancements (TODO) + +- [ ] Pagination for large datasets +- [ ] Export to PDF option +- [ ] Configurable color themes via CLI +- [ ] Column sorting (click headers) +- [ ] Date range filter +- [ ] Sender domain grouping +- [ ] Category confidence heatmap +- [ ] Email body preview on hover + +## Troubleshooting + +### "KeyError: 'subject'" +Results.json lacks email metadata. Re-run classification with latest cli.py. + +### Empty tables +Check that results.json has `classifications` array with data. + +### Dates showing "N/A" +Date parsing failed. Check date format in results.json is ISO 8601. + +### Search not working +JavaScript error. Check browser console. Ensure no HTML entities in data. diff --git a/docs/RESEARCH_FINDINGS.md b/docs/RESEARCH_FINDINGS.md deleted file mode 100644 index ce86883..0000000 --- a/docs/RESEARCH_FINDINGS.md +++ /dev/null @@ -1,419 +0,0 @@ -# EMAIL SORTER - RESEARCH FINDINGS - -Date: 2024-10-21 -Research Phase: Complete - ---- - -## SEARCH SUMMARY - -We conducted web research on: -1. Email classification benchmarks (2024) -2. XGBoost vs LightGBM for embeddings and mixed features -3. Competition analysis (existing email organizers) -4. Gradient boosting with embeddings + categorical features - ---- - -## 1. 
EMAIL CLASSIFICATION BENCHMARKS (2024) - -### Key Findings - -**Enron Dataset Performance:** -- Traditional ML (SVM, Random Forest): **95-98% accuracy** -- Deep Learning (DNN-BiLSTM): **98.69% accuracy** -- Transformer models (BERT, RoBERTa, DistilBERT): **~99% accuracy** -- LLMs (GPT-4): **99.7% accuracy** (phishing detection) -- Ensemble stacking methods: **98.8% accuracy**, F1: 98.9% - -**Zero-Shot LLM Performance:** -- Flan-T5: **94% accuracy**, F1: 90% -- GPT-4: **97% accuracy**, F1: 95% - -**Key insight:** Modern ML methods can achieve 95-98% accuracy on email classification. Our hybrid target of 94-96% is realistic and competitive. - -### Dataset Details - -- **Enron Email Dataset**: 500,000+ emails from 150 employees -- **EnronQA benchmark**: 103,638 emails with 528,304 Q&A pairs -- **AESLC**: Annotated Enron Subject Line Corpus (for summarization) - -### Implications for Our System - -- Our 94-96% target is achievable and competitive -- LightGBM + embeddings should hit 92-95% easily -- LLM review for 5-10% uncertain cases will push us to upper range -- Attachment analysis is a differentiator (not tested in benchmarks) - ---- - -## 2. LIGHTGBM VS XGBOOST FOR HYBRID FEATURES - -### Decision: LightGBM WINS 🏆 - -| Feature | LightGBM | XGBoost | Winner | -|---------|----------|---------|--------| -| **Categorical handling** | Native support | Needs encoding | ✅ LightGBM | -| **Speed** | 2-5x faster | Baseline | ✅ LightGBM | -| **Memory** | Very efficient | Standard | ✅ LightGBM | -| **Accuracy** | Equivalent | Equivalent | Tie | -| **Mixed features** | 4x speedup | Slower | ✅ LightGBM | - -### Key Advantages of LightGBM - -1. **Native Categorical Support** - - LightGBM splits categorical features by equality - - No need for one-hot encoding - - Avoids dimensionality explosion - - XGBoost requires manual encoding (label, mean, or one-hot) - -2. **Speed Performance** - - 2-5x faster than XGBoost in general - - **4x speedup** on datasets with categorical features - - Same AUC performance, drastically better speed - -3. **Memory Efficiency** - - Preferable for large, sparse datasets - - Better for memory-constrained environments - -4. **Embedding Compatibility** - - Handles dense numerical features (embeddings) excellently - - Native categorical handling for mixed feature types - - Perfect for our hybrid approach - -### Research Quote - -> "LightGBM is significantly faster than XGBoost but delivers almost equivalent performance. In tests, both algorithms achieve pretty much the same AUC, but LightGBM runs from 2 to 5 times faster." - -### Implications for Our System - -**Perfect for our hybrid features:** -```python -features = { - 'embeddings': [384 dense numerical], # ✅ LightGBM handles - 'patterns': [20 boolean/numerical], # ✅ LightGBM handles - 'sender_type': 'corporate', # ✅ LightGBM native categorical - 'time_of_day': 'morning', # ✅ LightGBM native categorical -} -# No encoding needed! 4x faster than XGBoost with encoding -``` - ---- - -## 3. 
COMPETITION ANALYSIS - -### Cloud-Based Email Organizers (2024) - -| Tool | Price | Features | Privacy | Accuracy Estimate | -|------|-------|----------|---------|-------------------| -| **SaneBox** | $7-15/mo | AI filtering, smart folders | ❌ Cloud | ~85% | -| **Clean Email** | $10-30/mo | 30+ smart filters, bulk ops | ❌ Cloud | ~80% | -| **Spark** | Free/Paid | Smart inbox, categorization | ❌ Cloud | ~75% | -| **EmailTree.ai** | Enterprise | NLP classification, routing | ❌ Cloud | ~90% | -| **Mailstrom** | $30-50/yr | Bulk analysis, categorization | ❌ Cloud | ~70% | - -### Key Features They Offer - -**Common capabilities:** -- Automatic categorization (newsletters, social, etc.) -- Smart folders based on sender/topic -- Bulk operations (archive, delete) -- Unsubscribe management -- Search and filter - -**What they DON'T offer:** -- ❌ Local processing (all require cloud upload) -- ❌ Attachment content analysis -- ❌ One-time cleanup (all are subscriptions) -- ❌ Offline capability -- ❌ Custom LLM integration -- ❌ Open source / distributable - -### Our Competitive Advantages - -✅ **100% LOCAL** - No data leaves the machine -✅ **Privacy-first** - Perfect for business owners with sensitive data -✅ **One-time use** - No subscription, pay per job or DIY -✅ **Attachment analysis** - Extract and classify PDF/DOCX content -✅ **Customizable** - Adapts to each inbox via calibration -✅ **Open source potential** - Distributable as Python wheel -✅ **Offline capable** - Works without internet after setup - -### Market Gap Identified - -**Target customers:** -- Self-employed / business owners with 10k-100k+ emails -- Can't/won't upload to cloud (privacy, GDPR, security concerns) -- Want one-time cleanup, not ongoing subscription -- Tech-savvy enough to run Python tool or hire someone to run it -- Have sensitive business correspondence, invoices, contracts - -**Pain point:** -> "I've thought about just deleting it all, but there's some stuff I need to keep..." - -**Our solution:** -- Local processing (100% private) -- Smart classification (94-96% accurate) -- Attachment analysis (find those invoices!) -- One-time fee or DIY - -**Pricing comparison:** -- SaneBox: $120-180/year subscription -- Clean Email: $120-360/year subscription -- **Us**: $50-200 one-time job OR free (DIY wheel) - ---- - -## 4. GRADIENT BOOSTING WITH EMBEDDINGS - -### Key Finding: CatBoost Has Embedding Support - -**GB-CENT Model** (Gradient Boosted Categorical Embedding and Numerical Trees): -- Combines latent factor embeddings with tree components -- Handles categorical features via low-dimensional representation -- Captures nonlinear interactions of numerical features -- Best of both worlds approach - -**CatBoost's "killer feature":** -> "CatBoost has a killer feature that knows how to work with embeddings, though this is not well-documented." 
- -**Performance insights:** -- Embeddings both as a feature AND as separate numerical features → best quality -- Native categorical handling has slight edge over encoded approaches -- One-hot encoding generally performs poorly (especially with limited tree depth) - -### Implications for Our System - -**LightGBM strategy (validated by research):** -```python -import lightgbm as lgb - -# Combine embeddings + categorical features -X = np.concatenate([ - embeddings, # 384 dense numerical - pattern_booleans, # 20 numerical (0/1) - structural_numerical # 10 numerical (counts, lengths) -], axis=1) - -# Specify categorical features by name -categorical_features = ['sender_domain_type', 'time_of_day', 'day_of_week'] - -model = lgb.LGBMClassifier( - categorical_feature=categorical_features, # Native handling - n_estimators=200, - learning_rate=0.1, - max_depth=8 -) - -model.fit(X, y) -``` - -**Why this works:** -- LightGBM handles embeddings (dense numerical) excellently -- Native categorical handling for domain_type, time_of_day, etc. -- No encoding overhead (faster, less memory) -- Research shows slight accuracy edge over encoded approaches - ---- - -## 5. SENTENCE EMBEDDINGS FOR EMAIL - -### all-MiniLM-L6-v2 - The Sweet Spot - -**Model specs:** -- Size: 23MB (tiny!) -- Dimensions: 384 (vs 768 for larger models) -- Speed: ~100 emails/sec on CPU -- Accuracy: 85-95% on email/text classification tasks -- Pretrained on 1B+ sentence pairs - -**Why it's perfect for us:** -- Small enough to bundle with wheel distribution -- Fast on CPU (no GPU required) -- Semantic understanding (handles synonyms, paraphrasing) -- Works with short text (emails are perfect) -- No fine-tuning needed (pretrained is excellent) - -### Structured Embeddings (Our Innovation) - -Instead of naive embedding: -```python -# BAD -text = f"{subject} {body}" -embedding = model.encode(text) -``` - -**Our approach (parameterized headers):** -```python -# GOOD - gives model rich context -text = f"""[EMAIL_METADATA] -sender_type: corporate -has_attachments: true -[DETECTED_PATTERNS] -has_otp: false -has_invoice: true -[CONTENT] -subject: {subject} -body: {body[:300]} -""" -embedding = model.encode(text) -``` - -**Research-backed benefit:** 5-10% accuracy boost from structured context - ---- - -## 6. ATTACHMENT ANALYSIS (COMPETITIVE ADVANTAGE) - -### What Competitors Do - -**Most tools:** -- Note "has attachment: true/false" -- Maybe detect attachment type (PDF, DOCX, etc.) 
-- **DO NOT** extract or analyze attachment content - -### What We Can Do - -**Simple extraction (fast, high value):** -```python -if attachment_type == 'pdf': - text = extract_pdf_text(attachment) # PyPDF2 library - - # Pattern matching in PDF - has_invoice = 'invoice' in text.lower() - has_account_number = bool(re.search(r'account\s*#?\d+', text)) - has_total_amount = bool(re.search(r'total.*\$\d+', text, re.I)) - - # Boost classification confidence - if has_invoice and has_account_number: - category = 'transactional' # 99% confidence - -if attachment_type == 'docx': - text = extract_docx_text(attachment) # python-docx library - word_count = len(text.split()) - - # Long documents might be contracts, reports - if word_count > 1000: - category_hint = 'work' -``` - -**Business owner value:** -- "Find all invoices" → includes PDFs with invoice content -- "Financial documents" → PDFs with account numbers -- "Contracts" → DOCX files with legal terms -- "Reports" → Long DOCX or PDF files - -**Implementation:** -- Use PyPDF2 for PDFs (<5MB size limit) -- Use python-docx for Word docs -- Use openpyxl for simple Excel files -- Flag complex/large attachments for review - ---- - -## 7. PERFORMANCE OPTIMIZATION - -### Batching Strategy (Critical) - -**Embedding generation bottleneck:** -- Sequential: 80,000 emails × 10ms = 13 minutes -- Batched (128 emails): 80,000 ÷ 128 × 100ms = ~1 minute - -**LLM processing optimization:** -- Don't send 1500 individual requests during calibration -- Batch 10-20 emails per prompt → 75-150 requests instead -- Compress sample if needed (1500 → 500 smarter selection) - -### Expected Performance (Revised) - -``` -80,000 emails breakdown: -├─ Calibration (500 compressed samples): 2-3 min -├─ Pattern detection (all 80k): 10 sec -├─ Embedding generation (batched): 1-2 min -├─ LightGBM classification: 3 sec -├─ Hard rules (10%): instant -├─ LLM review (5%, batched): 4 min -└─ Export: 2 min - -Total: ~10-12 minutes (optimistic) -Total: ~15-20 minutes (realistic with overhead) -``` - ---- - -## 8. SECURITY & PRIVACY ADVANTAGES - -### Why Local Processing Matters - -**GDPR considerations:** -- Cloud upload = data processing agreement needed -- Local processing = no third-party involvement -- Business emails often contain sensitive data - -**Privacy concerns:** -- Client lists, pricing, contracts -- Financial information, invoices -- Personal health information (if medical business) -- Legal correspondence - -**Our advantage:** -- 100% local processing -- No data retention -- No cloud storage -- Fresh repo per job (isolation) - ---- - -## CONCLUSIONS & RECOMMENDATIONS - -### 1. Use LightGBM (Not XGBoost) -- 2-5x faster -- Native categorical handling -- Perfect for our hybrid features -- Research-validated choice - -### 2. Structured Embeddings Work -- Parameterized headers boost accuracy 5-10% -- Guide model with detected patterns -- Research-backed technique - -### 3. Attachment Analysis is Differentiator -- Competitors don't do this -- High value for business owners -- Simple to implement (PyPDF2, python-docx) - -### 4. Qwen 3 Model Strategy -- **qwen3:4b** for calibration (better discovery) -- **qwen3:1.7b** for bulk review (faster) -- Single config file for easy swapping - -### 5. Market Gap Validated -- No local, privacy-first alternatives -- Business owners have this pain point -- One-time cleanup vs subscription -- 94-96% accuracy is competitive - -### 6. 
Performance Target Achievable -- 15-20 min for 80k emails (realistic) -- 94-96% accuracy (research-backed) -- <5% need LLM review -- Competitive with cloud tools - ---- - -## NEXT STEPS - -1. ✅ Research complete -2. ✅ Architecture validated -3. ⏭ Build core infrastructure -4. ⏭ Implement hybrid features -5. ⏭ Create LightGBM classifier -6. ⏭ Add LLM providers -7. ⏭ Build test harness -8. ⏭ Package as wheel -9. ⏭ Test on real inbox - ---- - -**Research phase complete. Architecture validated. Ready to build.** diff --git a/docs/ROOT_CAUSE_ANALYSIS.md b/docs/ROOT_CAUSE_ANALYSIS.md deleted file mode 100644 index 752c25d..0000000 --- a/docs/ROOT_CAUSE_ANALYSIS.md +++ /dev/null @@ -1,319 +0,0 @@ -# Root Cause Analysis: Category Explosion & Over-Confidence - -**Date:** 2025-10-24 -**Run:** 100k emails, qwen3:4b model -**Issue:** Model trained on 29 categories instead of expected 11, with extreme over-confidence - ---- - -## Executive Summary - -The 100k classification run technically succeeded (92.1% accuracy estimate) but revealed critical architectural issues: - -1. **Category Explosion:** 29 training categories vs expected 11 -2. **Duplicate Categories:** Work/work, Administrative/auth, finance/Financial -3. **Extreme Over-Confidence:** 99%+ classifications at 1.0 confidence -4. **Category Leakage:** Hardcoded categories leaked into LLM-discovered categories - ---- - -## The Bug - -### Location -[src/calibration/workflow.py:110](src/calibration/workflow.py#L110) - -```python -all_categories = list(set(self.categories) | set(discovered_categories.keys()) | label_categories) -``` - -### What Happened - -The workflow merges THREE category sources: - -1. **`self.categories`** - 12 hardcoded categories from `config/categories.yaml`: - - junk, transactional, auth, newsletters, social, automated - - conversational, work, personal, finance, travel, unknown - -2. **`discovered_categories.keys()`** - 11 LLM-discovered categories: - - Work, Financial, Administrative, Operational, Meeting - - Technical, External, Announcements, Urgent, Miscellaneous, Forwarded - -3. **`label_categories`** - Additional categories from LLM labels: - - Bowl Pool 2000, California Market, Prehearing, Change, Monitoring - - Information - -### Result: 29 Total Categories - -``` -1. Administrative (LLM discovered) -2. Announcements (LLM discovered) -3. Bowl Pool 2000 (LLM label - weird) -4. California Market (LLM label - too specific) -5. Change (LLM label - vague) -6. External (LLM discovered) -7. Financial (LLM discovered) -8. Forwarded (LLM discovered) -9. Information (LLM label - vague) -10. Meeting (LLM discovered) -11. Miscellaneous (LLM discovered) -12. Monitoring (LLM label - too specific) -13. Operational (LLM discovered) -14. Prehearing (LLM label - too specific) -15. Technical (LLM discovered) -16. Urgent (LLM discovered) -17. Work (LLM discovered) -18. auth (hardcoded) -19. automated (hardcoded) -20. conversational (hardcoded) -21. finance (hardcoded) -22. junk (hardcoded) -23. newsletters (hardcoded) -24. personal (hardcoded) -25. social (hardcoded) -26. transactional (hardcoded) -27. travel (hardcoded) -28. unknown (hardcoded) -29. work (hardcoded) -``` - -### Duplicates Identified - -- **Work (LLM) vs work (hardcoded)** - 14,223 vs 368 emails -- **Financial (LLM) vs finance (hardcoded)** - 5,943 vs 0 emails -- **Administrative (LLM) vs auth (hardcoded)** - 67,195 vs 37 emails - ---- - -## Impact Analysis - -### 1. 
Category Distribution (100k Results) - -| Category | Count | Confidence | Source | -|----------|-------|------------|--------| -| Administrative | 67,195 | 1.000 | LLM discovered | -| Work | 14,223 | 1.000 | LLM discovered | -| Meeting | 7,785 | 1.000 | LLM discovered | -| Financial | 5,943 | 1.000 | LLM discovered | -| Operational | 3,274 | 1.000 | LLM discovered | -| junk | 394 | 0.960 | Hardcoded | -| work | 368 | 0.950 | Hardcoded | -| Miscellaneous | 238 | 1.000 | LLM discovered | -| Technical | 193 | 1.000 | LLM discovered | -| External | 137 | 1.000 | LLM discovered | -| transactional | 44 | 0.970 | Hardcoded | -| auth | 37 | 0.990 | Hardcoded | -| unknown | 23 | 0.500 | Hardcoded | -| Others | <20 each | Various | Mixed | - -### 2. Extreme Over-Confidence - -- **67,195 emails** classified as "Administrative" with **1.0 confidence** -- **99.9%** of all classifications have confidence >= 0.95 -- This is unrealistic - suggests overfitting or poor calibration - -### 3. Why It Still "Worked" - -- LLM-discovered categories (uppercase) handled 99%+ of emails -- Hardcoded categories (lowercase) mostly unused except for rules -- Model learned both sets but strongly preferred LLM categories -- Enron dataset doesn't match hardcoded categories well - ---- - -## Why This Happened - -### Design Intent vs Reality - -**Original Design:** -- Hardcoded categories in `categories.yaml` for rule-based matching -- LLM discovers NEW categories during calibration -- Merge both for flexible classification - -**Reality:** -- Hardcoded categories leak into ML training -- Creates duplicate concepts (Work vs work) -- LLM labels include one-off categories (Bowl Pool 2000) -- No deduplication or conflict resolution - -### The Workflow Path - -``` -1. CLI loads hardcoded categories from categories.yaml - → ['junk', 'transactional', 'auth', ... 'work', 'finance', 'unknown'] - -2. Passes to CalibrationWorkflow.__init__(categories=...) - → self.categories = list(categories.keys()) - -3. LLM discovers categories from emails - → {'Work': 'business emails', 'Financial': 'budgets', ...} - -4. Consolidation reduces duplicates (within LLM categories only) - → But doesn't see hardcoded categories - -5. Merge ALL sources at workflow.py:110 - → Hardcoded + Discovered + Label anomalies = 29 categories - -6. Trainer learns all 29 categories - → Model becomes confused but weights LLM categories heavily -``` - ---- - -## Spot-Check Findings - -### High Confidence Samples (Correct) - -✅ **Sample 1:** "i'll get the movie and wine. my suggestion is something from central market" - - Classified: Administrative (1.0) - - **Assessment:** Questionable - looks more personal - -✅ **Sample 2:** "Can you spell S-N-O-O-T-Y?" 
- - Classified: Administrative (1.0) - - **Assessment:** Wrong - clearly conversational/personal - -✅ **Sample 3:** "MEETING TONIGHT - 6:00 pm Central Time at The Houstonian" - - Classified: Meeting (1.0) - - **Assessment:** Correct - -### Low Confidence Samples (Unknown) - -⚠️ **All low confidence samples classified as "unknown" (0.500)** -- These fell back to LLM -- LLM failed to classify (returned unknown) -- Actual content: Legitimate business emails about deferrals, power units - -### Category Anomalies - -❌ **"California Market" (6 emails, 1.0 confidence)** -- Too specific - shouldn't be a standalone category -- Should be "Work" or "External" - -❌ **"Bowl Pool 2000" (exists in training set)** -- One-off event category -- Should never have been kept - ---- - -## Performance Impact - -### What Went Right - -- **ML handled 99.1%** of emails (99,134 / 100,000) -- **Only 31 fell to LLM** (0.03%) -- Fast classification (~3 minutes for 100k) -- Discovered categories are semantically good - -### What Went Wrong - -- **Unrealistic confidence** - Almost everything is 1.0 -- **Category pollution** - 29 instead of 11 -- **Duplicates** - Work/work, finance/Financial -- **No calibration** - Model confidence not properly calibrated -- **Hardcoded categories unused** - 368 "work" vs 14,223 "Work" - ---- - -## Root Causes - -### 1. Architectural Confusion - -**Two competing philosophies:** -- **Rule-based system:** Use hardcoded categories with pattern matching -- **LLM-driven system:** Discover categories from data - -**Result:** They interfere with each other instead of complementing - -### 2. Missing Deduplication - -The workflow.py:110 line does a simple set union without: -- Case normalization -- Semantic similarity checking -- Conflict resolution -- Priority rules - -### 3. No Consolidation Across Sources - -The LLM consolidation step (line 91-100) only consolidates within discovered categories. It doesn't: -- Check against hardcoded categories -- Merge similar concepts -- Remove one-off labels - -### 4. Poor Category Cache Design - -The category cache (src/models/category_cache.json) saves LLM categories but: -- Doesn't deduplicate against hardcoded categories -- Allows case-sensitive duplicates -- No validation of category quality - ---- - -## Recommendations - -### Immediate Fixes - -1. **Remove hardcoded categories from ML training** - - Use them ONLY for rule-based matching - - Don't merge into `all_categories` for training - - Let LLM discover all ML categories - -2. **Add case-insensitive deduplication** - - Normalize to title case - - Check semantic similarity - - Merge duplicates before training - -3. **Filter label anomalies** - - Reject categories with <10 training samples - - Reject overly specific categories (Bowl Pool 2000) - - LLM review step for quality - -4. **Calibrate model confidence** - - Use temperature scaling or Platt scaling - - Ensure confidence reflects actual accuracy - -### Architecture Decision - -**Option A: Rule-Based + ML (Current)** -- Keep hardcoded categories for RULES ONLY -- LLM discovers categories for ML ONLY -- Never merge the two - -**Option B: Pure LLM Discovery (Recommended)** -- Remove categories.yaml entirely -- LLM discovers ALL categories -- Rules can still match on keywords but don't define categories - -**Option C: Hybrid with Priority** -- Define 3-5 HIGH-PRIORITY hardcoded categories (junk, auth, transactional) -- Let LLM discover everything else -- Clear hierarchy: Rules → Hardcoded ML → Discovered ML - ---- - -## Next Steps - -1. 
**Decision:** Choose architecture (A, B, or C above) -2. **Fix workflow.py:110** - Implement chosen strategy -3. **Add deduplication logic** - Case-insensitive, semantic matching -4. **Rerun calibration** - Clean 250-sample run -5. **Validate results** - Ensure clean categories -6. **Fix confidence** - Add calibration layer - ---- - -## Files to Modify - -1. [src/calibration/workflow.py:110](src/calibration/workflow.py#L110) - Category merging logic -2. [src/calibration/llm_analyzer.py](src/calibration/llm_analyzer.py) - Add cross-source consolidation -3. [src/cli.py:70](src/cli.py#L70) - Decide whether to load hardcoded categories -4. [config/categories.yaml](config/categories.yaml) - Clarify purpose (rules only?) -5. [src/calibration/trainer.py](src/calibration/trainer.py) - Add confidence calibration - ---- - -## Conclusion - -The system technically worked - it classified 100k emails with high ML efficiency. However, the category explosion and over-confidence issues reveal fundamental architectural problems that need resolution before production use. - -The core question: **Should hardcoded categories participate in ML training at all?** - -My recommendation: **No.** Use them for rules only, let LLM discover ML categories cleanly. diff --git a/docs/SESSION_HANDOVER_20251128.md b/docs/SESSION_HANDOVER_20251128.md new file mode 100644 index 0000000..b466622 --- /dev/null +++ b/docs/SESSION_HANDOVER_20251128.md @@ -0,0 +1,128 @@ +# Session Handover Report - Email Sorter +**Date:** 2025-11-28 +**Session ID:** eb549838-a153-48d1-ae5d-891e0e83108f + +--- + +## What Was Done This Session + +### 1. Classified 801 emails from brett-gmail using three methods: + +| Method | Accuracy | Time | Output Location | +|--------|----------|------|-----------------| +| ML-Only | 54.9% | ~5 sec | `/home/bob/Documents/Email Manager/emails/brett-gm-md/` | +| ML+LLM | 93.3% | ~3.5 min | `/home/bob/Documents/Email Manager/emails/brett-gm-llm/` | +| Manual Agent | 99.8% | ~25 min | Same as ML-only + analysis files | + +### 2. Created/Modified Files + +**New Files:** +- `tools/generate_html_report.py` - HTML report generator +- `tools/brett_gmail_analyzer.py` - Custom dataset analyzer +- `data/brett_gmail_analysis.json` - Analysis output +- `docs/REPORT_FORMAT.md` - Report system documentation +- `docs/CLASSIFICATION_METHODS_COMPARISON.md` - Method comparison +- `docs/PROJECT_ROADMAP_2025.md` - Full roadmap and learnings +- `/home/bob/Documents/Email Manager/emails/brett-gm-md/BRETT_GMAIL_ANALYSIS_REPORT.md` - Analysis report +- `/home/bob/Documents/Email Manager/emails/brett-gm-md/report.html` - HTML report (ML-only) +- `/home/bob/Documents/Email Manager/emails/brett-gm-llm/report.html` - HTML report (ML+LLM) + +**Modified Files:** +- `src/cli.py` - Added `--force-ml` flag, enriched results.json with email metadata +- `src/llm/openai_compat.py` - Removed API key requirement for local vLLM +- `config/default_config.yaml` - Changed LLM to openai provider on localhost:11433 + +### 3. Key Configuration Changes + +```yaml +# config/default_config.yaml - LLM now uses vLLM endpoint +llm: + provider: "openai" + openai: + base_url: "http://localhost:11433/v1" + api_key: "not-needed" + classification_model: "qwen3-coder-30b" +``` + +--- + +## Key Findings + +1. **ML pipeline overkill for <5000 emails** - Agent analysis gives better accuracy in similar time +2. **Sender domain is strongest signal** - Top 5 senders = 47.5% of emails +3. 
**Categories should serve downstream routing** - Not human labels, but processing decisions +4. **Risk-based accuracy** - Personal emails need high accuracy, junk can tolerate errors +5. **This tool = triage** - Sorts into buckets for other specialized tools + +--- + +## Project Scope (Agreed with User) + +**Email Sorter IS:** +- Bulk classification/triage tool +- Router to downstream specialized tools +- Part of larger email processing ecosystem + +**Email Sorter IS NOT:** +- Complete email management solution +- Spam filter (trust Gmail/Outlook) +- Final destination for emails + +--- + +## Recommended Dataset Size Routing + +| Size | Method | +|------|--------| +| <500 | Agent-only | +| 500-5000 | Agent pre-scan + ML | +| >5000 | ML pipeline | + +--- + +## Background Processes + +There are stale background bash processes (f8678e, 0a3549, 0d150e) from classification runs. These completed successfully and can be ignored. + +--- + +## What Needs Doing Next + +1. **Review docs/** - All learnings are in PROJECT_ROADMAP_2025.md +2. **Phase 1 development** - Dataset size routing, sender-first classification +3. **Agent pre-scan module** - 10-15 min discovery phase before ML + +--- + +## User Preferences (from CLAUDE.md) + +- NO emojis in commits +- NO "Generated with Claude" attribution +- Use tools (Read/Edit/Grep) not bash commands for file ops +- Virtual environment required for Python +- TTS available via `fss-speak` (single line messages only, no newlines) + +--- + +## Quick Start for Next Agent + +```bash +cd /MASTERFOLDER/Tools/email-sorter +source venv/bin/activate + +# Read the roadmap +cat docs/PROJECT_ROADMAP_2025.md + +# Run classification +python -m src.cli run --source local \ + --directory "/path/to/emails" \ + --output "/path/to/output" \ + --force-ml --llm-provider openai + +# Generate HTML report +python tools/generate_html_report.py --input /path/to/results.json +``` + +--- + +*Session ended: 2025-11-28 ~03:30 AEDT* diff --git a/docs/START_HERE.md b/docs/START_HERE.md deleted file mode 100644 index 825bd6f..0000000 --- a/docs/START_HERE.md +++ /dev/null @@ -1,324 +0,0 @@ -# EMAIL SORTER - START HERE - -**Welcome to Email Sorter v1.0 - Your Email Classification System** - ---- - -## What Is This? 
- -A **complete email classification system** that: -- Uses hybrid ML/LLM classification for 90-94% accuracy -- Processes emails with smart rules, machine learning, and AI -- Works with Gmail, IMAP, or any email dataset -- Is ready to use **right now** - ---- - -## What You Need to Know - -### ✅ The Good News -- **Framework is 100% complete** - all 16 planned phases are done -- **Ready to use immediately** - with mock model or real model -- **Complete codebase** - 6000+ lines, full type hints, comprehensive logging -- **90% test pass rate** - 27/30 tests passing -- **Comprehensive documentation** - 10 guides covering everything - -### ❌ The Not-So-News -- **Mock model included** - for testing the framework (not for production accuracy) -- **Real model optional** - you choose to train on Enron or download pre-trained -- **Gmail setup optional** - framework works without it -- **LLM integration optional** - graceful fallback if unavailable - ---- - -## Three Ways to Get Started - -### 🟢 Path A: Validate Framework (5 minutes) -Perfect if you want to quickly verify everything works - -```bash -cd "c:/Build Folder/email-sorter" -source venv/Scripts/activate - -# Run tests -pytest tests/ -v - -# Test with mock pipeline -python -m src.cli run --source mock --output test_results/ -``` - -**What you'll learn**: Framework works perfectly with mock model - ---- - -### 🟡 Path B: Integrate Real Model (30-60 minutes) -Perfect if you want actual classification results - -```bash -# Option 1: Train on Enron dataset (recommended) -python -c " -from src.calibration.enron_parser import EnronParser -from src.calibration.trainer import ModelTrainer -from src.classification.feature_extractor import FeatureExtractor - -parser = EnronParser('enron_mail_20150507') -emails = parser.parse_emails(limit=5000) -extractor = FeatureExtractor() -trainer = ModelTrainer(extractor, ['junk', 'transactional', 'auth', 'newsletters', - 'social', 'automated', 'conversational', 'work', - 'personal', 'finance', 'travel', 'unknown']) -results = trainer.train([(e, 'unknown') for e in emails]) -trainer.save_model('src/models/pretrained/classifier.pkl') -" - -# Option 2: Use pre-trained model -python tools/setup_real_model.py --model-path /path/to/model.pkl - -# Verify -python tools/setup_real_model.py --check -``` - -**What you'll get**: Real LightGBM model, automatic classification with 85-90% accuracy - ---- - -### 🔴 Path C: Full Production Deployment (2-3 hours) -Perfect if you want to process Marion's 80k+ emails - -```bash -# 1. Setup Gmail OAuth (download credentials.json, place in project root) - -# 2. Test with 100 emails -python -m src.cli run --source gmail --limit 100 --output test_results/ - -# 3. Process all emails -python -m src.cli run --source gmail --output marion_results/ - -# 4. Check results -cat marion_results/report.txt -``` - -**What you'll get**: All 80k+ emails sorted, labeled, and synced to Gmail - ---- - -## Documentation Map - -| Document | Purpose | When to Read | -|----------|---------|--------------| -| **START_HERE.md** | This file - quick orientation | First (right now!) 
| -| **NEXT_STEPS.md** | Decision tree and action plan | Decide your path | -| **PROJECT_COMPLETE.md** | Final summary and status | Understand scope | -| **COMPLETION_ASSESSMENT.md** | Detailed component review | Deep dive needed | -| **MODEL_INFO.md** | Model usage and training | For model setup | -| **README.md** | Getting started guide | General reference | -| **PROJECT_STATUS.md** | Feature inventory | Full feature list | -| **PROJECT_BLUEPRINT.md** | Original architecture plan | Background context | - ---- - -## Quick Reference Commands - -```bash -# Navigate and activate -cd "c:/Build Folder/email-sorter" -source venv/Scripts/activate - -# Validation -pytest tests/ -v # Run all tests -python -m src.cli test-config # Validate configuration -python -m src.cli test-ollama # Test LLM (if running) -python -m src.cli test-gmail # Test Gmail connection - -# Framework testing -python -m src.cli run --source mock # Test with mock provider - -# Real processing -python -m src.cli run --source gmail --limit 100 # Test with Gmail -python -m src.cli run --source gmail --output results/ # Full processing - -# Model management -python tools/setup_real_model.py --check # Check model status -python tools/setup_real_model.py --model-path FILE # Install model -python tools/download_pretrained_model.py --url URL # Download model -``` - ---- - -## Common Questions - -### Q: Do I need to do anything right now? -**A:** No! But you can run `pytest tests/ -v` to verify everything works. - -### Q: Is the framework ready to use? -**A:** YES! All 16 phases are complete. 90% test pass rate. Ready to use. - -### Q: How do I get better accuracy than the mock model? -**A:** Train a real model or download pre-trained. See Path B above. - -### Q: Does this work without Gmail? -**A:** YES! Use mock provider or IMAP provider instead. - -### Q: Can I use it right now? -**A:** YES! With mock model. For real accuracy, integrate real model (Path B). - -### Q: How long to process all 80k emails? -**A:** About 20-30 minutes after setup. Path C shows how. - -### Q: Where do I start? -**A:** Choose your path above. Path A (5 min) is the quickest. 
- ---- - -## What Each Path Gets You - -### Path A Results (5 minutes) -- ✅ Confirm framework works -- ✅ See mock classification in action -- ✅ Verify all tests pass -- ❌ Not real-world accuracy yet - -### Path B Results (30-60 minutes) -- ✅ Real LightGBM model trained -- ✅ 85-90% classification accuracy -- ✅ Ready for real data -- ❌ Haven't processed real emails yet - -### Path C Results (2-3 hours) -- ✅ All emails classified -- ✅ 90-94% overall accuracy -- ✅ Synced to Gmail labels -- ✅ Full deployment complete -- ✅ Marion's 80k+ emails processed - ---- - -## Key Files & Locations - -``` -c:/Build Folder/email-sorter/ - -Core Framework: - src/ Main framework code - classification/ Email classifiers - calibration/ Model training - processing/ Batch processing - llm/ LLM providers - email_providers/ Email sources - export/ Results export - -Data & Models: - enron_mail_20150507/ Real email dataset (already extracted) - src/models/pretrained/ Where real model goes - models/ Alternative model directory - -Tools: - tools/setup_real_model.py Install pre-trained models - tools/download_pretrained_model.py Download models - -Configuration: - config/ YAML configuration - credentials.json (optional) Gmail OAuth - -Testing: - tests/ 23 test cases - logs/ Execution logs -``` - ---- - -## Success Looks Like - -### After Path A (5 min) -``` -✅ 27/30 tests passing -✅ Framework validation complete -✅ Mock pipeline ran successfully -Status: Ready to explore -``` - -### After Path B (30-60 min) -``` -✅ Real model installed -✅ Model check shows: is_mock: False -✅ Ready for real classification -Status: Ready for real data -``` - -### After Path C (2-3 hours) -``` -✅ All 80k emails processed -✅ Gmail labels synced -✅ Results exported and reviewed -✅ Accuracy metrics acceptable -Status: Complete and deployed -``` - ---- - -## One More Thing... - -**This framework is complete and ready to use NOW.** You don't need to: -- Fix anything ✅ -- Add components ✅ -- Change architecture ✅ -- Debug systems ✅ -- Train models (optional) ✅ - -What you CAN do: -- Use it immediately with mock model -- Integrate real model when ready -- Scale to production anytime -- Customize categories and rules -- Deploy to other systems - ---- - -## Your Next Step - -Pick one: - -**🟢 I want to test the framework right now** → Go to Path A (5 min) - -**🟡 I want better accuracy tomorrow** → Go to Path B (30-60 min) - -**🔴 I want all emails processed this week** → Go to Path C (2-3 hours total) - -Or read one of the detailed docs: -- **NEXT_STEPS.md** - Decision tree -- **PROJECT_COMPLETE.md** - Full summary -- **README.md** - Detailed guide - ---- - -## Contact & Support - -If something doesn't work: - -1. Check logs: `tail -f logs/email_sorter.log` -2. Run tests: `pytest tests/ -v` -3. Validate setup: `python -m src.cli test-config` -4. Review docs: See Documentation Map above - -Most issues are covered in the docs! - ---- - -## Quick Stats - -- **Framework Status**: 100% complete -- **Test Pass Rate**: 90% (27/30) -- **Lines of Code**: ~6,000+ production -- **Python Modules**: 38 files -- **Documentation**: 10 guides -- **Ready for**: Immediate use - ---- - -**Ready to get started? Choose your path above and begin! 🚀** - -The framework is done. The tools are ready. The documentation is complete. - -All you need to do is pick a path and start. - -Let's go! 
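Path B's `--check` step reports an `is_mock` flag. If you want the same signal without the helper script, a rough sketch follows; it assumes the model pickle is a dict carrying an `is_mock` key, as the check output above suggests, and is not the tool's actual implementation:

```python
import pickle

# Hypothetical stand-in for `python tools/setup_real_model.py --check`;
# assumes the pickle stores a dict with an "is_mock" flag.
with open("src/models/pretrained/classifier.pkl", "rb") as f:
    model_data = pickle.load(f)

print("is_mock:", model_data.get("is_mock", True))  # False means a real trained model
```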
diff --git a/docs/SYSTEM_FLOW.html b/docs/SYSTEM_FLOW.html
deleted file mode 100644
index f05e877..0000000
--- a/docs/SYSTEM_FLOW.html
+++ /dev/null
@@ -1,493 +0,0 @@

# Email Sorter System Flow Documentation

## 1. Main Execution Flow

```mermaid
flowchart TD
    Start([python -m src.cli run]) --> LoadConfig[Load config/default_config.yaml]
    LoadConfig --> InitProviders[Initialize Email Provider<br>Enron/Gmail/IMAP]
    InitProviders --> FetchEmails[Fetch Emails<br>--limit N]
    FetchEmails --> CheckSize{Email Count?}
    CheckSize -->|"< 1000"| SetMockMode[Set ml_classifier.is_mock = True<br>LLM-only mode]
    CheckSize -->|">= 1000"| CheckModel{Model Exists?}
    CheckModel -->|No model at<br>src/models/pretrained/classifier.pkl| RunCalibration[CALIBRATION PHASE<br>LLM category discovery<br>Train ML model]
    CheckModel -->|Model exists| SkipCalibration[Skip Calibration<br>Load existing model]
    SetMockMode --> SkipCalibration
    RunCalibration --> ClassifyPhase[CLASSIFICATION PHASE]
    SkipCalibration --> ClassifyPhase
    ClassifyPhase --> Loop{For each email}
    Loop --> RuleCheck{Hard rule match?}
    RuleCheck -->|Yes| RuleClassify[Category by rule<br>confidence=1.0<br>method='rule']
    RuleCheck -->|No| MLClassify[ML Classification<br>Get category + confidence]
    MLClassify --> ConfCheck{Confidence >= threshold?}
    ConfCheck -->|Yes| AcceptML[Accept ML result<br>method='ml'<br>needs_review=False]
    ConfCheck -->|No| LowConf[Low confidence detected<br>needs_review=True]
    LowConf --> FlagCheck{--no-llm-fallback?}
    FlagCheck -->|Yes| AcceptMLAnyway[Accept ML anyway<br>needs_review=False]
    FlagCheck -->|No| LLMCheck{LLM available?}
    LLMCheck -->|Yes| LLMReview[LLM Classification<br>~4 seconds<br>method='llm']
    LLMCheck -->|No| AcceptMLAnyway
    RuleClassify --> NextEmail{More emails?}
    AcceptML --> NextEmail
    AcceptMLAnyway --> NextEmail
    LLMReview --> NextEmail
    NextEmail -->|Yes| Loop
    NextEmail -->|No| SaveResults[Save results.json]
    SaveResults --> End([Complete])
    style RunCalibration fill:#ff6b6b
    style LLMReview fill:#ff6b6b
    style SetMockMode fill:#ffd93d
    style FlagCheck fill:#4ec9b0
    style AcceptMLAnyway fill:#4ec9b0
```
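Rendered as code, the routing above is a three-tier cascade. The sketch below is illustrative only; names like `rules.match` and `llm.is_available` are assumptions, not the project's actual API (the real logic lives in the classifier modules referenced in section 5):

```python
def classify_one(email, rules, ml, llm, threshold=0.55, no_llm_fallback=False):
    """Three-tier cascade sketched from the diagram above (hypothetical API)."""
    # Tier 1: hard rules are definitive
    rule_category = rules.match(email)
    if rule_category is not None:
        return {"category": rule_category, "confidence": 1.0, "method": "rule"}

    # Tier 2: ML prediction, accepted when confident enough
    category, confidence = ml.predict(email)
    if confidence >= threshold:
        return {"category": category, "confidence": confidence, "method": "ml"}

    # Tier 3: low confidence falls back to the LLM unless disabled/unavailable
    if no_llm_fallback or not llm.is_available:
        return {"category": category, "confidence": confidence, "method": "ml"}
    return {"category": llm.classify(email), "confidence": confidence, "method": "llm"}
```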

## 2. Calibration Phase Detail (When Triggered)

```mermaid
flowchart TD
    Start([Calibration Triggered]) --> Sample[Stratified Sampling<br>3% of emails<br>min 250, max 1500]
    Sample --> LLMBatch[LLM Category Discovery<br>50 emails per batch]
    LLMBatch --> Batch1[Batch 1: 50 emails<br>~20 seconds]
    Batch1 --> Batch2[Batch 2: 50 emails<br>~20 seconds]
    Batch2 --> BatchN[... N batches<br>For 300 samples: 6 batches]
    BatchN --> Consolidate[LLM Consolidation<br>Merge similar categories<br>~5 seconds]
    Consolidate --> Categories[Final Categories<br>~10-12 unique categories]
    Categories --> Label[Label Training Emails<br>LLM labels each sample<br>~3 seconds per email]
    Label --> Extract[Feature Extraction<br>Embeddings + TF-IDF<br>~0.02 seconds per email]
    Extract --> Train[Train LightGBM Model<br>~5 seconds total]
    Train --> Validate[Validate on 100 samples<br>~2 seconds]
    Validate --> Save[Save Model<br>src/models/calibrated/classifier.pkl]
    Save --> End([Calibration Complete<br>Total time: 15-25 minutes for 10k emails])
    style LLMBatch fill:#ff6b6b
    style Label fill:#ff6b6b
    style Consolidate fill:#ff6b6b
    style Train fill:#4ec9b0
```
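The sampling bounds in the first step amount to a simple clamp. A minimal sketch, using only the 3% / 250 / 1500 figures from the diagram (the real implementation may differ):

```python
def calibration_sample_size(total_emails: int) -> int:
    """Clamp 3% of the mailbox into the [250, 1500] range."""
    return min(1500, max(250, int(0.03 * total_emails)))

# 10,000 emails -> 300 samples -> 6 discovery batches of 50
assert calibration_sample_size(10_000) == 300
```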

## 3. Classification Phase Detail

```mermaid
flowchart TD
    Start([Classification Phase]) --> Email[Get Email]
    Email --> Rules{Check Hard Rules<br>Pattern matching}
    Rules -->|Match| RuleDone[Rule Match<br>~0.001 seconds<br>59 of 10000 emails]
    Rules -->|No match| Embed[Generate Embedding<br>all-minilm:l6-v2<br>~0.02 seconds]
    Embed --> TFIDF[TF-IDF Features<br>~0.001 seconds]
    TFIDF --> MLPredict[ML Prediction<br>LightGBM<br>~0.003 seconds]
    MLPredict --> Threshold{Confidence >= 0.55?}
    Threshold -->|Yes| MLDone[ML Classification<br>7842 of 10000 emails<br>78.4%]
    Threshold -->|No| Flag{--no-llm-fallback?}
    Flag -->|Yes| MLForced[Force ML result<br>No LLM call]
    Flag -->|No| LLM[LLM Classification<br>~4 seconds<br>2099 of 10000 emails<br>21%]
    RuleDone --> Next([Next Email])
    MLDone --> Next
    MLForced --> Next
    LLM --> Next
    style LLM fill:#ff6b6b
    style MLDone fill:#4ec9b0
    style MLForced fill:#ffd93d
```

## 4. Model Loading Logic

```mermaid
flowchart TD
    Start([MLClassifier.__init__]) --> CheckPath{model_path provided?}
    CheckPath -->|Yes| UsePath[Use provided path]
    CheckPath -->|No| Default[Default:<br>src/models/pretrained/classifier.pkl]
    UsePath --> FileCheck{File exists?}
    Default --> FileCheck
    FileCheck -->|Yes| Load[Load pickle file]
    FileCheck -->|No| CreateMock[Create MOCK model<br>Random Forest<br>12 hardcoded categories]
    Load --> ValidCheck{Valid model data?}
    ValidCheck -->|Yes| CheckMock{is_mock flag?}
    ValidCheck -->|No| CreateMock
    CheckMock -->|True| WarnMock[Warn: MOCK model active]
    CheckMock -->|False| RealModel[Real trained model loaded]
    CreateMock --> MockWarnings[Multiple warnings printed<br>NOT for production]
    WarnMock --> Ready[Model Ready]
    RealModel --> Ready
    MockWarnings --> Ready
    Ready --> End([Classification can start])
    style CreateMock fill:#ff6b6b
    style RealModel fill:#4ec9b0
    style WarnMock fill:#ffd93d
```
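A minimal sketch of the load-or-mock fallback above, assuming the pickle holds a dict with an `is_mock` flag; the real logic lives in src/classification/ml_classifier.py:

```python
import logging
import pickle
from pathlib import Path
from typing import Optional

DEFAULT_MODEL = Path("src/models/pretrained/classifier.pkl")

def load_classifier(model_path: Optional[str] = None) -> dict:
    """Load the trained model, or fall back to a mock (sketch, not the real code)."""
    path = Path(model_path) if model_path else DEFAULT_MODEL
    if not path.exists():
        logging.warning("No model at %s - creating MOCK model (NOT for production)", path)
        return {"is_mock": True}  # stand-in for the hardcoded mock model
    with path.open("rb") as f:
        data = pickle.load(f)
    if data.get("is_mock", False):
        logging.warning("MOCK model active")
    return data
```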

## 5. Flag Conditions & Effects

### --no-llm-fallback

- Location: src/cli.py:46, src/classification/adaptive_classifier.py:152-161
- Effect: When ML confidence < threshold, accept the ML result anyway instead of calling the LLM
- Use case: Test pure ML performance, avoid LLM costs
- Code path:

```python
if self.disable_llm_fallback:
    # Just return ML result without LLM fallback
    return ClassificationResult(needs_review=False)
```

### --limit N

- Location: src/cli.py:38
- Effect: Limits the number of emails fetched from the source
- Calibration trigger: If N < 1000, forces LLM-only mode (no ML training)
- Code path:

```python
if total_emails < 1000:
    ml_classifier.is_mock = True  # Skip ML, use LLM only
```

### Model Path Override

- Location: src/classification/ml_classifier.py:43
- Default: src/models/pretrained/classifier.pkl
- Calibration saves to: src/models/calibrated/classifier.pkl
- Problem: Calibration saves to a different location than the default load location
- Solution: Copy the calibrated model to the pretrained location OR pass the model_path parameter
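The copy workaround is two lines of stdlib; a sketch using the paths above:

```python
import shutil
from pathlib import Path

calibrated = Path("src/models/calibrated/classifier.pkl")
pretrained = Path("src/models/pretrained/classifier.pkl")

# Promote the freshly calibrated model to the default load location
pretrained.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(calibrated, pretrained)
```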

## 6. Timing Breakdown (10,000 emails)

| Phase | Operation | Time per Email | Total Time (10k) | LLM Required? |
|-------|-----------|----------------|------------------|---------------|
| Calibration (if model doesn't exist) | Stratified sampling (300 emails) | - | ~1 second | No |
| | LLM category discovery (6 batches) | ~0.4 sec/email | ~2 minutes | YES |
| | LLM consolidation | - | ~5 seconds | YES |
| | LLM labeling (300 samples) | ~3 sec/email | ~15 minutes | YES |
| | Feature extraction (300 samples) | ~0.02 sec/email | ~6 seconds | No (embeddings) |
| | Model training (LightGBM) | - | ~5 seconds | No |
| | **CALIBRATION TOTAL** | | **~17-20 minutes** | **YES** |
| Classification (with model) | Hard rule matching | ~0.001 sec | ~10 seconds (all 10k) | No |
| | Embedding generation | ~0.02 sec | ~200 seconds (all 10k) | No (Ollama embed) |
| | ML prediction | ~0.003 sec | ~30 seconds (all 10k) | No |
| | LLM fallback (21% of emails) | ~4 sec/email | ~140 minutes (2100 emails) | YES |
| | Saving results | - | ~1 second | No |
| | **CLASSIFICATION TOTAL (with LLM fallback)** | | **~2.5 hours** | **YES (21%)** |
| | **CLASSIFICATION TOTAL (--no-llm-fallback)** | | **~4 minutes** | **No** |
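The two classification totals fall straight out of the per-item costs; a quick back-of-envelope check using only the figures in the table:

```python
# With LLM fallback: dominated by ~2,100 low-confidence emails at ~4 s each
llm_fallback = 2100 * 4.0       # 8,400 s  ~ 140 minutes
embeddings   = 10_000 * 0.02    # 200 s
ml_predict   = 10_000 * 0.003   # 30 s
rules        = 10_000 * 0.001   # 10 s

with_llm    = llm_fallback + embeddings + ml_predict + rules  # ~8,640 s ~ 2.4 hours
without_llm = embeddings + ml_predict + rules                 # ~240 s   ~ 4 minutes
```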

## 7. Why LLM Still Loads

```mermaid
flowchart TD
    Start([CLI startup]) --> Always1[ALWAYS: Load LLM provider<br>src/cli.py:98-117]
    Always1 --> Reason1[Reason: Needed for calibration<br>if model doesn't exist]
    Reason1 --> Check{Model exists?}
    Check -->|No| NeedLLM1[LLM required for calibration<br>Category discovery<br>Sample labeling]
    Check -->|Yes| SkipCal[Skip calibration]
    SkipCal --> ClassStart[Start classification]
    NeedLLM1 --> DoCalibration[Run calibration<br>Uses LLM]
    DoCalibration --> ClassStart
    ClassStart --> Always2[ALWAYS: LLM provider is available<br>llm.is_available = True]
    Always2 --> EmailLoop[For each email...]
    EmailLoop --> LowConf{Low confidence?}
    LowConf -->|No| NoLLM[No LLM call]
    LowConf -->|Yes| FlagCheck{--no-llm-fallback?}
    FlagCheck -->|Yes| NoLLMCall[No LLM call<br>Accept ML result]
    FlagCheck -->|No| LLMAvail{llm.is_available?}
    LLMAvail -->|Yes| CallLLM[LLM called<br>src/cli.py:227-228]
    LLMAvail -->|No| NoLLMCall
    NoLLM --> End([Next email])
    NoLLMCall --> End
    CallLLM --> End
    style Always1 fill:#ffd93d
    style Always2 fill:#ffd93d
    style CallLLM fill:#ff6b6b
    style NoLLMCall fill:#4ec9b0
```

**Why the LLM provider is always initialized:**

- Lines 98-117 (src/cli.py): the LLM provider is created before checking whether a model exists
- Reason: the LLM must be ready in case calibration is required
- Result: even with --no-llm-fallback, the LLM provider loads (but won't be called for classification)

## 8. Command Scenarios

| Command | Model Exists? | Calibration Runs? | LLM Used for Classification? | Total Time (10k) |
|---------|---------------|-------------------|------------------------------|------------------|
| `python -m src.cli run --source enron --limit 10000` | No | YES (~20 min) | YES (~2.5 hours) | ~2 hours 50 min |
| `python -m src.cli run --source enron --limit 10000` | Yes | No | YES (~2.5 hours) | ~2.5 hours |
| `python -m src.cli run --source enron --limit 10000 --no-llm-fallback` | No | YES (~20 min) | NO | ~24 minutes |
| `python -m src.cli run --source enron --limit 10000 --no-llm-fallback` | Yes | No | NO | ~4 minutes |
| `python -m src.cli run --source enron --limit 500` | Any | No (too few emails) | YES (100% LLM-only) | ~35 minutes |

## 9. Current System State

### Model Status

- src/models/calibrated/classifier.pkl - 1.8MB, trained at 02:54, 10 categories
- src/models/pretrained/classifier.pkl - Copy of calibrated model (created manually)

### Threshold Configuration

- config/default_config.yaml: default_threshold = 0.55
- config/categories.yaml: All category thresholds = 0.55
- Effect: ML must be ≥55% confident to skip the LLM

### Last Run Results (10k emails)

- Rules: 59 emails (0.6%)
- ML: 7,842 emails (78.4%)
- LLM fallback: 2,099 emails (21%)
- Accuracy estimate: 92.7%

## 10. To Run an ML-Only Test (No LLM Calls During Classification)

**Requirements:**

1. Model must exist at src/models/pretrained/classifier.pkl ✓ (done)
2. Use the --no-llm-fallback flag
3. Ensure sufficient emails (≥1000) to avoid LLM-only mode

**Command:**

```bash
python -m src.cli run --source enron --limit 10000 --output ml_only_10k/ --no-llm-fallback
```

**Expected Results:**

- Calibration: Skipped (model exists)
- LLM calls during classification: 0
- Total time: ~4 minutes
- ML acceptance rate: 100% (all emails classified by ML, even low confidence)
diff --git a/docs/VERIFY_CATEGORIES_FEATURE.html b/docs/VERIFY_CATEGORIES_FEATURE.html
deleted file mode 100644
index ad46d0e..0000000
--- a/docs/VERIFY_CATEGORIES_FEATURE.html
+++ /dev/null
@@ -1,357 +0,0 @@
# --verify-categories Feature

**✅ IMPLEMENTED AND READY TO USE**

- Feature: Single LLM call to verify that model categories fit a new mailbox
- Cost: +20 seconds, 1 LLM call
- Value: Confidence check before bulk ML classification

## Usage

```bash
# Basic usage (with verification):
python -m src.cli run \
    --source enron \
    --limit 10000 \
    --output verified_test/ \
    --no-llm-fallback \
    --verify-categories

# Custom verification sample size:
python -m src.cli run \
    --source enron \
    --limit 10000 \
    --output verified_test/ \
    --no-llm-fallback \
    --verify-categories \
    --verify-sample 30

# Without verification (fastest):
python -m src.cli run \
    --source enron \
    --limit 10000 \
    --output fast_test/ \
    --no-llm-fallback
```

## How It Works

```mermaid
flowchart TD
    Start([Run with --verify-categories]) --> LoadModel[Load trained model<br>Categories: Updates, Work,<br>Meetings, etc.]
    LoadModel --> FetchEmails[Fetch all emails<br>10,000 total]
    FetchEmails --> CheckFlag{--verify-categories?}
    CheckFlag -->|No| SkipVerify[Skip verification<br>Proceed to classification]
    CheckFlag -->|Yes| Sample[Sample random emails<br>Default: 20 emails]
    Sample --> BuildPrompt[Build verification prompt<br>Show model categories<br>Show sample emails]
    BuildPrompt --> LLMCall[Single LLM call<br>~20 seconds<br>Task: Rate category fit]
    LLMCall --> ParseResponse[Parse JSON response<br>Extract verdict + confidence]
    ParseResponse --> Verdict{Verdict?}
    Verdict -->|GOOD_MATCH<br>80%+ fit| LogGood[Log: Categories appropriate<br>Confidence: 0.8-1.0]
    Verdict -->|FAIR_MATCH<br>60-80% fit| LogFair[Log: Categories acceptable<br>Confidence: 0.6-0.8]
    Verdict -->|POOR_MATCH<br><60% fit| LogPoor[Log WARNING<br>Show suggested categories<br>Recommend calibration<br>Confidence: 0.0-0.6]
    LogGood --> Proceed[Proceed with ML classification]
    LogFair --> Proceed
    LogPoor --> Proceed
    SkipVerify --> Proceed
    Proceed --> ClassifyAll[Classify all 10,000 emails<br>Pure ML, no LLM fallback<br>~4 minutes]
    ClassifyAll --> Done[Results saved]
    style LLMCall fill:#ffd93d
    style LogGood fill:#4ec9b0
    style LogPoor fill:#ff6b6b
    style ClassifyAll fill:#4ec9b0
```
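The verdict bands in the diagram map cleanly onto the `fit_percentage` field the LLM returns; a tiny hypothetical helper, not the project's actual code:

```python
def verdict_from_fit(fit_percentage: int) -> str:
    """Map the LLM's fit percentage to a verdict band (bands from the diagram)."""
    if fit_percentage >= 80:
        return "GOOD_MATCH"
    if fit_percentage >= 60:
        return "FAIR_MATCH"
    return "POOR_MATCH"
```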

## Example Outputs

### Scenario 1: GOOD_MATCH (Enron → Enron)

```
================================================================================
VERIFYING MODEL CATEGORIES
================================================================================
Verifying model categories against 10000 emails
Model categories (11): Updates, Work, Meetings, External, Financial, Test, Administrative, Operational, Technical, Urgent, Requests
Sampled 20 emails for verification
Calling LLM for category verification...
Verification complete: GOOD_MATCH (0.85)
Reasoning: The sample emails fit well into the trained categories. Most are work-related correspondence, meetings, and operational updates which align with the model.

Verification: GOOD_MATCH
Confidence: 85%
Model categories look appropriate for this mailbox
================================================================================

Starting classification...
```

### Scenario 2: POOR_MATCH (Enron → Personal Gmail)

```
================================================================================
VERIFYING MODEL CATEGORIES
================================================================================
Verifying model categories against 10000 emails
Model categories (11): Updates, Work, Meetings, External, Financial, Test, Administrative, Operational, Technical, Urgent, Requests
Sampled 20 emails for verification
Calling LLM for category verification...
Verification complete: POOR_MATCH (0.45)
Reasoning: Many sample emails are shopping confirmations, social media notifications, and personal correspondence which don't fit the business-focused categories well.

Verification: POOR_MATCH
Confidence: 45%
================================================================================
WARNING: Model categories may not fit this mailbox well
Suggested categories: ['Shopping', 'Social', 'Travel', 'Newsletters', 'Personal']
Consider running full calibration for better accuracy
Proceeding with existing model anyway...
================================================================================

Starting classification...
```

## LLM Prompt Structure

```
You are evaluating whether pre-trained email categories fit a new mailbox.

TRAINED MODEL CATEGORIES (11 categories):
  - Updates
  - Work
  - Meetings
  - External
  - Financial
  - Test
  - Administrative
  - Operational
  - Technical
  - Urgent
  - Requests

SAMPLE EMAILS FROM NEW MAILBOX (20 total, showing first 20):
1. From: phillip.allen@enron.com
   Subject: Re: AEC Volumes at OPAL
   Preview: Here are the volumes for today...

2. From: notifications@amazon.com
   Subject: Your order has shipped
   Preview: Your Amazon.com order #123-4567890...

[... 18 more emails ...]

TASK:
Evaluate if the trained categories are appropriate for this mailbox.

Consider:
1. Do the sample emails naturally fit into the trained categories?
2. Are there obvious email types that don't match any category?
3. Are the category names semantically appropriate?
4. Would a user find these categories helpful for THIS mailbox?

Respond with JSON:
{
  "verdict": "GOOD_MATCH" | "FAIR_MATCH" | "POOR_MATCH",
  "confidence": 0.0-1.0,
  "reasoning": "brief explanation",
  "fit_percentage": 0-100,
  "suggested_categories": ["cat1", "cat2", ...],
  "category_mapping": {"old_name": "better_name", ...}
}
```
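Replies in this shape are best consumed defensively, since a single malformed response shouldn't abort the run. A minimal sketch, assuming the JSON schema above (field names come from the prompt; the helper itself is hypothetical):

```python
import json

def parse_verification(raw: str) -> dict:
    """Parse the verification reply, falling back to a neutral verdict."""
    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        return {"verdict": "FAIR_MATCH", "confidence": 0.5,
                "reasoning": "unparseable LLM response"}
    data.setdefault("suggested_categories", [])
    return data
```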

## Configuration

| Flag | Type | Default | Description |
|------|------|---------|-------------|
| `--verify-categories` | Flag | False | Enable category verification |
| `--verify-sample` | Integer | 20 | Number of emails to sample |
| `--no-llm-fallback` | Flag | False | Disable LLM fallback during classification |

## When Verification Runs

- ✅ Only if the --verify-categories flag is set
- ✅ Only if a trained model exists (not mock)
- ✅ After emails are fetched, before calibration/classification
- ❌ Skipped if using the mock model
- ❌ Skipped if the model doesn't exist (calibration will run anyway)

## Timing Impact

| Configuration | Time (10k emails) | LLM Calls |
|---------------|-------------------|-----------|
| ML-only (no flags) | ~4 minutes | 0 |
| ML-only + --verify-categories | ~4.3 minutes | 1 (verification) |
| Full calibration (no model) | ~25 minutes | ~500 |
| ML + LLM fallback (21%) | ~2.5 hours | ~2100 |

## Decision Tree

```mermaid
flowchart TD
    Start([Need to classify emails]) --> HaveModel{Trained model<br>exists?}
    HaveModel -->|No| MustCalibrate[Must run calibration<br>~20 minutes<br>~500 LLM calls]
    HaveModel -->|Yes| SameDomain{Same domain as<br>training data?}
    SameDomain -->|Yes, confident| FastML[Pure ML<br>4 minutes<br>0 LLM calls]
    SameDomain -->|Unsure| VerifyML[ML + Verification<br>4.3 minutes<br>1 LLM call]
    SameDomain -->|No, different| Options{Accuracy needs?}
    Options -->|High accuracy required| MustCalibrate
    Options -->|Speed more important| VerifyML
    Options -->|Experimental| FastML
    MustCalibrate --> Done[Classification complete]
    FastML --> Done
    VerifyML --> Done
    style FastML fill:#4ec9b0
    style VerifyML fill:#ffd93d
    style MustCalibrate fill:#ff6b6b
```

## Quick Start

```bash
# Test with verification on same domain (Enron → Enron):
python -m src.cli run \
    --source enron \
    --limit 1000 \
    --output verify_test_same/ \
    --no-llm-fallback \
    --verify-categories
# Expected: GOOD_MATCH (0.80-0.95)
# Time: ~30 seconds

# Test without verification for speed comparison:
python -m src.cli run \
    --source enron \
    --limit 1000 \
    --output no_verify_test/ \
    --no-llm-fallback
# Expected: Same accuracy, 20 seconds faster
# Time: ~10 seconds
```
diff --git a/docs/WORKFLOW_DIAGRAM.md b/docs/WORKFLOW_DIAGRAM.md
deleted file mode 100644
index cf073be..0000000
--- a/docs/WORKFLOW_DIAGRAM.md
+++ /dev/null
@@ -1,255 +0,0 @@

# Email Sorter - Complete Workflow Diagram

## Full End-to-End Pipeline with LLM Calls

```mermaid
graph TB
    Start([📧 Start: Enron Maildir<br>100,000 emails]) --> Parse[EnronParser<br>Stratified Sampling]
    Parse --> CalibCheck{Need<br>Calibration?}
    CalibCheck -->|Yes: No Model| CalibStart[🎯 CALIBRATION PHASE]
    CalibCheck -->|No: Model Exists| ClassifyStart[📊 CLASSIFICATION PHASE]

    %% CALIBRATION PHASE
    CalibStart --> Sample[Sample 100 Emails<br>Stratified by user/folder]
    Sample --> Split[Split: 50 train / 50 validation]
    Split --> LLMBatch[📤 LLM CALL 1-5<br>Batch Discovery<br>5 batches × 20 emails]
    LLMBatch -->|qwen3:8b-q4_K_M| Discover[Category Discovery<br>~15 raw categories]
    Discover --> Consolidate[📤 LLM CALL 6<br>Consolidation<br>Merge similar categories]
    Consolidate -->|qwen3:8b-q4_K_M| CacheSnap[Category Cache Snap<br>Semantic matching<br>10 final categories]
    CacheSnap --> ExtractTrain[Extract Features<br>50 training emails<br>Batch embeddings]
    ExtractTrain --> Embed1[📤 EMBEDDING CALLS<br>Ollama all-minilm:l6-v2<br>384-dim vectors]
    Embed1 --> TrainModel[Train LightGBM<br>200 boosting rounds<br>22 total categories]
    TrainModel --> SaveModel[💾 Save Model<br>classifier.pkl 1.1MB]
    SaveModel --> ClassifyStart

    %% CLASSIFICATION PHASE
    ClassifyStart --> LoadModel[Load Model<br>classifier.pkl]
    LoadModel --> FetchAll[Fetch All Emails<br>100,000 emails]
    FetchAll --> BatchProcess[Process in Batches<br>5,000 emails per batch<br>20 batches total]
    BatchProcess --> ExtractFeatures[Extract Features<br>Batch size: 512<br>Batched embeddings]
    ExtractFeatures --> Embed2[📤 EMBEDDING CALLS<br>Ollama all-minilm:l6-v2<br>~200 batched calls]
    Embed2 --> MLInference[LightGBM Inference<br>Predict categories<br>~2ms per email]
    MLInference --> Results[💾 Save Results<br>results.json 19MB<br>summary.json 1.5KB<br>classifications.csv 8.6MB]
    Results --> ValidationStart[🔍 VALIDATION PHASE]

    %% VALIDATION PHASE
    ValidationStart --> SelectSamples[Select Samples<br>50 low-conf + 25 random]
    SelectSamples --> LoadEmails[Load Full Email Content<br>Subject + Body + Metadata]
    LoadEmails --> LLMEval[📤 LLM CALLS 7-81<br>Individual Evaluation<br>75 total assessments]
    LLMEval -->|qwen3:8b-q4_K_M<br><no_think>| EvalResults[Collect Verdicts<br>YES/PARTIAL/NO<br>+ Reasoning]
    EvalResults --> LLMSummary[📤 LLM CALL 82<br>Final Summary<br>Aggregate findings]
    LLMSummary -->|qwen3:8b-q4_K_M| FinalReport[📊 Final Report<br>Accuracy metrics<br>Category quality<br>Recommendations]
    FinalReport --> End([✅ Complete<br>100k classified<br>+ validated])

    %% OPTIONAL FINE-TUNING LOOP
    FinalReport -.->|If corrections needed| FineTune[🔄 FINE-TUNING<br>Collect LLM corrections<br>Continue training]
    FineTune -.-> ClassifyStart

    style Start fill:#e1f5e1
    style End fill:#e1f5e1
    style LLMBatch fill:#fff4e6
    style Consolidate fill:#fff4e6
    style Embed1 fill:#e6f3ff
    style Embed2 fill:#e6f3ff
    style LLMEval fill:#fff4e6
    style LLMSummary fill:#fff4e6
    style SaveModel fill:#ffe6f0
    style Results fill:#ffe6f0
    style FinalReport fill:#ffe6f0
```

---

## Pipeline Stages Breakdown

### STAGE 1: CALIBRATION (1 minute)
**Input:** 100 emails
**LLM Calls:** 6 calls
- 5 batch discovery calls (20 emails each)
- 1 consolidation call
**Embedding Calls:** ~50 calls (one per training email)
**Output:**
- 10 discovered categories
- Trained LightGBM model (1.1MB)
- Category cache

### STAGE 2: CLASSIFICATION (3.4 minutes)
**Input:** 100,000 emails
**LLM Calls:** 0 (pure ML inference)
**Embedding Calls:** ~200 batched calls (512 emails per batch)
**Output:**
- 100,000 classifications
- Confidence scores
- Results in JSON/CSV

### STAGE 3: VALIDATION (variable, ~5-10 minutes)
**Input:** 75 sample emails (50 low-conf + 25 random)
**LLM Calls:** 76 calls
- 75 individual evaluation calls
- 1 final summary call
**Output:**
- Quality assessment (YES/PARTIAL/NO)
- Accuracy metrics
- Recommendations

---

## LLM Call Summary

| Call # | Purpose | Model | Input | Output | Time |
|--------|---------|-------|-------|--------|------|
| 1-5 | Batch Discovery | qwen3:8b | 20 emails each | Categories | ~5-6s each |
| 6 | Consolidation | qwen3:8b | 15 categories | 10 merged | ~3s |
| 7-81 | Evaluation | qwen3:8b | 1 email + category | Verdict | ~2s each |
| 82 | Summary | qwen3:8b | 75 evaluations | Final report | ~5s |

**Total LLM Calls:** 82
**Total LLM Time:** ~3-4 minutes
**Embedding Calls:** ~250 (batched)
**Embedding Time:** ~30 seconds (batched)

---

## Performance Metrics

### Calibration Phase
- **Time:** 60 seconds
- **Samples:** 100 emails (50 for training)
- **Categories Discovered:** 10
- **Model Size:** 1.1MB
- **Accuracy on training:** 95%+

### Classification Phase
- **Time:** 202 seconds (3.4 minutes)
- **Emails:** 100,000
- **Speed:** 495 emails/second
- **Per Email:** 2ms total processing
- **Batch Size:** 512 (optimal)
- **GPU Utilization:** High (batched embeddings)

### Validation Phase
- **Time:** ~10 minutes (75 LLM calls)
- **Samples:** 75 emails
- **Per Sample:** ~8 seconds
- **Accuracy Found:** Model already accurate (0 corrections)

---

## Data Flow Details

### Email Processing Pipeline
```
Email File → Parse → Features → Embedding → Model  → Category
 (text)     (dict)   (struct)   (384-dim)  (22-cat)  (label)
```

### Feature Extraction
```
Email Content
├─ Subject (text)
├─ Body (text)
├─ Sender (email address)
├─ Date (timestamp)
├─ Attachments (boolean + count)
└─ Patterns (regex matches)
    ↓
Structured Text
    ↓
Ollama Embedding (all-minilm:l6-v2)
    ↓
384-dimensional vector
```

### LightGBM Training
```
Features (384-dim) + Labels (10 categories)
    ↓
Training: 200 boosting rounds
    ↓
Model: 22 categories total (10 discovered + 12 hardcoded)
    ↓
Output: classifier.pkl (1.1MB)
```

---

## Category Distribution (100k Results)

```mermaid
pie title Category Distribution
    "Work Communication" : 89807
    "Financial" : 6534
    "Forwarded" : 2457
    "Technical Analysis" : 1129
    "Other" : 73
```

---

## Confidence Distribution (100k Results)

```mermaid
pie title Confidence Levels
    "High (≥0.7)" : 74777
    "Medium (0.5-0.7)" : 17381
    "Low (<0.5)" : 7842
```

---

## System Architecture

```mermaid
graph LR
    A[Email Source<br>Gmail/IMAP/Enron] --> B[Email Provider]
    B --> C[Feature Extractor]
    C --> D[Ollama<br>Embeddings]
    C --> E[Pattern Detector]
    D --> F[LightGBM<br>Classifier]
    E --> F
    F --> G[Results<br>JSON/CSV]
    F --> H[Sync Engine<br>Labels/Keywords]

    I[LLM<br>qwen3:8b] -.->|Calibration| J[Category Discovery]
    J -.-> F
    I -.->|Validation| K[Quality Check]
    K -.-> G

    style D fill:#e6f3ff
    style I fill:#fff4e6
    style F fill:#f0e6ff
    style G fill:#ffe6f0
```

---

## Next: Integrated End-to-End Script

Building a comprehensive validation script with:
1. 50 low-confidence samples
2. 25 random samples
3. Final LLM summary call
4. Complete pipeline orchestration
diff --git a/docs/chat-gippity-research.md b/docs/chat-gippity-research.md
deleted file mode 100644
index 7422725..0000000
--- a/docs/chat-gippity-research.md
+++ /dev/null
@@ -1,126 +0,0 @@

Yeah — your instinct is solid here, Brett.
For the kind of **multi-category, fast, local, high-volume** classification system you're building, training your own model on your own labelled data will almost certainly give **better and more controllable results** than trying to stretch an off-the-shelf model.

Let's break this down clearly — especially what **sentence-transformers** brings to the table and why it's a good fit for your hybrid architecture.

---

## 🧠 What Sentence Transformers Actually Do

A **sentence transformer** is a model that converts a piece of text (e.g. subject line, short body snippet, sender info) into a **dense vector embedding** — basically a fixed-length numerical representation that captures its semantic meaning.

Once you have embeddings, classification becomes a **simple, fast** downstream problem:

* Instead of training a giant deep network,
* You train a **small, lightweight model** (like Logistic Regression, XGBoost, or a simple neural head) on top of those embeddings.

---

## 🚀 Why This Is a Great Fit for Your Email Sorter

| Sentence Transformers | Why it matters for you |
|-----------------------|------------------------|
| **Pretrained models** already "understand" general language | So you don't need massive datasets to get good results. |
| **Fast inference** | Embedding generation can run on CPU or GPU and is easy to parallelise. |
| **Stable + deterministic** | Embeddings are consistent across runs (unlike LLM zero-shot answers). |
| **Lightweight training** | You can train a classifier on top with a few thousand labelled samples. |
| **Supports multi-class** easily | Perfect for your 12 category types. |
| **Re-usable** | One universal model for all inboxes; just retrain the top layer occasionally. |

---

## 🏗️ How It Would Work in Your Pipeline

### 1. **Use the LLM once** to label your initial batch (e.g. 1.5k–5k emails).

This gives you your **bootstrapped labelled dataset**.

### 2. **Generate embeddings**

* Take **subject**, **sender domain**, and optionally a short **body snippet**.
* Pass through sentence transformer → get 768-dimensional vector (typical).
* Save those embeddings alongside labels.

### 3. **Train a classifier** on top

A lightweight model like:

* **Logistic Regression** (fastest),
* **XGBoost / LightGBM** (slightly heavier, more accuracy),
* or even a shallow **MLP** if you want.

This becomes your **universal email classifier**.

### 4. **Run in production**

* New email comes in → embed text → run classifier → get category + confidence.
* If below threshold → send to LLM for re-classification.

This gives you **LLM semantic power** at training time, and **ML speed** at runtime.

---

## 🧪 Practical Setup (Minimal)

```python
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression

# 1. Load a pre-trained embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")  # fast, lightweight

# 2. Convert texts to embeddings
texts = [f"{subject} {sender}" for subject, sender in your_email_data]
embeddings = model.encode(texts, batch_size=64, show_progress_bar=True)

# 3. Train a lightweight classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(embeddings, labels)  # labels come from your LLM bootstrap

# 4. Predict
new_embedding = model.encode(["Subject from new email"])
pred = clf.predict(new_embedding)
conf = clf.predict_proba(new_embedding)
```

* `all-MiniLM-L6-v2` is a fantastic starting model — small, fast, and surprisingly accurate.
* You can fine-tune the sentence transformer later if you want **extra precision**.

---

## 🧠 Why It's Easier Than Full Fine-Tuning

You *don't* need to train the transformer itself (at least not at the start).
You're just training the **top layer** (classifier). That means:

* Training takes minutes, not hours.
* You don't need huge GPUs.
* You can refresh or retrain easily with new data.

Later on, if you want to **fine-tune the transformer itself** (so it "understands emails" more deeply), that's an optional next step.

---

## ⚡ Typical Results People See

* With 2–5k labelled samples, sentence transformer embeddings + logistic regression can hit **85–95 % accuracy** on email category tasks.
* Inference time is **<5 ms per email** on CPU.
* Works well for both generic and user-specific inboxes.

---

## 🪜 Suggested Path for You

1. Use your **LLM pass** to generate labels on your first big inbox.
2. Generate embeddings with a pretrained MiniLM.
3. Train a logistic regression or XGBoost model.
4. Run it on the next inbox → see how it performs.
5. (Optional) Fine-tune the transformer if you want to push performance higher.

---

👉 In short:
Yes — sentence transformers are **perfect** for this.
They give you **semantic power without LLM overhead**, are **easy to train**, and will make your hybrid classifier **extremely fast and accurate** after that first run.

If you want, I can give you a **tiny starter training script** (30–40 lines) that does the embedding + classifier training from your first LLM-labelled dataset. Would you like that?
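The snippet in those research notes stops at prediction; step 4's production routing is one small function on top. A sketch, reusing the `model`/`clf` objects from that snippet, with an assumed 0.55 threshold (the pipeline's default) and a hypothetical `llm_classify` fallback:

```python
import numpy as np

def classify_with_fallback(text: str, threshold: float = 0.55):
    """Embed, predict, and fall back to the LLM below the confidence threshold."""
    emb = model.encode([text])
    proba = clf.predict_proba(emb)[0]
    best = int(np.argmax(proba))
    if proba[best] >= threshold:
        return clf.classes_[best], float(proba[best]), "ml"
    return llm_classify(text), float(proba[best]), "llm"  # llm_classify: your LLM call
```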
diff --git a/src/cli.py b/src/cli.py index 1ce6980..536324c 100644 --- a/src/cli.py +++ b/src/cli.py @@ -53,6 +53,8 @@ def cli(): help='Verify model categories fit new mailbox (single LLM call)') @click.option('--verify-sample', type=int, default=20, help='Number of emails to sample for category verification') +@click.option('--force-ml', is_flag=True, + help='Force use of existing ML model regardless of dataset size') def run( source: str, credentials: Optional[str], @@ -65,7 +67,8 @@ def run( verbose: bool, no_llm_fallback: bool, verify_categories: bool, - verify_sample: int + verify_sample: int, + force_ml: bool ): """Run email sorter pipeline.""" @@ -198,10 +201,14 @@ def run( total_emails = len(emails) # Skip ML for small datasets (<1000 emails) - use LLM only - if total_emails < 1000: + # Unless --force-ml is set and we have an existing model + if total_emails < 1000 and not force_ml: logger.warning(f"Only {total_emails} emails - too few for ML training") logger.warning("Using LLM-only classification (no ML model)") + logger.warning("Use --force-ml to use existing model anyway") ml_classifier.is_mock = True + elif force_ml and ml_classifier.model: + logger.info(f"--force-ml: Using existing ML model for {total_emails} emails") # Check if we need calibration (no good ML model) if ml_classifier.is_mock or not ml_classifier.model: @@ -294,7 +301,20 @@ def run( logger.info("Exporting results") Path(output).mkdir(parents=True, exist_ok=True) + # Build email lookup for metadata enrichment + email_lookup = {email.id: email for email in emails} + import json + from datetime import datetime as dt + + def serialize_date(date_obj): + """Serialize date to ISO format string.""" + if date_obj is None: + return None + if isinstance(date_obj, dt): + return date_obj.isoformat() + return str(date_obj) + results_data = { 'metadata': { 'total_emails': len(emails), @@ -304,16 +324,24 @@ def run( 'ml_classified': adaptive_classifier.get_stats().ml_classified, 'llm_classified': adaptive_classifier.get_stats().llm_classified, 'needs_review': adaptive_classifier.get_stats().needs_review, - } + }, + 'generated_at': dt.now().isoformat(), + 'source': source, + 'source_path': directory if source == 'local' else None, }, 'classifications': [ { 'email_id': r.email_id, + 'subject': email_lookup.get(r.email_id, emails[i]).subject if r.email_id in email_lookup or i < len(emails) else '', + 'sender': email_lookup.get(r.email_id, emails[i]).sender if r.email_id in email_lookup or i < len(emails) else '', + 'sender_name': email_lookup.get(r.email_id, emails[i]).sender_name if r.email_id in email_lookup or i < len(emails) else None, + 'date': serialize_date(email_lookup.get(r.email_id, emails[i]).date if r.email_id in email_lookup or i < len(emails) else None), + 'has_attachments': email_lookup.get(r.email_id, emails[i]).has_attachments if r.email_id in email_lookup or i < len(emails) else False, 'category': r.category, 'confidence': r.confidence, 'method': r.method } - for r in results + for i, r in enumerate(results) ] } diff --git a/src/llm/openai_compat.py b/src/llm/openai_compat.py index 69faa74..2275d3c 100644 --- a/src/llm/openai_compat.py +++ b/src/llm/openai_compat.py @@ -47,14 +47,12 @@ class OpenAIProvider(BaseLLMProvider): try: from openai import OpenAI - if not self.api_key: - self.logger.error("OpenAI API key not configured") - self.logger.error("Set OPENAI_API_KEY environment variable or pass api_key parameter") - self._available = False - return + # For local vLLM/OpenAI-compatible servers, API key may not 
be required + # Use a placeholder if not set + api_key = self.api_key or "not-needed" self.client = OpenAI( - api_key=self.api_key, + api_key=api_key, base_url=self.base_url if self.base_url != "https://api.openai.com/v1" else None, timeout=self.timeout ) @@ -121,7 +119,7 @@ class OpenAIProvider(BaseLLMProvider): def test_connection(self) -> bool: """Test if OpenAI API is accessible.""" - if not self.client or not self.api_key: + if not self.client: self.logger.warning("OpenAI client not initialized") return False diff --git a/tools/batch_llm_classifier.py b/tools/batch_llm_classifier.py new file mode 100755 index 0000000..a9cceb3 --- /dev/null +++ b/tools/batch_llm_classifier.py @@ -0,0 +1,364 @@ +#!/usr/bin/env python3 +""" +Standalone vLLM Batch Email Classifier + +PREREQUISITE: vLLM server must be running at configured endpoint + +This is a SEPARATE tool from the main ML classification pipeline. +Use this for: +- One-off batch questions ("find all emails about project X") +- Custom classification criteria not in trained model +- Exploratory analysis with flexible prompts + +Use RAG instead for: +- Searching across large email corpus +- Finding specific topics/keywords +- Building knowledge from email content +""" + +import time +import asyncio +import logging +import sys +from pathlib import Path +from typing import List, Dict, Any, Optional + +import httpx +import click + + +# Server configuration +VLLM_CONFIG = { + 'base_url': 'https://rtx3090.bobai.com.au/v1', + 'api_key': 'rtx3090_foxadmin_10_8034ecb47841f45ba1d5f3f5d875c092', + 'model': 'qwen3-coder-30b', + 'batch_size': 4, # Tested optimal - 100% success, proper batch pooling + 'temperature': 0.1, + 'max_tokens': 500 +} + + +async def check_vllm_server(base_url: str, api_key: str, model: str) -> bool: + """Check if vLLM server is running and model is loaded.""" + try: + async with httpx.AsyncClient() as client: + response = await client.post( + f"{base_url}/chat/completions", + json={ + "model": model, + "messages": [{"role": "user", "content": "test"}], + "max_tokens": 5 + }, + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json" + }, + timeout=10.0 + ) + return response.status_code == 200 + except Exception as e: + print(f"ERROR: vLLM server check failed: {e}") + return False + + +async def classify_email_async( + client: httpx.AsyncClient, + email: Any, + prompt_template: str, + base_url: str, + api_key: str, + model: str, + temperature: float, + max_tokens: int +) -> Dict[str, Any]: + """Classify single email using async HTTP request.""" + + # No semaphore - proper batch pooling instead + try: + # Build prompt with email data + prompt = prompt_template.format( + subject=email.get('subject', 'N/A')[:100], + sender=email.get('sender', 'N/A')[:50], + body_snippet=email.get('body_snippet', '')[:500] + ) + response = await client.post( + f"{base_url}/chat/completions", + json={ + "model": model, + "messages": [{"role": "user", "content": prompt}], + "temperature": temperature, + "max_tokens": max_tokens + }, + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json" + }, + timeout=30.0 + ) + + if response.status_code == 200: + data = response.json() + content = data['choices'][0]['message']['content'] + + return { + 'email_id': email.get('id', 'unknown'), + 'subject': email.get('subject', 'N/A')[:60], + 'result': content.strip(), + 'success': True + } + + return { + 'email_id': email.get('id', 'unknown'), + 'subject': email.get('subject', 'N/A')[:60], + 'result': 
f'HTTP {response.status_code}', + 'success': False + } + + except Exception as e: + return { + 'email_id': email.get('id', 'unknown'), + 'subject': email.get('subject', 'N/A')[:60], + 'result': f'Error: {str(e)[:100]}', + 'success': False + } + + +async def classify_single_batch( + client: httpx.AsyncClient, + emails: List[Dict[str, Any]], + prompt_template: str, + config: Dict[str, Any] +) -> List[Dict[str, Any]]: + """Classify one batch of emails - send all at once, wait for completion.""" + + tasks = [ + classify_email_async( + client, email, prompt_template, + config['base_url'], config['api_key'], config['model'], + config['temperature'], config['max_tokens'] + ) + for email in emails + ] + + results = await asyncio.gather(*tasks) + return results + + +async def batch_classify_async( + emails: List[Dict[str, Any]], + prompt_template: str, + config: Dict[str, Any] +) -> List[Dict[str, Any]]: + """Classify emails using proper batch pooling.""" + + batch_size = config['batch_size'] + all_results = [] + + async with httpx.AsyncClient() as client: + # Process in batches - send batch, wait for all to complete, repeat + for batch_start in range(0, len(emails), batch_size): + batch_end = min(batch_start + batch_size, len(emails)) + batch_emails = emails[batch_start:batch_end] + + batch_results = await classify_single_batch( + client, batch_emails, prompt_template, config + ) + + all_results.extend(batch_results) + + return all_results + + +def load_emails_from_provider(provider_type: str, credentials: Optional[str], limit: int) -> List[Dict[str, Any]]: + """Load emails from configured provider.""" + + # Lazy import to avoid dependency issues + if provider_type == 'enron': + from src.email_providers.enron import EnronProvider + provider = EnronProvider(maildir_path=".") + provider.connect({}) + emails = provider.fetch_emails(limit=limit) + provider.disconnect() + + # Convert to dict format + return [ + { + 'id': e.id, + 'subject': e.subject, + 'sender': e.sender, + 'body_snippet': e.body_snippet + } + for e in emails + ] + + elif provider_type == 'gmail': + from src.email_providers.gmail import GmailProvider + if not credentials: + print("ERROR: Gmail requires --credentials path") + sys.exit(1) + provider = GmailProvider() + provider.connect({'credentials_path': credentials}) + emails = provider.fetch_emails(limit=limit) + provider.disconnect() + + return [ + { + 'id': e.id, + 'subject': e.subject, + 'sender': e.sender, + 'body_snippet': e.body_snippet + } + for e in emails + ] + + else: + print(f"ERROR: Unsupported provider: {provider_type}") + sys.exit(1) + + +@click.group() +def cli(): + """vLLM Batch Email Classifier - Ask custom questions across email batches.""" + pass + + +@cli.command() +@click.option('--source', type=click.Choice(['gmail', 'enron']), default='enron', + help='Email provider') +@click.option('--credentials', type=click.Path(exists=False), + help='Path to credentials file (for Gmail)') +@click.option('--limit', type=int, default=50, + help='Number of emails to process') +@click.option('--question', type=str, required=True, + help='Question to ask about each email') +@click.option('--output', type=click.Path(), default='batch_results.txt', + help='Output file for results') +def ask(source: str, credentials: Optional[str], limit: int, question: str, output: str): + """Ask a custom question about a batch of emails.""" + + print("=" * 80) + print("vLLM BATCH EMAIL CLASSIFIER") + print("=" * 80) + print(f"Question: {question}") + print(f"Source: {source}") + print(f"Batch 
size: {limit}") + print("=" * 80) + print() + + # Check vLLM server + print("Checking vLLM server...") + if not asyncio.run(check_vllm_server( + VLLM_CONFIG['base_url'], + VLLM_CONFIG['api_key'], + VLLM_CONFIG['model'] + )): + print() + print("ERROR: vLLM server not available or not responding") + print(f"Expected endpoint: {VLLM_CONFIG['base_url']}") + print(f"Expected model: {VLLM_CONFIG['model']}") + print() + print("PREREQUISITE: Start vLLM server before running this tool") + sys.exit(1) + + print(f"✓ vLLM server running ({VLLM_CONFIG['model']})") + print() + + # Load emails + print(f"Loading {limit} emails from {source}...") + emails = load_emails_from_provider(source, credentials, limit) + print(f"✓ Loaded {len(emails)} emails") + print() + + # Build prompt template (optimized for caching) + prompt_template = f"""You are analyzing emails to answer specific questions. + +INSTRUCTIONS: +- Read the email carefully +- Answer the question directly and concisely +- Provide reasoning if helpful +- If the email is not relevant, say "Not relevant" + +QUESTION: +{question} + +EMAIL TO ANALYZE: +Subject: {{subject}} +From: {{sender}} +Body: {{body_snippet}} + +ANSWER: +""" + + # Process batch + print(f"Processing {len(emails)} emails with {VLLM_CONFIG['max_concurrent']} concurrent requests...") + start_time = time.time() + + results = asyncio.run(batch_classify_async(emails, prompt_template, VLLM_CONFIG)) + + end_time = time.time() + total_time = end_time - start_time + + # Stats + successful = sum(1 for r in results if r['success']) + throughput = len(emails) / total_time + + print() + print("=" * 80) + print("RESULTS") + print("=" * 80) + print(f"Total emails: {len(emails)}") + print(f"Successful: {successful}") + print(f"Failed: {len(emails) - successful}") + print(f"Time: {total_time:.1f}s") + print(f"Throughput: {throughput:.2f} emails/sec") + print("=" * 80) + print() + + # Save results + with open(output, 'w') as f: + f.write(f"Question: {question}\n") + f.write(f"Processed: {len(emails)} emails in {total_time:.1f}s\n") + f.write("=" * 80 + "\n\n") + + for i, result in enumerate(results, 1): + f.write(f"{i}. {result['subject']}\n") + f.write(f" Email ID: {result['email_id']}\n") + f.write(f" Answer: {result['result']}\n") + f.write("\n") + + print(f"Results saved to: {output}") + print() + + # Show sample + print("SAMPLE RESULTS (first 5):") + for i, result in enumerate(results[:5], 1): + print(f"\n{i}. {result['subject']}") + print(f" {result['result'][:100]}...") + + +@cli.command() +def check(): + """Check if vLLM server is running and ready.""" + + print("Checking vLLM server...") + print(f"Endpoint: {VLLM_CONFIG['base_url']}") + print(f"Model: {VLLM_CONFIG['model']}") + print() + + if asyncio.run(check_vllm_server( + VLLM_CONFIG['base_url'], + VLLM_CONFIG['api_key'], + VLLM_CONFIG['model'] + )): + print("✓ vLLM server is running and ready") + print(f"✓ Max concurrent requests: {VLLM_CONFIG['max_concurrent']}") + print(f"✓ Estimated throughput: ~4.4 emails/sec") + else: + print("✗ vLLM server not available") + print() + print("Start vLLM server before using this tool") + sys.exit(1) + + +if __name__ == '__main__': + cli() diff --git a/tools/brett_gmail_analyzer.py b/tools/brett_gmail_analyzer.py new file mode 100644 index 0000000..652fcbd --- /dev/null +++ b/tools/brett_gmail_analyzer.py @@ -0,0 +1,391 @@ +#!/usr/bin/env python3 +""" +Brett Gmail Dataset Analyzer +============================ +CUSTOM script for analyzing the brett-gmail email dataset. 
+NOT portable to other datasets without modification. + +Usage: + python tools/brett_gmail_analyzer.py + +Output: + - Console report with comprehensive statistics + - data/brett_gmail_analysis.json with full analysis data +""" + +import json +import re +from collections import Counter, defaultdict +from datetime import datetime +from pathlib import Path + +# Add parent to path for imports +import sys +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from src.calibration.local_file_parser import LocalFileParser + + +# ============================================================================= +# CLASSIFICATION RULES - CUSTOM FOR BRETT'S GMAIL +# ============================================================================= + +def classify_email(email): + """ + Classify email into categories based on sender domain and subject patterns. + + Priority: Sender domain > Subject keywords + """ + sender = email.sender or "" + subject = email.subject or "" + domain = sender.split('@')[-1] if '@' in sender else sender + + # === HIGH-LEVEL CATEGORIES === + + # --- Art & Collectibles --- + if 'mutualart.com' in domain: + return ('Art & Collectibles', 'MutualArt Alerts') + + # --- Travel & Tourism --- + if 'tripadvisor.com' in domain: + return ('Travel & Tourism', 'Tripadvisor') + if 'booking.com' in domain: + return ('Travel & Tourism', 'Booking.com') + + # --- Entertainment & Streaming --- + if 'spotify.com' in domain: + if 'concert' in subject.lower() or 'live' in subject.lower(): + return ('Entertainment', 'Spotify Concerts') + return ('Entertainment', 'Spotify Promotions') + if 'youtube.com' in domain: + return ('Entertainment', 'YouTube') + if 'onlyfans.com' in domain: + return ('Entertainment', 'OnlyFans') + if 'ign.com' in domain: + return ('Entertainment', 'IGN Gaming') + + # --- Shopping & eCommerce --- + if 'ebay.com' in domain or 'reply.ebay' in domain: + return ('Shopping', 'eBay') + if 'aliexpress.com' in domain: + return ('Shopping', 'AliExpress') + if 'alibabacloud.com' in domain or 'alibaba-inc.com' in domain: + return ('Tech Services', 'Alibaba Cloud') + if '4wdsupacentre' in domain: + return ('Shopping', '4WD Supacentre') + if 'mikeblewitt' in domain or 'mbcoffscoast' in domain: + return ('Shopping', 'Mike Blewitt/MBC') + if 'auspost.com.au' in domain: + return ('Shopping', 'Australia Post') + if 'printfresh' in domain: + return ('Business', 'Timesheets') + + # --- AI & Tech Services --- + if 'anthropic.com' in domain or 'claude.com' in domain: + return ('AI Services', 'Anthropic/Claude') + if 'openai.com' in domain: + return ('AI Services', 'OpenAI') + if 'openrouter.ai' in domain: + return ('AI Services', 'OpenRouter') + if 'lambda' in domain: + return ('AI Services', 'Lambda Labs') + if 'x.ai' in domain: + return ('AI Services', 'xAI') + if 'perplexity.ai' in domain: + return ('AI Services', 'Perplexity') + if 'cursor.com' in domain: + return ('Developer Tools', 'Cursor') + + # --- Developer Tools --- + if 'ngrok.com' in domain: + return ('Developer Tools', 'ngrok') + if 'docker.com' in domain: + return ('Developer Tools', 'Docker') + + # --- Productivity Apps --- + if 'screencastify.com' in domain: + return ('Productivity', 'Screencastify') + if 'tango.us' in domain: + return ('Productivity', 'Tango') + if 'xplor.com' in domain or 'myxplor' in domain: + return ('Services', 'Xplor Childcare') + + # --- Google Services --- + if 'google.com' in domain or 'accounts.google.com' in domain: + if 'performance report' in subject.lower() or 'business profile' in subject.lower(): + 
+            return ('Google', 'Business Profile')
+        if 'security' in subject.lower() or 'sign-in' in subject.lower():
+            return ('Security', 'Google Security')
+        if 'firebase' in subject.lower() or 'firestore' in subject.lower():
+            return ('Developer Tools', 'Firebase')
+        if 'ads' in subject.lower():
+            return ('Google', 'Google Ads')
+        if 'analytics' in subject.lower():
+            return ('Google', 'Analytics')
+        if re.search(r'verification code|verify', subject, re.I):
+            return ('Security', 'Google Verification')
+        return ('Google', 'Other Google')
+
+    # --- Microsoft ---
+    if 'microsoft.com' in domain or 'outlook.com' in domain or 'hotmail.com' in domain:
+        if 'security' in subject.lower() or 'protection' in domain:
+            return ('Security', 'Microsoft Security')
+        return ('Personal', 'Microsoft/Outlook')
+
+    # --- Social Media ---
+    if 'reddit' in domain:
+        return ('Social', 'Reddit')
+
+    # --- Business/Work ---
+    if 'frontiertechstrategies' in domain:
+        return ('Business', 'Appointments')
+    if 'crsaustralia.gov.au' in domain:
+        return ('Business', 'Job Applications')
+    if 'v6send.net' in domain:
+        return ('Shopping', 'Automotive Dealers')
+
+    # === SUBJECT-BASED FALLBACK ===
+
+    if re.search(r'security alert|verification code|sign.?in|password|2fa', subject, re.I):
+        return ('Security', 'General Security')
+
+    if re.search(r'order.*ship|receipt|payment|invoice|purchase', subject, re.I):
+        return ('Transactions', 'Orders/Receipts')
+
+    if re.search(r'trial|subscription|billing|renew', subject, re.I):
+        return ('Billing', 'Subscriptions')
+
+    if re.search(r'terms of service|privacy policy|legal', subject, re.I):
+        return ('Legal', 'Policy Updates')
+
+    if re.search(r'welcome to|getting started', subject, re.I):
+        return ('Onboarding', 'Welcome Emails')
+
+    # --- Personal contacts ---
+    if 'gmail.com' in domain:
+        return ('Personal', 'Gmail Contacts')
+
+    return ('Uncategorized', 'Unknown')
+
+
+def extract_order_ids(emails):
+    """Extract order/transaction IDs from email subjects."""
+    order_patterns = [
+        (r'Order\s+(\d{10,})', 'AliExpress Order'),
+        (r'receipt.*(\d{4}-\d{4}-\d{4})', 'Receipt ID'),
+        (r'#(\d{4,})', 'Generic Order ID'),
+    ]
+
+    orders = []
+    for email in emails:
+        subject = email.subject or ""
+        for pattern, order_type in order_patterns:
+            match = re.search(pattern, subject, re.I)
+            if match:
+                orders.append({
+                    'id': match.group(1),
+                    'type': order_type,
+                    'subject': subject,
+                    'date': str(email.date) if email.date else None,
+                    'sender': email.sender
+                })
+                break
+    return orders
+
+
+def analyze_time_distribution(emails):
+    """Analyze email distribution over time."""
+    by_year = Counter()
+    by_month = Counter()
+    by_day_of_week = Counter()
+
+    day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
+
+    for email in emails:
+        if email.date:
+            try:
+                by_year[email.date.year] += 1
+                by_month[f"{email.date.year}-{email.date.month:02d}"] += 1
+                by_day_of_week[day_names[email.date.weekday()]] += 1
+            except Exception:
+                # Skip emails whose date field is not a usable datetime
+                pass
+
+    return {
+        'by_year': dict(by_year.most_common()),
+        'by_month': dict(sorted(by_month.items())),
+        'by_day_of_week': {d: by_day_of_week.get(d, 0) for d in day_names}
+    }
+
+
+def main():
+    email_dir = "/home/bob/Documents/Email Manager/emails/brett-gmail"
+    output_dir = Path(__file__).parent.parent / "data"
+    output_dir.mkdir(exist_ok=True)
+
+    print("="*70)
+    print("BRETT GMAIL DATASET ANALYSIS")
+    print("="*70)
+    print(f"\nSource: {email_dir}")
+    print(f"Output: {output_dir}")
+
+    # Parse emails
+    print("\nParsing emails...")
+    parser = LocalFileParser(email_dir)
+    emails = parser.parse_emails()
+    print(f"Total emails: {len(emails)}")
+
+    # Date range
+    dates = [e.date for e in emails if e.date]
+    if dates:
+        dates.sort()
+        print(f"Date range: {dates[0].strftime('%Y-%m-%d')} to {dates[-1].strftime('%Y-%m-%d')}")
+
+    # Classify all emails
+    print("\nClassifying emails...")
+
+    category_counts = Counter()
+    subcategory_counts = Counter()
+    by_category = defaultdict(list)
+    by_subcategory = defaultdict(list)
+
+    for email in emails:
+        category, subcategory = classify_email(email)
+        category_counts[category] += 1
+        subcategory_counts[subcategory] += 1
+        by_category[category].append(email)
+        by_subcategory[subcategory].append(email)
+
+    # Print category summary
+    print("\n" + "="*70)
+    print("CATEGORY SUMMARY")
+    print("="*70)
+
+    for category, count in category_counts.most_common():
+        pct = count / len(emails) * 100
+        bar = "█" * int(pct / 2)
+        print(f"\n{category} ({count} emails, {pct:.1f}%)")
+        print(f"  {bar}")
+
+        # Show subcategories
+        subcats = Counter()
+        for email in by_category[category]:
+            _, subcat = classify_email(email)
+            subcats[subcat] += 1
+
+        for subcat, subcount in subcats.most_common():
+            print(f"  - {subcat}: {subcount}")
+
+    # Analyze senders
+    print("\n" + "="*70)
+    print("TOP SENDERS BY VOLUME")
+    print("="*70)
+
+    sender_counts = Counter(e.sender for e in emails)
+    for sender, count in sender_counts.most_common(15):
+        pct = count / len(emails) * 100
+        print(f"  {count:4d} ({pct:4.1f}%) {sender}")
+
+    # Time analysis
+    print("\n" + "="*70)
+    print("TIME DISTRIBUTION")
+    print("="*70)
+
+    time_dist = analyze_time_distribution(emails)
+
+    print("\nBy Year:")
+    for year, count in sorted(time_dist['by_year'].items()):
+        bar = "█" * (count // 10)
+        print(f"  {year}: {count:4d} {bar}")
+
+    print("\nBy Day of Week:")
+    for day, count in time_dist['by_day_of_week'].items():
+        bar = "█" * (count // 5)
+        print(f"  {day}: {count:3d} {bar}")
+
+    # Extract orders
+    print("\n" + "="*70)
+    print("ORDER/TRANSACTION IDs FOUND")
+    print("="*70)
+
+    orders = extract_order_ids(emails)
+    if orders:
+        for order in orders[:10]:
+            print(f"  [{order['type']}] {order['id']}")
+            print(f"    Subject: {order['subject'][:60]}...")
+    else:
+        print("  No order IDs detected in subjects")
+
+    # Actionable insights
+    print("\n" + "="*70)
+    print("ACTIONABLE INSIGHTS")
+    print("="*70)
+
+    # High-volume automated senders
+    automated_domains = ['mutualart.com', 'tripadvisor.com', 'ebay.com', 'spotify.com']
+    auto_count = sum(1 for e in emails if any(d in (e.sender or '') for d in automated_domains))
+    print(f"\n1. AUTOMATED EMAILS: {auto_count} ({auto_count/len(emails)*100:.1f}%)")
+    print("   - MutualArt alerts: Consider aggregating to weekly digest")
+    print("   - Tripadvisor: Can be filtered to trash or separate folder")
+    print("   - eBay/Spotify: Promotional, low priority")
+
+    # Security alerts
+    security_count = category_counts.get('Security', 0)
+    print(f"\n2. SECURITY ALERTS: {security_count} ({security_count/len(emails)*100:.1f}%)")
+    print("   - Google security: Review for legitimate sign-in attempts")
+    print("   - Should NOT be auto-filtered")
+
+    # Business/Work
+    business_count = category_counts.get('Business', 0) + category_counts.get('Google', 0)
+    print(f"\n3. BUSINESS-RELATED: {business_count} ({business_count/len(emails)*100:.1f}%)")
+    print("   - Google Business Profile reports: Monthly review")
+    print("   - Job applications: High priority")
+    print("   - Appointments: Calendar integration")
+
+    # AI Services (professional interest)
+    ai_count = category_counts.get('AI Services', 0) + category_counts.get('Developer Tools', 0)
+    print(f"\n4. AI/DEVELOPER TOOLS: {ai_count} ({ai_count/len(emails)*100:.1f}%)")
+    print("   - Anthropic, OpenAI, Lambda: Keep for reference")
+    print("   - ngrok, Docker, Cursor: Developer updates")
+
+    # Personal
+    personal_count = category_counts.get('Personal', 0)
+    print(f"\n5. PERSONAL: {personal_count} ({personal_count/len(emails)*100:.1f}%)")
+    print("   - Gmail contacts: May need human review")
+    print("   - Microsoft/Outlook: Check for spam")
+
+    # Save analysis data
+    analysis_data = {
+        'metadata': {
+            'total_emails': len(emails),
+            'date_range': {
+                'start': str(dates[0]) if dates else None,
+                'end': str(dates[-1]) if dates else None
+            },
+            'analyzed_at': datetime.now().isoformat()
+        },
+        'categories': dict(category_counts),
+        'subcategories': dict(subcategory_counts),
+        'top_senders': dict(sender_counts.most_common(50)),
+        'time_distribution': time_dist,
+        'orders_found': orders,
+        'classification_accuracy': {
+            'categorized': len(emails) - category_counts.get('Uncategorized', 0),
+            'uncategorized': category_counts.get('Uncategorized', 0),
+            'accuracy_pct': (len(emails) - category_counts.get('Uncategorized', 0)) / len(emails) * 100
+        }
+    }
+
+    output_file = output_dir / "brett_gmail_analysis.json"
+    with open(output_file, 'w') as f:
+        json.dump(analysis_data, f, indent=2)
+
+    print(f"\n\nAnalysis saved to: {output_file}")
+    print("\n" + "="*70)
+    print(f"CLASSIFICATION ACCURACY: {analysis_data['classification_accuracy']['accuracy_pct']:.1f}%")
+    print(f"({analysis_data['classification_accuracy']['categorized']} categorized, "
+          f"{analysis_data['classification_accuracy']['uncategorized']} uncategorized)")
+    print("="*70)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/brett_microsoft_analyzer.py b/tools/brett_microsoft_analyzer.py
new file mode 100644
index 0000000..ee384b4
--- /dev/null
+++ b/tools/brett_microsoft_analyzer.py
@@ -0,0 +1,500 @@
+#!/usr/bin/env python3
+"""
+Brett Microsoft (Outlook) Dataset Analyzer
+==========================================
+CUSTOM script for analyzing the brett-microsoft email dataset.
+NOT portable to other datasets without modification.
+
+Usage:
+    python tools/brett_microsoft_analyzer.py
+
+Output:
+    - Console report with comprehensive statistics
+    - data/brett_microsoft_analysis.json with full analysis data
+"""
+
+import json
+import re
+from collections import Counter, defaultdict
+from datetime import datetime
+from pathlib import Path
+
+# Add parent to path for imports
+import sys
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from src.calibration.local_file_parser import LocalFileParser
+
+
+# =============================================================================
+# CLASSIFICATION RULES - CUSTOM FOR BRETT'S MICROSOFT/OUTLOOK INBOX
+# =============================================================================
+
+def classify_email(email):
+    """
+    Classify email into categories based on sender domain and subject patterns.
+
+    This is a BUSINESS inbox - a different approach than personal Gmail.
+    Priority: Sender domain > Subject keywords > Business context
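+
+    Hypothetical example: mail from "billing@apps.myob.com" hits the MYOB
+    sender-domain rule first and returns ('Business Operations',
+    'MYOB Invoices') without any subject keywords being consulted.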
+    """
+    sender = email.sender or ""
+    subject = email.subject or ""
+    domain = sender.split('@')[-1] if '@' in sender else sender
+
+    # === BUSINESS OPERATIONS ===
+
+    # MYOB/Accounting
+    if 'apps.myob.com' in domain or 'myob' in subject.lower():
+        return ('Business Operations', 'MYOB Invoices')
+
+    # TPG/Telecom/Internet
+    if 'tpgtelecom.com.au' in domain or 'aapt.com.au' in domain:
+        if 'suspension' in subject.lower() or 'overdue' in subject.lower():
+            return ('Business Operations', 'Telecom - Urgent/Overdue')
+        if 'novation' in subject.lower():
+            return ('Business Operations', 'Telecom - Contract Changes')
+        if 'NBN' in subject or 'nbn' in subject.lower():
+            return ('Business Operations', 'Telecom - NBN')
+        return ('Business Operations', 'Telecom - General')
+
+    # DocuSign (Contracts)
+    if 'docusign' in domain or 'docusign' in subject.lower():
+        return ('Business Operations', 'DocuSign Contracts')
+
+    # === CLIENT WORK ===
+
+    # Green Output / Energy Avengers (App Development Client)
+    if 'greenoutput.com.au' in domain or 'energyavengers' in domain:
+        return ('Client Work', 'Energy Avengers Project')
+
+    # Brighter Access (Client)
+    if 'brighteraccess' in domain or 'Brighter Access' in subject:
+        return ('Client Work', 'Brighter Access')
+
+    # Waterfall Way Designs (Business Partner)
+    if 'waterfallwaydesigns' in domain:
+        return ('Client Work', 'Waterfall Way Designs')
+
+    # Target Impact
+    if 'targetimpact.com.au' in domain:
+        return ('Client Work', 'Target Impact')
+
+    # MerlinFX
+    if 'merlinfx.com.au' in domain:
+        return ('Client Work', 'MerlinFX')
+
+    # Solar/Energy related (Energy Avengers ecosystem)
+    if 'solarairenergy.com.au' in domain or 'solarconnected.com.au' in domain:
+        return ('Client Work', 'Energy Avengers Ecosystem')
+
+    if 'eonadvisory.com.au' in domain or 'australianpowerbrokers.com.au' in domain:
+        return ('Client Work', 'Energy Avengers Ecosystem')
+
+    if 'fyconsulting.com.au' in domain:
+        return ('Client Work', 'Energy Avengers Ecosystem')
+
+    if 'convergedesign.com.au' in domain:
+        return ('Client Work', 'Energy Avengers Ecosystem')
+
+    # MYP Corp (Disability Services Software)
+    if '1myp.com' in domain or 'mypcorp' in domain or 'MYP' in subject:
+        return ('Business Operations', 'MYP Software')
+
+    # === MICROSOFT SERVICES ===
+
+    # Microsoft Support Cases
+    if re.search(r'\[Case.*#|Case #|TrackingID', subject, re.I) or 'support.microsoft.com' in domain:
+        return ('Microsoft', 'Support Cases')
+
+    # Microsoft Billing/Invoices
+    if 'Microsoft invoice' in subject or 'credit card was declined' in subject:
+        return ('Microsoft', 'Billing')
+
+    # Microsoft Subscriptions
+    if 'subscription' in subject.lower() and 'microsoft' in sender.lower():
+        return ('Microsoft', 'Subscriptions')
+
+    # SharePoint/Teams
+    if 'sharepointonline.com' in domain or 'Teams' in subject:
+        return ('Microsoft', 'SharePoint/Teams')
+
+    # O365 Service Updates
+    if 'o365su' in sender or ('digest' in subject.lower() and 'microsoft' in sender.lower()):
+        return ('Microsoft', 'Service Updates')
+
+    # General Microsoft
+    if 'microsoft.com' in domain:
+        return ('Microsoft', 'General')
+
+    # === DEVELOPER TOOLS ===
+
+    # GitHub CI/CD
+    if re.search(r'\[FSSCoding', subject):
+        return ('Developer', 'GitHub CI/CD Failures')
+
+    # GitHub Issues/PRs
+    if 'github.com' in domain:
+        if 'linuxmint' in subject or 'cinnamon' in subject:
+            return ('Developer', 'Open Source Contributions')
+        if 'Pheromind' in subject or 'ChrisRoyse' in subject:
+            return ('Developer', 'GitHub Collaborations')
+        return ('Developer', 'GitHub Notifications')
+
+    # Neo4j
+    if 'neo4j.com' in domain:
+        if 'webinar' in subject.lower() or 'Webinar' in subject:
+            return ('Developer', 'Neo4j Webinars')
+        if 'NODES' in subject or 'GraphTalk' in subject:
+            return ('Developer', 'Neo4j Conference')
+        return ('Developer', 'Neo4j')
+
+    # Cursor (AI IDE)
+    if 'cursor.com' in domain or 'cursor.so' in domain or 'Cursor' in subject:
+        return ('Developer', 'Cursor IDE')
+
+    # Tailscale
+    if 'tailscale.com' in domain:
+        return ('Developer', 'Tailscale')
+
+    # Hugging Face
+    if 'huggingface' in domain or 'Hugging Face' in subject:
+        return ('Developer', 'Hugging Face')
+
+    # Stripe (Payment Failures)
+    if 'stripe.com' in domain:
+        return ('Billing', 'Stripe Payments')
+
+    # Contabo (Hosting)
+    if 'contabo.com' in domain:
+        return ('Developer', 'Contabo Hosting')
+
+    # SendGrid
+    if 'sendgrid' in subject.lower():
+        return ('Developer', 'SendGrid')
+
+    # Twilio
+    if 'twilio.com' in domain:
+        return ('Developer', 'Twilio')
+
+    # Brave Search API
+    if 'brave.com' in domain:
+        return ('Developer', 'Brave Search API')
+
+    # PyPI
+    if 'pypi' in subject.lower() or 'pypi.org' in domain:
+        return ('Developer', 'PyPI')
+
+    # NVIDIA/CUDA
+    if 'CUDA' in subject or 'nvidia' in domain:
+        return ('Developer', 'NVIDIA/CUDA')
+
+    # Inception Labs / AI Tools
+    if 'inceptionlabs.ai' in domain:
+        return ('Developer', 'AI Tools')
+
+    # === LEARNING ===
+
+    # Computer Enhance (Casey Muratori) / Substack
+    if 'computerenhance' in sender or 'substack.com' in domain:
+        return ('Learning', 'Substack/Newsletters')
+
+    # Odoo
+    if 'odoo.com' in domain:
+        return ('Learning', 'Odoo ERP')
+
+    # Mozilla Firefox
+    if 'mozilla.org' in domain:
+        return ('Developer', 'Mozilla Firefox')
+
+    # === PERSONAL / COMMUNITY ===
+
+    # Grandfather Gatherings (Personal Community)
+    if 'Grandfather Gather' in subject:
+        return ('Personal', 'Grandfather Gatherings')
+
+    # Mailchimp newsletters (often personal)
+    if 'mailchimpapp.com' in domain:
+        return ('Personal', 'Personal Newsletters')
+
+    # Community Events
+    if 'Community Working Bee' in subject:
+        return ('Personal', 'Community Events')
+
+    # Personal emails (Gmail/Hotmail)
+    if 'gmail.com' in domain or 'hotmail.com' in domain or 'bigpond.com' in domain:
+        return ('Personal', 'Personal Contacts')
+
+    # FSS Internal
+    if 'foxsoftwaresolutions.com.au' in domain:
+        return ('Business Operations', 'FSS Internal')
+
+    # === FINANCIAL ===
+
+    # eToro
+    if 'etoro.com' in domain:
+        return ('Financial', 'eToro Trading')
+
+    # Dell
+    if 'dell.com' in domain or 'Dell' in subject:
+        return ('Business Operations', 'Dell Hardware')
+
+    # Insurance
+    if 'KT Insurance' in subject or 'insurance' in subject.lower():
+        return ('Business Operations', 'Insurance')
+
+    # SBSCH Payments
+    if 'SBSCH' in subject:
+        return ('Business Operations', 'SBSCH Payments')
+
+    # iCare NSW
+    if 'icare.nsw.gov.au' in domain:
+        return ('Business Operations', 'iCare NSW')
+
+    # Vodafone
+    if 'vodafone.com.au' in domain:
+        return ('Business Operations', 'Telecom - Vodafone')
+
+    # === MISC ===
+
+    # Undeliverable/Bounces
+    if 'Undeliverable' in subject:
+        return ('System', 'Email Bounces')
+
+    # Security
+    if re.search(r'Security Alert|Login detected|security code|Verify', subject, re.I):
+        return ('Security', 'Security Alerts')
+
+    # Password Reset
+    if 'password' in subject.lower():
+        return ('Security', 'Password')
+
+    # Calendly
+    if 'calendly.com' in domain:
+        return ('Business Operations', 'Calendly')
+
+    # Trello
+    if 'trello.com' in domain:
+        return ('Business Operations', 'Trello')
+
+    # Scorptec
+    if 'scorptec' in domain:
+        return ('Business Operations', 'Hardware Vendor')
+
+    # Webcentral
+    if 'webcentral.com.au' in domain:
+        return ('Business Operations', 'Web Hosting')
+
+    # Bluetti (Hardware)
+    if 'bluettipower.com' in domain:
+        return ('Business Operations', 'Hardware - Power')
+
+    # ABS Surveys
+    if 'abs.gov.au' in domain:
+        return ('Business Operations', 'Government - ABS')
+
+    # Qualtrics/Surveys
+    if 'qualtrics' in domain:
+        return ('Business Operations', 'Surveys')
+
+    return ('Uncategorized', 'Unknown')
+
+
+def extract_case_ids(emails):
+    """Extract Microsoft support case IDs and tracking IDs from emails."""
+    case_patterns = [
+        (r'Case\s*#?\s*:?\s*(\d{8})', 'Microsoft Case'),
+        (r'\[Case\s*#?\s*:?\s*(\d{8})\]', 'Microsoft Case'),
+        (r'TrackingID#(\d{16})', 'Tracking ID'),
+    ]
+
+    cases = defaultdict(list)
+    for email in emails:
+        subject = email.subject or ""
+        for pattern, case_type in case_patterns:
+            match = re.search(pattern, subject, re.I)
+            if match:
+                case_id = match.group(1)
+                cases[case_id].append({
+                    'type': case_type,
+                    'subject': subject,
+                    'date': str(email.date) if email.date else None,
+                    'sender': email.sender
+                })
+                # First matching pattern wins; avoids double-counting the same
+                # subject under both the plain and bracketed Case patterns
+                break
+    return dict(cases)
+
+
+def analyze_time_distribution(emails):
+    """Analyze email distribution over time."""
+    by_year = Counter()
+    by_month = Counter()
+    by_day_of_week = Counter()
+
+    day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
+
+    for email in emails:
+        if email.date:
+            try:
+                by_year[email.date.year] += 1
+                by_month[f"{email.date.year}-{email.date.month:02d}"] += 1
+                by_day_of_week[day_names[email.date.weekday()]] += 1
+            except Exception:
+                # Skip emails whose date field is not a usable datetime
+                pass
+
+    return {
+        'by_year': dict(by_year.most_common()),
+        'by_month': dict(sorted(by_month.items())),
+        'by_day_of_week': {d: by_day_of_week.get(d, 0) for d in day_names}
+    }
+
+
+def main():
+    email_dir = "/home/bob/Documents/Email Manager/emails/brett-microsoft"
+    output_dir = Path(__file__).parent.parent / "data"
+    output_dir.mkdir(exist_ok=True)
+
+    print("="*70)
+    print("BRETT MICROSOFT (OUTLOOK) DATASET ANALYSIS")
+    print("="*70)
+    print(f"\nSource: {email_dir}")
+    print(f"Output: {output_dir}")
+
+    # Parse emails
+    print("\nParsing emails...")
+    parser = LocalFileParser(email_dir)
+    emails = parser.parse_emails()
+    print(f"Total emails: {len(emails)}")
+
+    # Date range
+    dates = [e.date for e in emails if e.date]
+    if dates:
+        dates.sort()
+        print(f"Date range: {dates[0].strftime('%Y-%m-%d')} to {dates[-1].strftime('%Y-%m-%d')}")
+
+    # Classify all emails
+    print("\nClassifying emails...")
+
+    category_counts = Counter()
+    subcategory_counts = Counter()
+    by_category = defaultdict(list)
+    by_subcategory = defaultdict(list)
+
+    for email in emails:
+        category, subcategory = classify_email(email)
+        category_counts[category] += 1
+        subcategory_counts[f"{category}: {subcategory}"] += 1
+        by_category[category].append(email)
+        by_subcategory[subcategory].append(email)
+
+    # Print category summary
+    print("\n" + "="*70)
+    print("TOP-LEVEL CATEGORY SUMMARY")
+    print("="*70)
+
+    for category, count in category_counts.most_common():
+        pct = count / len(emails) * 100
+        bar = "█" * int(pct / 2)
+        print(f"\n{category} ({count} emails, {pct:.1f}%)")
+        print(f"  {bar}")
+
+        # Show subcategories
+        subcats = Counter()
+        for email in by_category[category]:
+            _, subcat = classify_email(email)
+            subcats[subcat] += 1
+
+        for subcat, subcount in subcats.most_common():
+            print(f"  - {subcat}: {subcount}")
+
+    # Analyze senders
+    print("\n" + "="*70)
+    print("TOP SENDERS BY VOLUME")
+    print("="*70)
+
+    sender_counts = Counter(e.sender for e in emails)
+    for sender, count in sender_counts.most_common(15):
+        pct = count / len(emails) * 100
+        print(f"  {count:4d} ({pct:4.1f}%) {sender}")
+
+    # Time analysis
+    print("\n" + "="*70)
+    print("TIME DISTRIBUTION")
+    print("="*70)
+
+    time_dist = analyze_time_distribution(emails)
+
+    print("\nBy Year:")
+    for year, count in sorted(time_dist['by_year'].items()):
+        bar = "█" * (count // 10)
+        print(f"  {year}: {count:4d} {bar}")
+
+    print("\nBy Day of Week:")
+    for day, count in time_dist['by_day_of_week'].items():
+        bar = "█" * (count // 5)
+        print(f"  {day}: {count:3d} {bar}")
+
+    # Extract case IDs
+    print("\n" + "="*70)
+    print("MICROSOFT SUPPORT CASES TRACKED")
+    print("="*70)
+
+    cases = extract_case_ids(emails)
+    if cases:
+        for case_id, occurrences in sorted(cases.items()):
+            print(f"\n  Case/Tracking: {case_id} ({len(occurrences)} emails)")
+            for occ in occurrences[:3]:
+                print(f"    - {occ['date']}: {occ['subject'][:50]}...")
+    else:
+        print("  No case IDs detected")
+
+    # Actionable insights
+    print("\n" + "="*70)
+    print("INBOX CHARACTER ASSESSMENT")
+    print("="*70)
+
+    business_pct = (category_counts.get('Business Operations', 0) +
+                    category_counts.get('Client Work', 0) +
+                    category_counts.get('Developer', 0)) / len(emails) * 100
+    personal_pct = category_counts.get('Personal', 0) / len(emails) * 100
+
+    print(f"\n  Business/Professional: {business_pct:.1f}%")
+    print(f"  Personal: {personal_pct:.1f}%")
+    print(f"\n  ASSESSMENT: This is a {'BUSINESS' if business_pct > 50 else 'MIXED'} inbox")
+
+    # Save analysis data
+    analysis_data = {
+        'metadata': {
+            'total_emails': len(emails),
+            'inbox_type': 'microsoft',
+            'inbox_character': 'business' if business_pct > 50 else 'mixed',
+            'date_range': {
+                'start': str(dates[0]) if dates else None,
+                'end': str(dates[-1]) if dates else None
+            },
+            'analyzed_at': datetime.now().isoformat()
+        },
+        'categories': dict(category_counts),
+        'subcategories': dict(subcategory_counts),
+        'top_senders': dict(sender_counts.most_common(50)),
+        'time_distribution': time_dist,
+        'support_cases': cases,
+        'classification_accuracy': {
+            'categorized': len(emails) - category_counts.get('Uncategorized', 0),
+            'uncategorized': category_counts.get('Uncategorized', 0),
+            'accuracy_pct': (len(emails) - category_counts.get('Uncategorized', 0)) / len(emails) * 100
+        }
+    }
+
+    output_file = output_dir / "brett_microsoft_analysis.json"
+    with open(output_file, 'w') as f:
+        json.dump(analysis_data, f, indent=2)
+
+    print(f"\n\nAnalysis saved to: {output_file}")
+    print("\n" + "="*70)
+    print(f"CLASSIFICATION ACCURACY: {analysis_data['classification_accuracy']['accuracy_pct']:.1f}%")
+    print(f"({analysis_data['classification_accuracy']['categorized']} categorized, "
+          f"{analysis_data['classification_accuracy']['uncategorized']} uncategorized)")
+    print("="*70)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/generate_html_report.py b/tools/generate_html_report.py
new file mode 100644
index 0000000..b19220e
--- /dev/null
+++ b/tools/generate_html_report.py
@@ -0,0 +1,642 @@
+#!/usr/bin/env python3
+"""
+Generate interactive HTML report from email classification results.
+
+Usage:
+    python tools/generate_html_report.py --input results.json --output report.html
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+from datetime import datetime
+from collections import Counter, defaultdict
+from html import escape
+
+
+def load_results(input_path: str) -> dict:
+    """Load classification results from JSON."""
+    with open(input_path) as f:
+        return json.load(f)
+
+
+def extract_domain(sender: str) -> str:
+    """Extract domain from an email address."""
+    if not sender:
+        return "unknown"
+    if "@" in sender:
+        return sender.split("@")[-1].lower()
+    return sender.lower()
+
+
+def format_date(date_str: str) -> str:
+    """Format ISO date string for display."""
+    if not date_str:
+        return "N/A"
+    try:
+        dt = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
+        return dt.strftime("%Y-%m-%d %H:%M")
+    except ValueError:
+        # Not ISO-formatted; show a best-effort prefix instead
+        return date_str[:16] if len(date_str) > 16 else date_str
+
+
+def truncate(text: str, max_len: int = 60) -> str:
+    """Truncate text with an ellipsis."""
+    if not text:
+        return ""
+    if len(text) <= max_len:
+        return text
+    return text[:max_len-3] + "..."
+
+
+def generate_html_report(results: dict, output_path: str):
+    """Generate interactive HTML report."""
+
+    metadata = results.get("metadata", {})
+    classifications = results.get("classifications", [])
+
+    # Calculate statistics
+    total = len(classifications)
+    if not total:
+        # Every percentage below divides by total, so bail out early
+        raise SystemExit("No classifications found in input - nothing to report")
+    categories = Counter(c["category"] for c in classifications)
+    methods = Counter(c["method"] for c in classifications)
+
+    # Group by category
+    by_category = defaultdict(list)
+    for c in classifications:
+        by_category[c["category"]].append(c)
+
+    # Sort categories by count
+    sorted_categories = sorted(categories.keys(), key=lambda x: categories[x], reverse=True)
+
+    # Sender statistics
+    sender_domains = Counter(extract_domain(c.get("sender", "")) for c in classifications)
+    top_senders = Counter(c.get("sender", "unknown") for c in classifications).most_common(20)
+
+    # Confidence distribution
+    high_conf = sum(1 for c in classifications if c.get("confidence", 0) >= 0.7)
+    med_conf = sum(1 for c in classifications if 0.5 <= c.get("confidence", 0) < 0.7)
+    low_conf = sum(1 for c in classifications if c.get("confidence", 0) < 0.5)
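+
+    # NOTE: these buckets mirror the confidence badge classes emitted by
+    # generate_email_row() below (high >= 0.7, medium >= 0.5, low otherwise)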
+
+    # Generate HTML
+    html = f'''
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<title>Email Classification Report</title>
+<style>
+    body {{ margin: 0; font-family: "Segoe UI", Arial, sans-serif; background: #f5f6fa; color: #2d3436; }}
+    header {{ background: #2c3e50; color: #fff; padding: 24px 32px; }}
+    header h1 {{ margin: 0 0 4px 0; }}
+    .subtitle {{ color: #b2bec3; }}
+    .meta span {{ margin-right: 24px; color: #dfe6e9; font-size: 0.9em; }}
+    .stats {{ display: flex; gap: 16px; padding: 24px 32px; flex-wrap: wrap; }}
+    .stat-card {{ background: #fff; border-radius: 8px; padding: 16px 24px; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }}
+    .stat-value {{ font-size: 1.8em; font-weight: 700; }}
+    .stat-label {{ color: #636e72; font-size: 0.85em; }}
+    section {{ background: #fff; margin: 0 32px 24px; border-radius: 8px; padding: 20px 24px; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }}
+    .dist-row {{ display: flex; align-items: center; gap: 12px; margin: 6px 0; }}
+    .dist-label {{ width: 200px; }}
+    .dist-bar {{ flex: 1; background: #ecf0f1; border-radius: 4px; height: 14px; }}
+    .dist-fill {{ background: #3498db; border-radius: 4px; height: 14px; }}
+    .dist-value {{ width: 150px; text-align: right; color: #636e72; font-size: 0.9em; }}
+    .sender-list div {{ display: flex; justify-content: space-between; padding: 4px 0; border-bottom: 1px solid #ecf0f1; }}
+    .tabs {{ margin-bottom: 12px; }}
+    .tab {{ border: 1px solid #dfe6e9; background: #f5f6fa; border-radius: 4px; padding: 6px 12px; margin: 2px; cursor: pointer; }}
+    .tab.active {{ background: #3498db; color: #fff; }}
+    .tab-count {{ opacity: 0.7; margin-left: 4px; }}
+    table {{ width: 100%; border-collapse: collapse; font-size: 0.9em; }}
+    th, td {{ text-align: left; padding: 6px 8px; border-bottom: 1px solid #ecf0f1; }}
+    .category-badge {{ background: #ecf0f1; border-radius: 10px; padding: 2px 8px; font-size: 0.85em; }}
+    .conf-high {{ color: #27ae60; }}
+    .conf-medium {{ color: #f39c12; }}
+    .conf-low {{ color: #e74c3c; }}
+    footer {{ padding: 16px 32px; color: #636e72; font-size: 0.85em; }}
+</style>
+</head>
+<body>
+
+<header>
+    <h1>Email Classification Report</h1>
+    <div class="subtitle">Automated analysis of email inbox</div>
+    <div class="meta">
+        <span>Generated: {datetime.now().strftime("%Y-%m-%d %H:%M")}</span>
+        <span>Source: {escape(metadata.get("source", "unknown"))}</span>
+        <span>Total Emails: {total:,}</span>
+    </div>
+</header>
+
+<div class="stats">
+    <div class="stat-card"><div class="stat-value">{total:,}</div><div class="stat-label">Total Emails</div></div>
+    <div class="stat-card"><div class="stat-value">{len(categories)}</div><div class="stat-label">Categories</div></div>
+    <div class="stat-card"><div class="stat-value">{high_conf}</div><div class="stat-label">High Confidence (≥70%)</div></div>
+    <div class="stat-card"><div class="stat-value">{len(sender_domains)}</div><div class="stat-label">Unique Domains</div></div>
+</div>
+
+<section>
+    <h2>Category Distribution</h2>
+    {"".join(f'''
+    <div class="dist-row">
+        <div class="dist-label">{escape(cat)}</div>
+        <div class="dist-bar"><div class="dist-fill" style="width:{categories[cat]/total*100:.1f}%"></div></div>
+        <div class="dist-value">{categories[cat]:,} ({categories[cat]/total*100:.1f}%)</div>
+    </div>
+    ''' for cat in sorted_categories)}
+</section>
+
+<section>
+    <h2>Classification Methods</h2>
+    {"".join(f'''
+    <div class="dist-row">
+        <div class="dist-label">{escape(method.upper())}</div>
+        <div class="dist-bar"><div class="dist-fill" style="width:{methods[method]/total*100:.1f}%"></div></div>
+        <div class="dist-value">{methods[method]:,} ({methods[method]/total*100:.1f}%)</div>
+    </div>
+    ''' for method in sorted(methods.keys()))}
+</section>
+
+<section>
+    <h2>Confidence Distribution</h2>
+    <div class="dist-row">
+        <div class="dist-label">High (≥70%)</div>
+        <div class="dist-bar"><div class="dist-fill" style="width:{high_conf/total*100:.1f}%"></div></div>
+        <div class="dist-value">{high_conf:,} ({high_conf/total*100:.1f}%)</div>
+    </div>
+    <div class="dist-row">
+        <div class="dist-label">Medium (50-70%)</div>
+        <div class="dist-bar"><div class="dist-fill" style="width:{med_conf/total*100:.1f}%"></div></div>
+        <div class="dist-value">{med_conf:,} ({med_conf/total*100:.1f}%)</div>
+    </div>
+    <div class="dist-row">
+        <div class="dist-label">Low (&lt;50%)</div>
+        <div class="dist-bar"><div class="dist-fill" style="width:{low_conf/total*100:.1f}%"></div></div>
+        <div class="dist-value">{low_conf:,} ({low_conf/total*100:.1f}%)</div>
+    </div>
+</section>
+
+<section>
+    <h2>Top Senders</h2>
+    <div class="sender-list">
+    {"".join(f'''
+        <div><span>{escape(sender)}</span><span>{count}</span></div>
+    ''' for sender, count in top_senders)}
+    </div>
+</section>
+
+<section>
+    <h2>Emails by Category</h2>
+
+    <div class="tabs">
+        <button id="btn-cat-all" class="tab active" onclick="showTab('cat-all')">All <span class="tab-count">{total}</span></button>
+        {"".join(f'''<button id="btn-cat-{sorted_categories.index(cat)}" class="tab" onclick="showTab('cat-{sorted_categories.index(cat)}')">{escape(cat)} <span class="tab-count">{categories[cat]}</span></button>''' for cat in sorted_categories)}
+    </div>
+
+    <div id="cat-all" class="panel">
+        <table>
+            <thead>
+                <tr><th>Date</th><th>Subject</th><th>Sender</th><th>Category</th><th>Confidence</th><th>Method</th></tr>
+            </thead>
+            <tbody>
+                {"".join(generate_email_row(c) for c in sorted(classifications, key=lambda x: x.get("date") or "", reverse=True))}
+            </tbody>
+        </table>
+    </div>
+
+    {"".join(f'''
+    <div id="cat-{sorted_categories.index(cat)}" class="panel" style="display:none">
+        <table>
+            <thead>
+                <tr><th>Date</th><th>Subject</th><th>Sender</th><th>Confidence</th><th>Method</th></tr>
+            </thead>
+            <tbody>
+                {"".join(generate_email_row(c, show_category=False) for c in sorted(by_category[cat], key=lambda x: x.get("date") or "", reverse=True))}
+            </tbody>
+        </table>
+    </div>
+    ''' for cat in sorted_categories)}
+</section>
+
+<footer>
+    Generated by Email Sorter | {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
+</footer>
+
+<script>
+function showTab(id) {{
+    document.querySelectorAll('.panel').forEach(function (p) {{ p.style.display = 'none'; }});
+    document.querySelectorAll('.tab').forEach(function (t) {{ t.classList.remove('active'); }});
+    document.getElementById(id).style.display = 'block';
+    document.getElementById('btn-' + id).classList.add('active');
+}}
+</script>
+
+</body>
+</html>
+'''
+
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write(html)
+
+    print(f"Report generated: {output_path}")
+    print(f"  Total emails: {total:,}")
+    print(f"  Categories: {len(categories)}")
+    print(f"  Top category: {sorted_categories[0]} ({categories[sorted_categories[0]]:,})")
+
+
+def generate_email_row(c: dict, show_category: bool = True) -> str:
+    """Generate an HTML table row for a single email."""
+    conf = c.get("confidence", 0)
+    conf_class = "high" if conf >= 0.7 else "medium" if conf >= 0.5 else "low"
+    method = c.get("method", "unknown")
+    method_class = f"method-{method}"
+
+    attachment_icon = '📎 ' if c.get("has_attachments") else ""
+
+    category_col = f'<td><span class="category-badge">{escape(c.get("category", "unknown"))}</span></td>' if show_category else ""
+
+    return f'''
+    <tr>
+        <td>{format_date(c.get("date"))}</td>
+        <td>{attachment_icon}{escape(truncate(c.get("subject", "No subject"), 70))}</td>
+        <td>{escape(truncate(c.get("sender_name") or c.get("sender", ""), 35))}</td>
+        {category_col}
+        <td class="conf-{conf_class}">{conf*100:.0f}%</td>
+        <td class="{method_class}">{method}</td>
+    </tr>
+    '''
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate HTML report from classification results")
+    parser.add_argument("--input", "-i", required=True, help="Path to results.json")
+    parser.add_argument("--output", "-o", default=None, help="Output HTML file path")
+
+    args = parser.parse_args()
+
+    input_path = Path(args.input)
+    if not input_path.exists():
+        print(f"Error: Input file not found: {input_path}")
+        return 1
+
+    output_path = args.output or str(input_path.parent / "report.html")
+
+    results = load_results(args.input)
+    generate_html_report(results, output_path)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())