email-sorter/tools/brett_gmail_analyzer.py
FSSCoding 8f25e30f52 Rewrite CLAUDE.md and clean project structure
- Rewrote CLAUDE.md with comprehensive development guide
- Archived 20 old docs to docs/archive/
- Added PROJECT_ROADMAP_2025.md with research learnings
- Added CLASSIFICATION_METHODS_COMPARISON.md
- Added SESSION_HANDOVER_20251128.md
- Added tools for analysis (brett_gmail/microsoft analyzers)
- Updated .gitignore for archive folders
- Config changes for local vLLM endpoint
2025-11-28 13:07:27 +11:00

392 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Brett Gmail Dataset Analyzer
============================
CUSTOM script for analyzing the brett-gmail email dataset.
NOT portable to other datasets without modification.
Usage:
python tools/brett_gmail_analyzer.py
Output:
- Console report with comprehensive statistics
- data/brett_gmail_analysis.json with full analysis data
"""
import json
import re
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path
# Add parent to path for imports
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.calibration.local_file_parser import LocalFileParser
# =============================================================================
# CLASSIFICATION RULES - CUSTOM FOR BRETT'S GMAIL
# =============================================================================
def classify_email(email):
    """
    Classify an email into a (category, subcategory) pair.

    Rules are evaluated top to bottom and the first match wins.
    Priority: Sender domain > Subject keywords > gmail.com catch-all.

    Domain rules are substring checks against whatever follows the last '@'
    in the sender (or the whole sender when it has no '@'). The comparison is
    case-insensitive: the domain is lowercased once up front.

    Args:
        email: Object exposing ``sender`` and ``subject`` attributes; either
            may be None or empty.

    Returns:
        tuple: ``(category, subcategory)`` strings. ``('Uncategorized',
        'Unknown')`` when no rule matches.
    """
    sender = email.sender or ""
    subject = email.subject or ""

    # Domain names are case-insensitive, so normalise before substring tests;
    # otherwise "News@MutualArt.COM" would fall through to Uncategorized.
    raw_domain = sender.split('@')[-1] if '@' in sender else sender
    domain = raw_domain.lower()
    # Lowercase the subject once instead of once per keyword test.
    subject_lower = subject.lower()

    # === HIGH-LEVEL CATEGORIES ===

    # --- Art & Collectibles ---
    if 'mutualart.com' in domain:
        return ('Art & Collectibles', 'MutualArt Alerts')

    # --- Travel & Tourism ---
    if 'tripadvisor.com' in domain:
        return ('Travel & Tourism', 'Tripadvisor')
    if 'booking.com' in domain:
        return ('Travel & Tourism', 'Booking.com')

    # --- Entertainment & Streaming ---
    if 'spotify.com' in domain:
        if 'concert' in subject_lower or 'live' in subject_lower:
            return ('Entertainment', 'Spotify Concerts')
        return ('Entertainment', 'Spotify Promotions')
    if 'youtube.com' in domain:
        return ('Entertainment', 'YouTube')
    if 'onlyfans.com' in domain:
        return ('Entertainment', 'OnlyFans')
    if 'ign.com' in domain:
        return ('Entertainment', 'IGN Gaming')

    # --- Shopping & eCommerce ---
    if 'ebay.com' in domain or 'reply.ebay' in domain:
        return ('Shopping', 'eBay')
    if 'aliexpress.com' in domain:
        return ('Shopping', 'AliExpress')
    if 'alibabacloud.com' in domain or 'alibaba-inc.com' in domain:
        return ('Tech Services', 'Alibaba Cloud')
    if '4wdsupacentre' in domain:
        return ('Shopping', '4WD Supacentre')
    if 'mikeblewitt' in domain or 'mbcoffscoast' in domain:
        return ('Shopping', 'Mike Blewitt/MBC')
    if 'auspost.com.au' in domain:
        return ('Shopping', 'Australia Post')
    if 'printfresh' in domain:
        return ('Business', 'Timesheets')

    # --- AI & Tech Services ---
    if 'anthropic.com' in domain or 'claude.com' in domain:
        return ('AI Services', 'Anthropic/Claude')
    if 'openai.com' in domain:
        return ('AI Services', 'OpenAI')
    if 'openrouter.ai' in domain:
        return ('AI Services', 'OpenRouter')
    if 'lambda' in domain:
        return ('AI Services', 'Lambda Labs')
    if 'x.ai' in domain:
        return ('AI Services', 'xAI')
    if 'perplexity.ai' in domain:
        return ('AI Services', 'Perplexity')
    if 'cursor.com' in domain:
        return ('Developer Tools', 'Cursor')

    # --- Developer Tools ---
    if 'ngrok.com' in domain:
        return ('Developer Tools', 'ngrok')
    if 'docker.com' in domain:
        return ('Developer Tools', 'Docker')

    # --- Productivity Apps ---
    if 'screencastify.com' in domain:
        return ('Productivity', 'Screencastify')
    if 'tango.us' in domain:
        return ('Productivity', 'Tango')
    if 'xplor.com' in domain or 'myxplor' in domain:
        return ('Services', 'Xplor Childcare')

    # --- Google Services ---
    # 'google.com' as a substring already covers accounts.google.com.
    if 'google.com' in domain:
        if 'performance report' in subject_lower or 'business profile' in subject_lower:
            return ('Google', 'Business Profile')
        if 'security' in subject_lower or 'sign-in' in subject_lower:
            return ('Security', 'Google Security')
        if 'firebase' in subject_lower or 'firestore' in subject_lower:
            return ('Developer Tools', 'Firebase')
        if 'ads' in subject_lower:
            return ('Google', 'Google Ads')
        if 'analytics' in subject_lower:
            return ('Google', 'Analytics')
        if re.search(r'verification code|verify', subject, re.I):
            return ('Security', 'Google Verification')
        return ('Google', 'Other Google')

    # --- Microsoft ---
    if 'microsoft.com' in domain or 'outlook.com' in domain or 'hotmail.com' in domain:
        if 'security' in subject_lower or 'protection' in domain:
            return ('Security', 'Microsoft Security')
        return ('Personal', 'Microsoft/Outlook')

    # --- Social Media ---
    if 'reddit' in domain:
        return ('Social', 'Reddit')

    # --- Business/Work ---
    if 'frontiertechstrategies' in domain:
        return ('Business', 'Appointments')
    if 'crsaustralia.gov.au' in domain:
        return ('Business', 'Job Applications')
    if 'v6send.net' in domain:
        return ('Shopping', 'Automotive Dealers')

    # === SUBJECT-BASED FALLBACK ===
    if re.search(r'security alert|verification code|sign.?in|password|2fa', subject, re.I):
        return ('Security', 'General Security')
    if re.search(r'order.*ship|receipt|payment|invoice|purchase', subject, re.I):
        return ('Transactions', 'Orders/Receipts')
    if re.search(r'trial|subscription|billing|renew', subject, re.I):
        return ('Billing', 'Subscriptions')
    if re.search(r'terms of service|privacy policy|legal', subject, re.I):
        return ('Legal', 'Policy Updates')
    if re.search(r'welcome to|getting started', subject, re.I):
        return ('Onboarding', 'Welcome Emails')

    # --- Personal contacts ---
    # Checked last so gmail.com senders with a recognisable subject are
    # still picked up by the subject fallbacks above.
    if 'gmail.com' in domain:
        return ('Personal', 'Gmail Contacts')

    return ('Uncategorized', 'Unknown')
def extract_order_ids(emails):
    """
    Pull order/transaction identifiers out of email subjects.

    Each subject is tested against a fixed, ordered list of patterns
    (case-insensitive); only the first pattern that matches contributes a
    record, so an email yields at most one entry.

    Args:
        emails: Iterable of objects exposing ``subject``, ``date`` and
            ``sender`` attributes.

    Returns:
        list[dict]: One dict per matching email with keys ``id``, ``type``,
        ``subject``, ``date`` (stringified, or None when falsy) and ``sender``.
    """
    # (label, regex) pairs, most specific first; order decides which label wins.
    recognisers = (
        ('AliExpress Order', r'Order\s+(\d{10,})'),
        ('Receipt ID', r'receipt.*(\d{4}-\d{4}-\d{4})'),
        ('Generic Order ID', r'#(\d{4,})'),
    )

    found = []
    for message in emails:
        text = message.subject or ""

        # Lazily evaluate the patterns so searching stops at the first hit.
        attempts = ((label, re.search(rx, text, re.I)) for label, rx in recognisers)
        hit = next(((label, m) for label, m in attempts if m), None)
        if hit is None:
            continue

        label, m = hit
        record = {
            'id': m.group(1),
            'type': label,
            'subject': text,
            'date': str(message.date) if message.date else None,
            'sender': message.sender,
        }
        found.append(record)

    return found
def analyze_time_distribution(emails):
    """
    Analyze email distribution over time.

    Emails whose ``date`` is falsy are ignored. Emails whose ``date`` does not
    behave like a date/datetime (missing ``year``/``month``/``weekday()`` or
    non-integer fields) are skipped entirely, so the three breakdowns always
    describe the same set of emails.

    Args:
        emails: Iterable of objects exposing a ``date`` attribute.

    Returns:
        dict with three keys:
            ``by_year``: year -> count, ordered most common first.
            ``by_month``: ``"YYYY-MM"`` -> count, sorted by key.
            ``by_day_of_week``: ``Mon``..``Sun`` -> count (0 when absent).
    """
    by_year = Counter()
    by_month = Counter()
    by_day_of_week = Counter()
    day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    for email in emails:
        date = email.date
        if not date:
            continue
        try:
            # Derive every key before touching a counter: a failure part-way
            # must not leave the email counted in one breakdown but not another.
            year = date.year
            month_key = f"{date.year}-{date.month:02d}"
            day_key = day_names[date.weekday()]
        except (AttributeError, TypeError, ValueError, IndexError):
            # Best effort: a malformed date value is skipped, but unrelated
            # errors (e.g. KeyboardInterrupt) are no longer swallowed.
            continue
        by_year[year] += 1
        by_month[month_key] += 1
        by_day_of_week[day_key] += 1

    return {
        'by_year': dict(by_year.most_common()),
        'by_month': dict(sorted(by_month.items())),
        'by_day_of_week': {d: by_day_of_week.get(d, 0) for d in day_names}
    }
def _percent(count, total):
    """Return ``count`` as a percentage of ``total``; 0.0 when ``total`` is zero."""
    return count / total * 100 if total else 0.0


def main(email_dir=None):
    """
    Run the full analysis: parse, classify, print the report and save JSON.

    Args:
        email_dir: Directory handed to ``LocalFileParser``. Defaults to the
            brett-gmail dataset location this script was written for.

    Side effects:
        Prints the report to stdout and writes ``brett_gmail_analysis.json``
        into the ``data`` directory under the parent of this file's directory
        (created if missing).
    """
    if email_dir is None:
        email_dir = "/home/bob/Documents/Email Manager/emails/brett-gmail"
    output_dir = Path(__file__).parent.parent / "data"
    output_dir.mkdir(exist_ok=True)

    print("="*70)
    print("BRETT GMAIL DATASET ANALYSIS")
    print("="*70)
    print(f"\nSource: {email_dir}")
    print(f"Output: {output_dir}")

    # Parse emails
    print("\nParsing emails...")
    parser = LocalFileParser(email_dir)
    emails = parser.parse_emails()
    total = len(emails)
    print(f"Total emails: {total}")

    # Date range
    dates = [e.date for e in emails if e.date]
    if dates:
        dates.sort()
        print(f"Date range: {dates[0].strftime('%Y-%m-%d')} to {dates[-1].strftime('%Y-%m-%d')}")

    # Classify all emails in a single pass; the per-category subcategory
    # counters avoid classifying every email a second time for the summary.
    print("\nClassifying emails...")
    category_counts = Counter()
    subcategory_counts = Counter()
    subcats_by_category = defaultdict(Counter)
    for email in emails:
        category, subcategory = classify_email(email)
        category_counts[category] += 1
        subcategory_counts[subcategory] += 1
        subcats_by_category[category][subcategory] += 1

    # Print category summary
    print("\n" + "="*70)
    print("CATEGORY SUMMARY")
    print("="*70)
    for category, count in category_counts.most_common():
        pct = _percent(count, total)
        # One glyph per 2%; the previous empty-string literal made every
        # bar render blank.
        bar = "█" * int(pct / 2)
        print(f"\n{category} ({count} emails, {pct:.1f}%)")
        print(f" {bar}")
        # Show subcategories
        for subcat, subcount in subcats_by_category[category].most_common():
            print(f" - {subcat}: {subcount}")

    # Analyze senders
    print("\n" + "="*70)
    print("TOP SENDERS BY VOLUME")
    print("="*70)
    sender_counts = Counter(e.sender for e in emails)
    for sender, count in sender_counts.most_common(15):
        pct = _percent(count, total)
        print(f" {count:4d} ({pct:4.1f}%) {sender}")

    # Time analysis
    print("\n" + "="*70)
    print("TIME DISTRIBUTION")
    print("="*70)
    time_dist = analyze_time_distribution(emails)
    print("\nBy Year:")
    for year, count in sorted(time_dist['by_year'].items()):
        bar = "█" * (count // 10)
        print(f" {year}: {count:4d} {bar}")
    print("\nBy Day of Week:")
    for day, count in time_dist['by_day_of_week'].items():
        bar = "█" * (count // 5)
        print(f" {day}: {count:3d} {bar}")

    # Extract orders
    print("\n" + "="*70)
    print("ORDER/TRANSACTION IDs FOUND")
    print("="*70)
    orders = extract_order_ids(emails)
    if orders:
        for order in orders[:10]:
            print(f" [{order['type']}] {order['id']}")
            print(f" Subject: {order['subject'][:60]}...")
    else:
        print(" No order IDs detected in subjects")

    # Actionable insights
    print("\n" + "="*70)
    print("ACTIONABLE INSIGHTS")
    print("="*70)

    # High-volume automated senders
    automated_domains = ['mutualart.com', 'tripadvisor.com', 'ebay.com', 'spotify.com']
    auto_count = sum(1 for e in emails if any(d in (e.sender or '') for d in automated_domains))
    print(f"\n1. AUTOMATED EMAILS: {auto_count} ({_percent(auto_count, total):.1f}%)")
    print(" - MutualArt alerts: Consider aggregating to weekly digest")
    print(" - Tripadvisor: Can be filtered to trash or separate folder")
    print(" - eBay/Spotify: Promotional, low priority")

    # Security alerts
    security_count = category_counts.get('Security', 0)
    print(f"\n2. SECURITY ALERTS: {security_count} ({_percent(security_count, total):.1f}%)")
    print(" - Google security: Review for legitimate sign-in attempts")
    print(" - Should NOT be auto-filtered")

    # Business/Work
    business_count = category_counts.get('Business', 0) + category_counts.get('Google', 0)
    print(f"\n3. BUSINESS-RELATED: {business_count} ({_percent(business_count, total):.1f}%)")
    print(" - Google Business Profile reports: Monthly review")
    print(" - Job applications: High priority")
    print(" - Appointments: Calendar integration")

    # AI Services (professional interest)
    ai_count = category_counts.get('AI Services', 0) + category_counts.get('Developer Tools', 0)
    print(f"\n4. AI/DEVELOPER TOOLS: {ai_count} ({_percent(ai_count, total):.1f}%)")
    print(" - Anthropic, OpenAI, Lambda: Keep for reference")
    print(" - ngrok, Docker, Cursor: Developer updates")

    # Personal
    personal_count = category_counts.get('Personal', 0)
    print(f"\n5. PERSONAL: {personal_count} ({_percent(personal_count, total):.1f}%)")
    print(" - Gmail contacts: May need human review")
    print(" - Microsoft/Outlook: Check for spam")

    # Save analysis data
    uncategorized = category_counts.get('Uncategorized', 0)
    categorized = total - uncategorized
    analysis_data = {
        'metadata': {
            'total_emails': total,
            'date_range': {
                'start': str(dates[0]) if dates else None,
                'end': str(dates[-1]) if dates else None
            },
            'analyzed_at': datetime.now().isoformat()
        },
        'categories': dict(category_counts),
        'subcategories': dict(subcategory_counts),
        'top_senders': dict(sender_counts.most_common(50)),
        'time_distribution': time_dist,
        'orders_found': orders,
        # "accuracy" here is coverage: the share of emails that matched any
        # rule. _percent keeps it at 0.0 for an empty dataset instead of
        # raising ZeroDivisionError.
        'classification_accuracy': {
            'categorized': categorized,
            'uncategorized': uncategorized,
            'accuracy_pct': _percent(categorized, total)
        }
    }
    output_file = output_dir / "brett_gmail_analysis.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(analysis_data, f, indent=2)
    print(f"\n\nAnalysis saved to: {output_file}")

    print("\n" + "="*70)
    print(f"CLASSIFICATION ACCURACY: {analysis_data['classification_accuracy']['accuracy_pct']:.1f}%")
    print(f"({analysis_data['classification_accuracy']['categorized']} categorized, "
          f"{analysis_data['classification_accuracy']['uncategorized']} uncategorized)")
    print("="*70)
# Script entry point: run the analysis only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()