# Changelog (pasted commit message, kept for reference):
# - Rewrote CLAUDE.md with comprehensive development guide
# - Archived 20 old docs to docs/archive/
# - Added PROJECT_ROADMAP_2025.md with research learnings
# - Added CLASSIFICATION_METHODS_COMPARISON.md
# - Added SESSION_HANDOVER_20251128.md
# - Added tools for analysis (brett_gmail/microsoft analyzers)
# - Updated .gitignore for archive folders
# - Config changes for local vLLM endpoint
# (viewer metadata: 392 lines, 14 KiB, Python)
#!/usr/bin/env python3
"""
Brett Gmail Dataset Analyzer
============================

CUSTOM script for analyzing the brett-gmail email dataset.
NOT portable to other datasets without modification.

Usage:
    python tools/brett_gmail_analyzer.py

Output:
    - Console report with comprehensive statistics
    - data/brett_gmail_analysis.json with full analysis data
"""
|
import json
|
|
import re
|
|
from collections import Counter, defaultdict
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# Add parent to path for imports
|
|
import sys
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from src.calibration.local_file_parser import LocalFileParser
|
|
|
|
|
|
# =============================================================================
|
|
# CLASSIFICATION RULES - CUSTOM FOR BRETT'S GMAIL
|
|
# =============================================================================
|
|
|
|
def classify_email(email):
    """
    Classify an email into a (category, subcategory) pair.

    Rules are CUSTOM for Brett's Gmail dataset. Matching priority:
    sender-domain rules first (in the order written below), then
    subject-keyword fallbacks, then a final 'Uncategorized' bucket.
    Domain tests are substring checks, so subdomains match too
    (e.g. 'accounts.google.com' matches the 'google.com' rule).

    Args:
        email: parsed email object with .sender and .subject attributes
               (either may be None).

    Returns:
        tuple[str, str]: (category, subcategory).
    """
    sender = email.sender or ""
    subject = email.subject or ""
    # Hoisted: lowercase once instead of once per keyword check.
    subject_lower = subject.lower()
    # Everything after the last '@' (whole sender string if no '@').
    domain = sender.split('@')[-1] if '@' in sender else sender

    # === HIGH-LEVEL CATEGORIES ===

    # --- Art & Collectibles ---
    if 'mutualart.com' in domain:
        return ('Art & Collectibles', 'MutualArt Alerts')

    # --- Travel & Tourism ---
    if 'tripadvisor.com' in domain:
        return ('Travel & Tourism', 'Tripadvisor')
    if 'booking.com' in domain:
        return ('Travel & Tourism', 'Booking.com')

    # --- Entertainment & Streaming ---
    if 'spotify.com' in domain:
        if 'concert' in subject_lower or 'live' in subject_lower:
            return ('Entertainment', 'Spotify Concerts')
        return ('Entertainment', 'Spotify Promotions')
    if 'youtube.com' in domain:
        return ('Entertainment', 'YouTube')
    if 'onlyfans.com' in domain:
        return ('Entertainment', 'OnlyFans')
    if 'ign.com' in domain:
        return ('Entertainment', 'IGN Gaming')

    # --- Shopping & eCommerce ---
    if 'ebay.com' in domain or 'reply.ebay' in domain:
        return ('Shopping', 'eBay')
    if 'aliexpress.com' in domain:
        return ('Shopping', 'AliExpress')
    if 'alibabacloud.com' in domain or 'alibaba-inc.com' in domain:
        return ('Tech Services', 'Alibaba Cloud')
    if '4wdsupacentre' in domain:
        return ('Shopping', '4WD Supacentre')
    if 'mikeblewitt' in domain or 'mbcoffscoast' in domain:
        return ('Shopping', 'Mike Blewitt/MBC')
    if 'auspost.com.au' in domain:
        return ('Shopping', 'Australia Post')
    if 'printfresh' in domain:
        return ('Business', 'Timesheets')

    # --- AI & Tech Services ---
    if 'anthropic.com' in domain or 'claude.com' in domain:
        return ('AI Services', 'Anthropic/Claude')
    if 'openai.com' in domain:
        return ('AI Services', 'OpenAI')
    if 'openrouter.ai' in domain:
        return ('AI Services', 'OpenRouter')
    if 'lambda' in domain:
        return ('AI Services', 'Lambda Labs')
    if 'x.ai' in domain:
        return ('AI Services', 'xAI')
    if 'perplexity.ai' in domain:
        return ('AI Services', 'Perplexity')
    if 'cursor.com' in domain:
        return ('Developer Tools', 'Cursor')

    # --- Developer Tools ---
    if 'ngrok.com' in domain:
        return ('Developer Tools', 'ngrok')
    if 'docker.com' in domain:
        return ('Developer Tools', 'Docker')

    # --- Productivity Apps ---
    if 'screencastify.com' in domain:
        return ('Productivity', 'Screencastify')
    if 'tango.us' in domain:
        return ('Productivity', 'Tango')
    if 'xplor.com' in domain or 'myxplor' in domain:
        return ('Services', 'Xplor Childcare')

    # --- Google Services ---
    # NOTE: the substring test already matches 'accounts.google.com',
    # so the former explicit check for it was redundant and was removed.
    if 'google.com' in domain:
        if 'performance report' in subject_lower or 'business profile' in subject_lower:
            return ('Google', 'Business Profile')
        if 'security' in subject_lower or 'sign-in' in subject_lower:
            return ('Security', 'Google Security')
        if 'firebase' in subject_lower or 'firestore' in subject_lower:
            return ('Developer Tools', 'Firebase')
        if 'ads' in subject_lower:
            return ('Google', 'Google Ads')
        if 'analytics' in subject_lower:
            return ('Google', 'Analytics')
        if re.search(r'verification code|verify', subject, re.I):
            return ('Security', 'Google Verification')
        return ('Google', 'Other Google')

    # --- Microsoft ---
    if 'microsoft.com' in domain or 'outlook.com' in domain or 'hotmail.com' in domain:
        # 'protection' is checked against the DOMAIN here (e.g.
        # *.protection.outlook.com relays), not the subject.
        if 'security' in subject_lower or 'protection' in domain:
            return ('Security', 'Microsoft Security')
        return ('Personal', 'Microsoft/Outlook')

    # --- Social Media ---
    if 'reddit' in domain:
        return ('Social', 'Reddit')

    # --- Business/Work ---
    if 'frontiertechstrategies' in domain:
        return ('Business', 'Appointments')
    if 'crsaustralia.gov.au' in domain:
        return ('Business', 'Job Applications')
    if 'v6send.net' in domain:
        return ('Shopping', 'Automotive Dealers')

    # === SUBJECT-BASED FALLBACK ===

    if re.search(r'security alert|verification code|sign.?in|password|2fa', subject, re.I):
        return ('Security', 'General Security')

    if re.search(r'order.*ship|receipt|payment|invoice|purchase', subject, re.I):
        return ('Transactions', 'Orders/Receipts')

    if re.search(r'trial|subscription|billing|renew', subject, re.I):
        return ('Billing', 'Subscriptions')

    if re.search(r'terms of service|privacy policy|legal', subject, re.I):
        return ('Legal', 'Policy Updates')

    if re.search(r'welcome to|getting started', subject, re.I):
        return ('Onboarding', 'Welcome Emails')

    # --- Personal contacts ---
    if 'gmail.com' in domain:
        return ('Personal', 'Gmail Contacts')

    return ('Uncategorized', 'Unknown')
|
|
|
|
|
|
def extract_order_ids(emails):
    """
    Extract order/transaction IDs from email subject lines.

    Patterns are tried in priority order; the FIRST match wins for a
    given email and the remaining patterns are skipped.

    Args:
        emails: iterable of parsed email objects (.subject, .date, .sender;
                any of them may be None).

    Returns:
        list[dict]: one record per matched email with keys
        'id', 'type', 'subject', 'date' (str or None), 'sender'.
    """
    # Compiled once, hoisted out of the per-email loop (the original
    # passed raw pattern strings to re.search on every iteration).
    order_patterns = [
        (re.compile(r'Order\s+(\d{10,})', re.I), 'AliExpress Order'),
        (re.compile(r'receipt.*(\d{4}-\d{4}-\d{4})', re.I), 'Receipt ID'),
        (re.compile(r'#(\d{4,})', re.I), 'Generic Order ID'),
    ]

    orders = []
    for email in emails:
        subject = email.subject or ""
        for pattern, order_type in order_patterns:
            match = pattern.search(subject)
            if match:
                orders.append({
                    'id': match.group(1),
                    'type': order_type,
                    'subject': subject,
                    'date': str(email.date) if email.date else None,
                    'sender': email.sender
                })
                break  # first matching pattern wins for this email
    return orders
|
|
|
|
|
|
def analyze_time_distribution(emails):
    """
    Aggregate email counts by year, by month, and by weekday.

    Args:
        emails: iterable of parsed email objects; only entries with a
                truthy, datetime-like .date attribute are counted.

    Returns:
        dict with keys:
            'by_year': {year: count}, ordered most-common-first
            'by_month': {'YYYY-MM': count}, sorted chronologically
            'by_day_of_week': {'Mon'..'Sun': count}, zero-filled
    """
    by_year = Counter()
    by_month = Counter()
    by_day_of_week = Counter()

    day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    for email in emails:
        if email.date:
            try:
                by_year[email.date.year] += 1
                by_month[f"{email.date.year}-{email.date.month:02d}"] += 1
                by_day_of_week[day_names[email.date.weekday()]] += 1
            except (AttributeError, TypeError, ValueError):
                # Narrowed from a bare `except:` (which also swallowed
                # KeyboardInterrupt/SystemExit): skip only entries whose
                # .date is not datetime-like.
                pass

    return {
        'by_year': dict(by_year.most_common()),
        'by_month': dict(sorted(by_month.items())),
        'by_day_of_week': {d: by_day_of_week.get(d, 0) for d in day_names}
    }
|
|
|
|
|
|
def main():
    """
    Run the full brett-gmail analysis: parse, classify, print a console
    report, and save a JSON summary to data/brett_gmail_analysis.json.
    """
    # Hard-coded source path: this script is CUSTOM for Brett's dataset.
    email_dir = "/home/bob/Documents/Email Manager/emails/brett-gmail"
    output_dir = Path(__file__).parent.parent / "data"
    output_dir.mkdir(exist_ok=True)

    print("="*70)
    print("BRETT GMAIL DATASET ANALYSIS")
    print("="*70)
    print(f"\nSource: {email_dir}")
    print(f"Output: {output_dir}")

    # Parse emails
    print("\nParsing emails...")
    parser = LocalFileParser(email_dir)
    emails = parser.parse_emails()
    print(f"Total emails: {len(emails)}")

    # Guard: everything below divides by len(emails), which would raise
    # ZeroDivisionError on an empty or missing dataset.
    if not emails:
        print("No emails parsed - nothing to analyze.")
        return

    # Date range
    dates = [e.date for e in emails if e.date]
    if dates:
        dates.sort()
        print(f"Date range: {dates[0].strftime('%Y-%m-%d')} to {dates[-1].strftime('%Y-%m-%d')}")

    # Classify all emails in a single pass. Per-category sub-counters are
    # collected here so the summary below does not re-classify every email
    # a second time (the original did, doubling the classification work).
    print("\nClassifying emails...")

    category_counts = Counter()
    subcategory_counts = Counter()
    subcats_by_category = defaultdict(Counter)

    for email in emails:
        category, subcategory = classify_email(email)
        category_counts[category] += 1
        subcategory_counts[subcategory] += 1
        subcats_by_category[category][subcategory] += 1

    # Print category summary
    print("\n" + "="*70)
    print("CATEGORY SUMMARY")
    print("="*70)

    for category, count in category_counts.most_common():
        pct = count / len(emails) * 100
        bar = "█" * int(pct / 2)  # 2% per bar character
        print(f"\n{category} ({count} emails, {pct:.1f}%)")
        print(f"  {bar}")

        # Show subcategories (most common first)
        for subcat, subcount in subcats_by_category[category].most_common():
            print(f"  - {subcat}: {subcount}")

    # Analyze senders
    print("\n" + "="*70)
    print("TOP SENDERS BY VOLUME")
    print("="*70)

    sender_counts = Counter(e.sender for e in emails)
    for sender, count in sender_counts.most_common(15):
        pct = count / len(emails) * 100
        print(f"  {count:4d} ({pct:4.1f}%) {sender}")

    # Time analysis
    print("\n" + "="*70)
    print("TIME DISTRIBUTION")
    print("="*70)

    time_dist = analyze_time_distribution(emails)

    print("\nBy Year:")
    for year, count in sorted(time_dist['by_year'].items()):
        bar = "█" * (count // 10)  # 10 emails per bar character
        print(f"  {year}: {count:4d} {bar}")

    print("\nBy Day of Week:")
    for day, count in time_dist['by_day_of_week'].items():
        bar = "█" * (count // 5)  # 5 emails per bar character
        print(f"  {day}: {count:3d} {bar}")

    # Extract orders
    print("\n" + "="*70)
    print("ORDER/TRANSACTION IDs FOUND")
    print("="*70)

    orders = extract_order_ids(emails)
    if orders:
        for order in orders[:10]:
            print(f"  [{order['type']}] {order['id']}")
            print(f"    Subject: {order['subject'][:60]}...")
    else:
        print("  No order IDs detected in subjects")

    # Actionable insights
    print("\n" + "="*70)
    print("ACTIONABLE INSIGHTS")
    print("="*70)

    # High-volume automated senders
    automated_domains = ['mutualart.com', 'tripadvisor.com', 'ebay.com', 'spotify.com']
    auto_count = sum(1 for e in emails if any(d in (e.sender or '') for d in automated_domains))
    print(f"\n1. AUTOMATED EMAILS: {auto_count} ({auto_count/len(emails)*100:.1f}%)")
    print("   - MutualArt alerts: Consider aggregating to weekly digest")
    print("   - Tripadvisor: Can be filtered to trash or separate folder")
    print("   - eBay/Spotify: Promotional, low priority")

    # Security alerts
    security_count = category_counts.get('Security', 0)
    print(f"\n2. SECURITY ALERTS: {security_count} ({security_count/len(emails)*100:.1f}%)")
    print("   - Google security: Review for legitimate sign-in attempts")
    print("   - Should NOT be auto-filtered")

    # Business/Work
    business_count = category_counts.get('Business', 0) + category_counts.get('Google', 0)
    print(f"\n3. BUSINESS-RELATED: {business_count} ({business_count/len(emails)*100:.1f}%)")
    print("   - Google Business Profile reports: Monthly review")
    print("   - Job applications: High priority")
    print("   - Appointments: Calendar integration")

    # AI Services (professional interest)
    ai_count = category_counts.get('AI Services', 0) + category_counts.get('Developer Tools', 0)
    print(f"\n4. AI/DEVELOPER TOOLS: {ai_count} ({ai_count/len(emails)*100:.1f}%)")
    print("   - Anthropic, OpenAI, Lambda: Keep for reference")
    print("   - ngrok, Docker, Cursor: Developer updates")

    # Personal
    personal_count = category_counts.get('Personal', 0)
    print(f"\n5. PERSONAL: {personal_count} ({personal_count/len(emails)*100:.1f}%)")
    print("   - Gmail contacts: May need human review")
    print("   - Microsoft/Outlook: Check for spam")

    # Save analysis data
    analysis_data = {
        'metadata': {
            'total_emails': len(emails),
            'date_range': {
                'start': str(dates[0]) if dates else None,
                'end': str(dates[-1]) if dates else None
            },
            'analyzed_at': datetime.now().isoformat()
        },
        'categories': dict(category_counts),
        'subcategories': dict(subcategory_counts),
        'top_senders': dict(sender_counts.most_common(50)),
        'time_distribution': time_dist,
        'orders_found': orders,
        # "accuracy" here means coverage: share of emails that got a
        # non-'Uncategorized' label, NOT accuracy against ground truth.
        'classification_accuracy': {
            'categorized': len(emails) - category_counts.get('Uncategorized', 0),
            'uncategorized': category_counts.get('Uncategorized', 0),
            'accuracy_pct': (len(emails) - category_counts.get('Uncategorized', 0)) / len(emails) * 100
        }
    }

    output_file = output_dir / "brett_gmail_analysis.json"
    with open(output_file, 'w') as f:
        json.dump(analysis_data, f, indent=2)

    print(f"\n\nAnalysis saved to: {output_file}")
    print("\n" + "="*70)
    print(f"CLASSIFICATION ACCURACY: {analysis_data['classification_accuracy']['accuracy_pct']:.1f}%")
    print(f"({analysis_data['classification_accuracy']['categorized']} categorized, "
          f"{analysis_data['classification_accuracy']['uncategorized']} uncategorized)")
    print("="*70)


if __name__ == '__main__':
    main()
|