#!/usr/bin/env python3
"""
Brett Gmail Dataset Analyzer
============================

CUSTOM script for analyzing the brett-gmail email dataset.
NOT portable to other datasets without modification.

Usage:
    python tools/brett_gmail_analyzer.py

Output:
    - Console report with comprehensive statistics
    - data/brett_gmail_analysis.json with full analysis data
"""

import json
import re
import sys
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path

# Add parent to path for imports (main() imports the project's parser).
sys.path.insert(0, str(Path(__file__).parent.parent))

# Default location of the dataset; main() accepts an override.
DEFAULT_EMAIL_DIR = "/home/bob/Documents/Email Manager/emails/brett-gmail"

# Ordered (pattern, label) pairs; the first pattern matching a subject wins.
ORDER_PATTERNS = [
    (re.compile(r'Order\s+(\d{10,})', re.I), 'AliExpress Order'),
    (re.compile(r'receipt.*(\d{4}-\d{4}-\d{4})', re.I), 'Receipt ID'),
    (re.compile(r'#(\d{4,})', re.I), 'Generic Order ID'),
]

DAY_NAMES = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']


# =============================================================================
# CLASSIFICATION RULES - CUSTOM FOR BRETT'S GMAIL
# =============================================================================

def classify_email(email):
    """
    Classify email into categories based on sender domain and subject patterns.

    Priority: Sender domain > Subject keywords

    Args:
        email: Object exposing ``sender`` and ``subject`` attributes; either
            may be ``None`` and is then treated as an empty string.

    Returns:
        A ``(category, subcategory)`` tuple of strings.
        ``('Uncategorized', 'Unknown')`` is returned when no rule matches.
    """
    sender = email.sender or ""
    subject = email.subject or ""
    # Everything after the last '@'. Domains are case-insensitive, so
    # normalise before comparing against the lowercase needles below.
    # Rules are substring tests, which also tolerates a trailing '>' from
    # "Name <user@host>" style senders.
    domain = (sender.split('@')[-1] if '@' in sender else sender).lower()
    subject_lower = subject.lower()

    # === HIGH-LEVEL CATEGORIES ===

    # --- Art & Collectibles ---
    if 'mutualart.com' in domain:
        return ('Art & Collectibles', 'MutualArt Alerts')

    # --- Travel & Tourism ---
    if 'tripadvisor.com' in domain:
        return ('Travel & Tourism', 'Tripadvisor')
    if 'booking.com' in domain:
        return ('Travel & Tourism', 'Booking.com')

    # --- Entertainment & Streaming ---
    if 'spotify.com' in domain:
        if 'concert' in subject_lower or 'live' in subject_lower:
            return ('Entertainment', 'Spotify Concerts')
        return ('Entertainment', 'Spotify Promotions')
    if 'youtube.com' in domain:
        return ('Entertainment', 'YouTube')
    if 'onlyfans.com' in domain:
        return ('Entertainment', 'OnlyFans')
    if 'ign.com' in domain:
        return ('Entertainment', 'IGN Gaming')

    # --- Shopping & eCommerce ---
    if 'ebay.com' in domain or 'reply.ebay' in domain:
        return ('Shopping', 'eBay')
    if 'aliexpress.com' in domain:
        return ('Shopping', 'AliExpress')
    if 'alibabacloud.com' in domain or 'alibaba-inc.com' in domain:
        return ('Tech Services', 'Alibaba Cloud')
    if '4wdsupacentre' in domain:
        return ('Shopping', '4WD Supacentre')
    if 'mikeblewitt' in domain or 'mbcoffscoast' in domain:
        return ('Shopping', 'Mike Blewitt/MBC')
    if 'auspost.com.au' in domain:
        return ('Shopping', 'Australia Post')
    if 'printfresh' in domain:
        return ('Business', 'Timesheets')

    # --- AI & Tech Services ---
    if 'anthropic.com' in domain or 'claude.com' in domain:
        return ('AI Services', 'Anthropic/Claude')
    if 'openai.com' in domain:
        return ('AI Services', 'OpenAI')
    if 'openrouter.ai' in domain:
        return ('AI Services', 'OpenRouter')
    if 'lambda' in domain:
        return ('AI Services', 'Lambda Labs')
    if 'x.ai' in domain:
        return ('AI Services', 'xAI')
    if 'perplexity.ai' in domain:
        return ('AI Services', 'Perplexity')
    if 'cursor.com' in domain:
        return ('Developer Tools', 'Cursor')

    # --- Developer Tools ---
    if 'ngrok.com' in domain:
        return ('Developer Tools', 'ngrok')
    if 'docker.com' in domain:
        return ('Developer Tools', 'Docker')

    # --- Productivity Apps ---
    if 'screencastify.com' in domain:
        return ('Productivity', 'Screencastify')
    if 'tango.us' in domain:
        return ('Productivity', 'Tango')
    if 'xplor.com' in domain or 'myxplor' in domain:
        return ('Services', 'Xplor Childcare')

    # --- Google Services ---
    # 'google.com' also covers subdomains such as accounts.google.com.
    if 'google.com' in domain:
        if 'performance report' in subject_lower or 'business profile' in subject_lower:
            return ('Google', 'Business Profile')
        if 'security' in subject_lower or 'sign-in' in subject_lower:
            return ('Security', 'Google Security')
        if 'firebase' in subject_lower or 'firestore' in subject_lower:
            return ('Developer Tools', 'Firebase')
        if 'ads' in subject_lower:
            return ('Google', 'Google Ads')
        if 'analytics' in subject_lower:
            return ('Google', 'Analytics')
        if re.search(r'verification code|verify', subject, re.I):
            return ('Security', 'Google Verification')
        return ('Google', 'Other Google')

    # --- Microsoft ---
    if 'microsoft.com' in domain or 'outlook.com' in domain or 'hotmail.com' in domain:
        if 'security' in subject_lower or 'protection' in domain:
            return ('Security', 'Microsoft Security')
        return ('Personal', 'Microsoft/Outlook')

    # --- Social Media ---
    if 'reddit' in domain:
        return ('Social', 'Reddit')

    # --- Business/Work ---
    if 'frontiertechstrategies' in domain:
        return ('Business', 'Appointments')
    if 'crsaustralia.gov.au' in domain:
        return ('Business', 'Job Applications')
    if 'v6send.net' in domain:
        return ('Shopping', 'Automotive Dealers')

    # === SUBJECT-BASED FALLBACK ===
    if re.search(r'security alert|verification code|sign.?in|password|2fa', subject, re.I):
        return ('Security', 'General Security')
    if re.search(r'order.*ship|receipt|payment|invoice|purchase', subject, re.I):
        return ('Transactions', 'Orders/Receipts')
    if re.search(r'trial|subscription|billing|renew', subject, re.I):
        return ('Billing', 'Subscriptions')
    if re.search(r'terms of service|privacy policy|legal', subject, re.I):
        return ('Legal', 'Policy Updates')
    if re.search(r'welcome to|getting started', subject, re.I):
        return ('Onboarding', 'Welcome Emails')

    # --- Personal contacts ---
    if 'gmail.com' in domain:
        return ('Personal', 'Gmail Contacts')

    return ('Uncategorized', 'Unknown')


def extract_order_ids(emails):
    """
    Extract order/transaction IDs from email subjects.

    Each subject is tested against ORDER_PATTERNS in order; only the first
    matching pattern contributes an entry for that email.

    Args:
        emails: Iterable of objects with ``subject``, ``sender`` and ``date``
            attributes.

    Returns:
        List of dicts with keys ``id``, ``type``, ``subject``, ``date``
        (``str(email.date)`` or ``None``) and ``sender``.
    """
    orders = []
    for email in emails:
        subject = email.subject or ""
        for pattern, order_type in ORDER_PATTERNS:
            match = pattern.search(subject)
            if match:
                orders.append({
                    'id': match.group(1),
                    'type': order_type,
                    'subject': subject,
                    'date': str(email.date) if email.date else None,
                    'sender': email.sender,
                })
                break
    return orders


def analyze_time_distribution(emails):
    """
    Analyze email distribution over time.

    Emails whose ``date`` is falsy, or does not behave like a
    ``datetime`` (missing ``year``/``month``/``weekday()``), are skipped.

    Args:
        emails: Iterable of objects with a ``date`` attribute.

    Returns:
        Dict with ``by_year`` (most frequent first), ``by_month``
        (``"YYYY-MM"`` keys, sorted) and ``by_day_of_week`` (Mon..Sun, zero
        filled).
    """
    by_year = Counter()
    by_month = Counter()
    by_day_of_week = Counter()

    for email in emails:
        date = email.date
        if not date:
            continue
        try:
            # Derive every key before touching a counter so that a malformed
            # date cannot be counted in one bucket but not the others.
            year = date.year
            month_key = f"{date.year}-{date.month:02d}"
            day_name = DAY_NAMES[date.weekday()]
        except (AttributeError, TypeError, ValueError):
            # Best effort: ignore dates that are not datetime-like.
            continue
        by_year[year] += 1
        by_month[month_key] += 1
        by_day_of_week[day_name] += 1

    return {
        'by_year': dict(by_year.most_common()),
        'by_month': dict(sorted(by_month.items())),
        'by_day_of_week': {d: by_day_of_week.get(d, 0) for d in DAY_NAMES},
    }


def _percent(part, total):
    """Return ``part`` as a percentage of ``total``; 0.0 when total is zero."""
    return part / total * 100 if total else 0.0


def main(email_dir=DEFAULT_EMAIL_DIR):
    """
    Run the full analysis: parse, classify, print the report, write the JSON.

    Args:
        email_dir: Directory handed to ``LocalFileParser``. Defaults to the
            brett-gmail dataset location.
    """
    # Imported here so the classification helpers above stay importable
    # without the project's parser package on the path.
    from src.calibration.local_file_parser import LocalFileParser

    output_dir = Path(__file__).parent.parent / "data"
    output_dir.mkdir(exist_ok=True)

    print("="*70)
    print("BRETT GMAIL DATASET ANALYSIS")
    print("="*70)
    print(f"\nSource: {email_dir}")
    print(f"Output: {output_dir}")

    # Parse emails
    print("\nParsing emails...")
    parser = LocalFileParser(email_dir)
    emails = parser.parse_emails()
    total = len(emails)
    print(f"Total emails: {total}")

    # Date range
    dates = sorted(e.date for e in emails if e.date)
    if dates:
        print(f"Date range: {dates[0].strftime('%Y-%m-%d')} to {dates[-1].strftime('%Y-%m-%d')}")

    # Classify all emails (single pass; subcategories tallied per category)
    print("\nClassifying emails...")
    category_counts = Counter()
    subcategory_counts = Counter()
    subcats_by_category = defaultdict(Counter)

    for email in emails:
        category, subcategory = classify_email(email)
        category_counts[category] += 1
        subcategory_counts[subcategory] += 1
        subcats_by_category[category][subcategory] += 1

    # Print category summary
    print("\n" + "="*70)
    print("CATEGORY SUMMARY")
    print("="*70)

    for category, count in category_counts.most_common():
        pct = _percent(count, total)
        bar = "█" * int(pct / 2)
        print(f"\n{category} ({count} emails, {pct:.1f}%)")
        print(f" {bar}")

        # Show subcategories
        for subcat, subcount in subcats_by_category[category].most_common():
            print(f" - {subcat}: {subcount}")

    # Analyze senders
    print("\n" + "="*70)
    print("TOP SENDERS BY VOLUME")
    print("="*70)

    sender_counts = Counter(e.sender for e in emails)
    for sender, count in sender_counts.most_common(15):
        pct = _percent(count, total)
        print(f" {count:4d} ({pct:4.1f}%) {sender}")

    # Time analysis
    print("\n" + "="*70)
    print("TIME DISTRIBUTION")
    print("="*70)

    time_dist = analyze_time_distribution(emails)

    print("\nBy Year:")
    for year, count in sorted(time_dist['by_year'].items()):
        bar = "█" * (count // 10)
        print(f" {year}: {count:4d} {bar}")

    print("\nBy Day of Week:")
    for day, count in time_dist['by_day_of_week'].items():
        bar = "█" * (count // 5)
        print(f" {day}: {count:3d} {bar}")

    # Extract orders
    print("\n" + "="*70)
    print("ORDER/TRANSACTION IDs FOUND")
    print("="*70)

    orders = extract_order_ids(emails)
    if orders:
        for order in orders[:10]:
            print(f" [{order['type']}] {order['id']}")
            print(f" Subject: {order['subject'][:60]}...")
    else:
        print(" No order IDs detected in subjects")

    # Actionable insights
    print("\n" + "="*70)
    print("ACTIONABLE INSIGHTS")
    print("="*70)

    # High-volume automated senders
    automated_domains = ['mutualart.com', 'tripadvisor.com', 'ebay.com', 'spotify.com']
    auto_count = sum(
        1 for e in emails
        if any(d in (e.sender or '').lower() for d in automated_domains)
    )
    print(f"\n1. AUTOMATED EMAILS: {auto_count} ({_percent(auto_count, total):.1f}%)")
    print(" - MutualArt alerts: Consider aggregating to weekly digest")
    print(" - Tripadvisor: Can be filtered to trash or separate folder")
    print(" - eBay/Spotify: Promotional, low priority")

    # Security alerts
    security_count = category_counts.get('Security', 0)
    print(f"\n2. SECURITY ALERTS: {security_count} ({_percent(security_count, total):.1f}%)")
    print(" - Google security: Review for legitimate sign-in attempts")
    print(" - Should NOT be auto-filtered")

    # Business/Work
    business_count = category_counts.get('Business', 0) + category_counts.get('Google', 0)
    print(f"\n3. BUSINESS-RELATED: {business_count} ({_percent(business_count, total):.1f}%)")
    print(" - Google Business Profile reports: Monthly review")
    print(" - Job applications: High priority")
    print(" - Appointments: Calendar integration")

    # AI Services (professional interest)
    ai_count = category_counts.get('AI Services', 0) + category_counts.get('Developer Tools', 0)
    print(f"\n4. AI/DEVELOPER TOOLS: {ai_count} ({_percent(ai_count, total):.1f}%)")
    print(" - Anthropic, OpenAI, Lambda: Keep for reference")
    print(" - ngrok, Docker, Cursor: Developer updates")

    # Personal
    personal_count = category_counts.get('Personal', 0)
    print(f"\n5. PERSONAL: {personal_count} ({_percent(personal_count, total):.1f}%)")
    print(" - Gmail contacts: May need human review")
    print(" - Microsoft/Outlook: Check for spam")

    # Save analysis data
    uncategorized = category_counts.get('Uncategorized', 0)
    categorized = total - uncategorized
    analysis_data = {
        'metadata': {
            'total_emails': total,
            'date_range': {
                'start': str(dates[0]) if dates else None,
                'end': str(dates[-1]) if dates else None,
            },
            'analyzed_at': datetime.now().isoformat(),
        },
        'categories': dict(category_counts),
        'subcategories': dict(subcategory_counts),
        'top_senders': dict(sender_counts.most_common(50)),
        'time_distribution': time_dist,
        'orders_found': orders,
        'classification_accuracy': {
            'categorized': categorized,
            'uncategorized': uncategorized,
            # 0.0 for an empty dataset instead of a ZeroDivisionError.
            'accuracy_pct': _percent(categorized, total),
        },
    }

    output_file = output_dir / "brett_gmail_analysis.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(analysis_data, f, indent=2)

    print(f"\n\nAnalysis saved to: {output_file}")
    print("\n" + "="*70)
    print(f"CLASSIFICATION ACCURACY: {analysis_data['classification_accuracy']['accuracy_pct']:.1f}%")
    print(f"({analysis_data['classification_accuracy']['categorized']} categorized, "
          f"{analysis_data['classification_accuracy']['uncategorized']} uncategorized)")
    print("="*70)


if __name__ == '__main__':
    main()