# Changelog (pasted commit message, kept for reference):
# - Rewrote CLAUDE.md with comprehensive development guide
# - Archived 20 old docs to docs/archive/
# - Added PROJECT_ROADMAP_2025.md with research learnings
# - Added CLASSIFICATION_METHODS_COMPARISON.md
# - Added SESSION_HANDOVER_20251128.md
# - Added tools for analysis (brett_gmail/microsoft analyzers)
# - Updated .gitignore for archive folders
# - Config changes for local vLLM endpoint
# (viewer metadata: 392 lines, 14 KiB, Python)
#!/usr/bin/env python3
"""
Brett Gmail Dataset Analyzer
============================

CUSTOM script for analyzing the brett-gmail email dataset.
NOT portable to other datasets without modification.

Usage:
    python tools/brett_gmail_analyzer.py

Output:
    - Console report with comprehensive statistics
    - data/brett_gmail_analysis.json with full analysis data
"""
|
import json
|
|
import re
|
|
from collections import Counter, defaultdict
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# Add parent to path for imports
|
|
import sys
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from src.calibration.local_file_parser import LocalFileParser
|
|
|
|
|
|
# =============================================================================
|
|
# CLASSIFICATION RULES - CUSTOM FOR BRETT'S GMAIL
|
|
# =============================================================================
|
|
|
|
def classify_email(email):
    """
    Classify an email into a (category, subcategory) pair.

    Rules are CUSTOM for Brett's Gmail dataset. Matching priority:
    sender-domain rules first (in the order written below), then
    subject-keyword fallbacks, then a final 'Uncategorized' bucket.
    Domain tests are substring checks, so subdomains match too
    (e.g. 'accounts.google.com' matches the 'google.com' rule).

    Args:
        email: parsed email object with .sender and .subject attributes
               (either may be None).

    Returns:
        tuple[str, str]: (category, subcategory).
    """
    sender = email.sender or ""
    subject = email.subject or ""
    # Hoisted: lowercase once instead of once per keyword check.
    subject_lower = subject.lower()
    # Everything after the last '@' (whole sender string if no '@').
    domain = sender.split('@')[-1] if '@' in sender else sender

    # === HIGH-LEVEL CATEGORIES ===

    # --- Art & Collectibles ---
    if 'mutualart.com' in domain:
        return ('Art & Collectibles', 'MutualArt Alerts')

    # --- Travel & Tourism ---
    if 'tripadvisor.com' in domain:
        return ('Travel & Tourism', 'Tripadvisor')
    if 'booking.com' in domain:
        return ('Travel & Tourism', 'Booking.com')

    # --- Entertainment & Streaming ---
    if 'spotify.com' in domain:
        if 'concert' in subject_lower or 'live' in subject_lower:
            return ('Entertainment', 'Spotify Concerts')
        return ('Entertainment', 'Spotify Promotions')
    if 'youtube.com' in domain:
        return ('Entertainment', 'YouTube')
    if 'onlyfans.com' in domain:
        return ('Entertainment', 'OnlyFans')
    if 'ign.com' in domain:
        return ('Entertainment', 'IGN Gaming')

    # --- Shopping & eCommerce ---
    if 'ebay.com' in domain or 'reply.ebay' in domain:
        return ('Shopping', 'eBay')
    if 'aliexpress.com' in domain:
        return ('Shopping', 'AliExpress')
    if 'alibabacloud.com' in domain or 'alibaba-inc.com' in domain:
        return ('Tech Services', 'Alibaba Cloud')
    if '4wdsupacentre' in domain:
        return ('Shopping', '4WD Supacentre')
    if 'mikeblewitt' in domain or 'mbcoffscoast' in domain:
        return ('Shopping', 'Mike Blewitt/MBC')
    if 'auspost.com.au' in domain:
        return ('Shopping', 'Australia Post')
    if 'printfresh' in domain:
        return ('Business', 'Timesheets')

    # --- AI & Tech Services ---
    if 'anthropic.com' in domain or 'claude.com' in domain:
        return ('AI Services', 'Anthropic/Claude')
    if 'openai.com' in domain:
        return ('AI Services', 'OpenAI')
    if 'openrouter.ai' in domain:
        return ('AI Services', 'OpenRouter')
    if 'lambda' in domain:
        return ('AI Services', 'Lambda Labs')
    if 'x.ai' in domain:
        return ('AI Services', 'xAI')
    if 'perplexity.ai' in domain:
        return ('AI Services', 'Perplexity')
    if 'cursor.com' in domain:
        return ('Developer Tools', 'Cursor')

    # --- Developer Tools ---
    if 'ngrok.com' in domain:
        return ('Developer Tools', 'ngrok')
    if 'docker.com' in domain:
        return ('Developer Tools', 'Docker')

    # --- Productivity Apps ---
    if 'screencastify.com' in domain:
        return ('Productivity', 'Screencastify')
    if 'tango.us' in domain:
        return ('Productivity', 'Tango')
    if 'xplor.com' in domain or 'myxplor' in domain:
        return ('Services', 'Xplor Childcare')

    # --- Google Services ---
    # NOTE: the substring test already matches 'accounts.google.com',
    # so the former explicit check for it was redundant and was removed.
    if 'google.com' in domain:
        if 'performance report' in subject_lower or 'business profile' in subject_lower:
            return ('Google', 'Business Profile')
        if 'security' in subject_lower or 'sign-in' in subject_lower:
            return ('Security', 'Google Security')
        if 'firebase' in subject_lower or 'firestore' in subject_lower:
            return ('Developer Tools', 'Firebase')
        if 'ads' in subject_lower:
            return ('Google', 'Google Ads')
        if 'analytics' in subject_lower:
            return ('Google', 'Analytics')
        if re.search(r'verification code|verify', subject, re.I):
            return ('Security', 'Google Verification')
        return ('Google', 'Other Google')

    # --- Microsoft ---
    if 'microsoft.com' in domain or 'outlook.com' in domain or 'hotmail.com' in domain:
        # 'protection' is checked against the DOMAIN here (e.g.
        # *.protection.outlook.com relays), not the subject.
        if 'security' in subject_lower or 'protection' in domain:
            return ('Security', 'Microsoft Security')
        return ('Personal', 'Microsoft/Outlook')

    # --- Social Media ---
    if 'reddit' in domain:
        return ('Social', 'Reddit')

    # --- Business/Work ---
    if 'frontiertechstrategies' in domain:
        return ('Business', 'Appointments')
    if 'crsaustralia.gov.au' in domain:
        return ('Business', 'Job Applications')
    if 'v6send.net' in domain:
        return ('Shopping', 'Automotive Dealers')

    # === SUBJECT-BASED FALLBACK ===

    if re.search(r'security alert|verification code|sign.?in|password|2fa', subject, re.I):
        return ('Security', 'General Security')

    if re.search(r'order.*ship|receipt|payment|invoice|purchase', subject, re.I):
        return ('Transactions', 'Orders/Receipts')

    if re.search(r'trial|subscription|billing|renew', subject, re.I):
        return ('Billing', 'Subscriptions')

    if re.search(r'terms of service|privacy policy|legal', subject, re.I):
        return ('Legal', 'Policy Updates')

    if re.search(r'welcome to|getting started', subject, re.I):
        return ('Onboarding', 'Welcome Emails')

    # --- Personal contacts ---
    if 'gmail.com' in domain:
        return ('Personal', 'Gmail Contacts')

    return ('Uncategorized', 'Unknown')
|
|
|
|
|
|
def extract_order_ids(emails):
    """
    Extract order/transaction IDs from email subject lines.

    Patterns are tried in priority order; the FIRST match wins for a
    given email and the remaining patterns are skipped.

    Args:
        emails: iterable of parsed email objects (.subject, .date, .sender;
                any of them may be None).

    Returns:
        list[dict]: one record per matched email with keys
        'id', 'type', 'subject', 'date' (str or None), 'sender'.
    """
    # Compiled once, hoisted out of the per-email loop (the original
    # passed raw pattern strings to re.search on every iteration).
    order_patterns = [
        (re.compile(r'Order\s+(\d{10,})', re.I), 'AliExpress Order'),
        (re.compile(r'receipt.*(\d{4}-\d{4}-\d{4})', re.I), 'Receipt ID'),
        (re.compile(r'#(\d{4,})', re.I), 'Generic Order ID'),
    ]

    orders = []
    for email in emails:
        subject = email.subject or ""
        for pattern, order_type in order_patterns:
            match = pattern.search(subject)
            if match:
                orders.append({
                    'id': match.group(1),
                    'type': order_type,
                    'subject': subject,
                    'date': str(email.date) if email.date else None,
                    'sender': email.sender
                })
                break  # first matching pattern wins for this email
    return orders
|
|
|
|
|
|
def analyze_time_distribution(emails):
    """
    Aggregate email counts by year, by month, and by weekday.

    Args:
        emails: iterable of parsed email objects; only entries with a
                truthy, datetime-like .date attribute are counted.

    Returns:
        dict with keys:
            'by_year': {year: count}, ordered most-common-first
            'by_month': {'YYYY-MM': count}, sorted chronologically
            'by_day_of_week': {'Mon'..'Sun': count}, zero-filled
    """
    by_year = Counter()
    by_month = Counter()
    by_day_of_week = Counter()

    day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    for email in emails:
        if email.date:
            try:
                by_year[email.date.year] += 1
                by_month[f"{email.date.year}-{email.date.month:02d}"] += 1
                by_day_of_week[day_names[email.date.weekday()]] += 1
            except (AttributeError, TypeError, ValueError):
                # Narrowed from a bare `except:` (which also swallowed
                # KeyboardInterrupt/SystemExit): skip only entries whose
                # .date is not datetime-like.
                pass

    return {
        'by_year': dict(by_year.most_common()),
        'by_month': dict(sorted(by_month.items())),
        'by_day_of_week': {d: by_day_of_week.get(d, 0) for d in day_names}
    }
|
|
|
|
|
|
def main():
    """
    Run the full brett-gmail analysis: parse, classify, print a console
    report, and save a JSON summary to data/brett_gmail_analysis.json.
    """
    # Hard-coded source path: this script is CUSTOM for Brett's dataset.
    email_dir = "/home/bob/Documents/Email Manager/emails/brett-gmail"
    output_dir = Path(__file__).parent.parent / "data"
    output_dir.mkdir(exist_ok=True)

    print("="*70)
    print("BRETT GMAIL DATASET ANALYSIS")
    print("="*70)
    print(f"\nSource: {email_dir}")
    print(f"Output: {output_dir}")

    # Parse emails
    print("\nParsing emails...")
    parser = LocalFileParser(email_dir)
    emails = parser.parse_emails()
    print(f"Total emails: {len(emails)}")

    # Guard: everything below divides by len(emails), which would raise
    # ZeroDivisionError on an empty or missing dataset.
    if not emails:
        print("No emails parsed - nothing to analyze.")
        return

    # Date range
    dates = [e.date for e in emails if e.date]
    if dates:
        dates.sort()
        print(f"Date range: {dates[0].strftime('%Y-%m-%d')} to {dates[-1].strftime('%Y-%m-%d')}")

    # Classify all emails in a single pass. Per-category sub-counters are
    # collected here so the summary below does not re-classify every email
    # a second time (the original did, doubling the classification work).
    print("\nClassifying emails...")

    category_counts = Counter()
    subcategory_counts = Counter()
    subcats_by_category = defaultdict(Counter)

    for email in emails:
        category, subcategory = classify_email(email)
        category_counts[category] += 1
        subcategory_counts[subcategory] += 1
        subcats_by_category[category][subcategory] += 1

    # Print category summary
    print("\n" + "="*70)
    print("CATEGORY SUMMARY")
    print("="*70)

    for category, count in category_counts.most_common():
        pct = count / len(emails) * 100
        bar = "█" * int(pct / 2)  # 2% per bar character
        print(f"\n{category} ({count} emails, {pct:.1f}%)")
        print(f"  {bar}")

        # Show subcategories (most common first)
        for subcat, subcount in subcats_by_category[category].most_common():
            print(f"  - {subcat}: {subcount}")

    # Analyze senders
    print("\n" + "="*70)
    print("TOP SENDERS BY VOLUME")
    print("="*70)

    sender_counts = Counter(e.sender for e in emails)
    for sender, count in sender_counts.most_common(15):
        pct = count / len(emails) * 100
        print(f"  {count:4d} ({pct:4.1f}%) {sender}")

    # Time analysis
    print("\n" + "="*70)
    print("TIME DISTRIBUTION")
    print("="*70)

    time_dist = analyze_time_distribution(emails)

    print("\nBy Year:")
    for year, count in sorted(time_dist['by_year'].items()):
        bar = "█" * (count // 10)  # 10 emails per bar character
        print(f"  {year}: {count:4d} {bar}")

    print("\nBy Day of Week:")
    for day, count in time_dist['by_day_of_week'].items():
        bar = "█" * (count // 5)  # 5 emails per bar character
        print(f"  {day}: {count:3d} {bar}")

    # Extract orders
    print("\n" + "="*70)
    print("ORDER/TRANSACTION IDs FOUND")
    print("="*70)

    orders = extract_order_ids(emails)
    if orders:
        for order in orders[:10]:
            print(f"  [{order['type']}] {order['id']}")
            print(f"    Subject: {order['subject'][:60]}...")
    else:
        print("  No order IDs detected in subjects")

    # Actionable insights
    print("\n" + "="*70)
    print("ACTIONABLE INSIGHTS")
    print("="*70)

    # High-volume automated senders
    automated_domains = ['mutualart.com', 'tripadvisor.com', 'ebay.com', 'spotify.com']
    auto_count = sum(1 for e in emails if any(d in (e.sender or '') for d in automated_domains))
    print(f"\n1. AUTOMATED EMAILS: {auto_count} ({auto_count/len(emails)*100:.1f}%)")
    print("   - MutualArt alerts: Consider aggregating to weekly digest")
    print("   - Tripadvisor: Can be filtered to trash or separate folder")
    print("   - eBay/Spotify: Promotional, low priority")

    # Security alerts
    security_count = category_counts.get('Security', 0)
    print(f"\n2. SECURITY ALERTS: {security_count} ({security_count/len(emails)*100:.1f}%)")
    print("   - Google security: Review for legitimate sign-in attempts")
    print("   - Should NOT be auto-filtered")

    # Business/Work
    business_count = category_counts.get('Business', 0) + category_counts.get('Google', 0)
    print(f"\n3. BUSINESS-RELATED: {business_count} ({business_count/len(emails)*100:.1f}%)")
    print("   - Google Business Profile reports: Monthly review")
    print("   - Job applications: High priority")
    print("   - Appointments: Calendar integration")

    # AI Services (professional interest)
    ai_count = category_counts.get('AI Services', 0) + category_counts.get('Developer Tools', 0)
    print(f"\n4. AI/DEVELOPER TOOLS: {ai_count} ({ai_count/len(emails)*100:.1f}%)")
    print("   - Anthropic, OpenAI, Lambda: Keep for reference")
    print("   - ngrok, Docker, Cursor: Developer updates")

    # Personal
    personal_count = category_counts.get('Personal', 0)
    print(f"\n5. PERSONAL: {personal_count} ({personal_count/len(emails)*100:.1f}%)")
    print("   - Gmail contacts: May need human review")
    print("   - Microsoft/Outlook: Check for spam")

    # Save analysis data
    analysis_data = {
        'metadata': {
            'total_emails': len(emails),
            'date_range': {
                'start': str(dates[0]) if dates else None,
                'end': str(dates[-1]) if dates else None
            },
            'analyzed_at': datetime.now().isoformat()
        },
        'categories': dict(category_counts),
        'subcategories': dict(subcategory_counts),
        'top_senders': dict(sender_counts.most_common(50)),
        'time_distribution': time_dist,
        'orders_found': orders,
        # "accuracy" here means coverage: share of emails that got a
        # non-'Uncategorized' label, NOT accuracy against ground truth.
        'classification_accuracy': {
            'categorized': len(emails) - category_counts.get('Uncategorized', 0),
            'uncategorized': category_counts.get('Uncategorized', 0),
            'accuracy_pct': (len(emails) - category_counts.get('Uncategorized', 0)) / len(emails) * 100
        }
    }

    output_file = output_dir / "brett_gmail_analysis.json"
    with open(output_file, 'w') as f:
        json.dump(analysis_data, f, indent=2)

    print(f"\n\nAnalysis saved to: {output_file}")
    print("\n" + "="*70)
    print(f"CLASSIFICATION ACCURACY: {analysis_data['classification_accuracy']['accuracy_pct']:.1f}%")
    print(f"({analysis_data['classification_accuracy']['categorized']} categorized, "
          f"{analysis_data['classification_accuracy']['uncategorized']} uncategorized)")
    print("="*70)


if __name__ == '__main__':
    main()
|