#!/usr/bin/env python3
"""
Brett Microsoft (Outlook) Dataset Analyzer
==========================================

CUSTOM script for analyzing the brett-microsoft email dataset.
NOT portable to other datasets without modification.

Usage:
    python tools/brett_microsoft_analyzer.py

Output:
    - Console report with comprehensive statistics
    - data/brett_microsoft_analysis.json with full analysis data
"""

import json
import re
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path

# Add parent to path for imports
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))

# NOTE: the project-local ``LocalFileParser`` is imported inside ``main()`` so
# that the pure helpers in this module can be imported without the project
# tree being importable.

# Default location of the dataset analysed by ``main()``.
DEFAULT_EMAIL_DIR = "/home/bob/Documents/Email Manager/emails/brett-microsoft"

# Sender domains that all map to ('Client Work', 'Energy Avengers Ecosystem').
_ENERGY_ECOSYSTEM_DOMAINS = (
    'solarairenergy.com.au',
    'solarconnected.com.au',
    'eonadvisory.com.au',
    'australianpowerbrokers.com.au',
    'fyconsulting.com.au',
    'convergedesign.com.au',
)

# Webmail domains treated as personal contacts.
_PERSONAL_DOMAINS = ('gmail.com', 'hotmail.com', 'bigpond.com')

# (compiled pattern, label) pairs used by ``extract_case_ids``; group 1 of
# each pattern is the ID under which the email is recorded.
_CASE_PATTERNS = [
    (re.compile(r'Case\s*#?\s*:?\s*(\d{8})', re.I), 'Microsoft Case'),
    (re.compile(r'\[Case\s*#?\s*:?\s*(\d{8})\]', re.I), 'Microsoft Case'),
    (re.compile(r'TrackingID#(\d{16})', re.I), 'Tracking ID'),
]

_DAY_NAMES = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']


# =============================================================================
# CLASSIFICATION RULES - CUSTOM FOR BRETT'S MICROSOFT/OUTLOOK INBOX
# =============================================================================

def classify_email(email):
    """
    Classify email into categories based on sender domain and subject patterns.

    This is a BUSINESS inbox - different approach than personal Gmail.
    Priority: Sender domain > Subject keywords > Business context

    The rules are evaluated top to bottom and the first match wins, so their
    order is significant.

    Args:
        email: Object exposing ``sender`` and ``subject`` attributes (either
            may be ``None``, which is treated as an empty string).

    Returns:
        A ``(category, subcategory)`` tuple of strings;
        ``('Uncategorized', 'Unknown')`` when no rule matches.
    """
    sender = email.sender or ""
    subject = email.subject or ""
    sender_lower = sender.lower()
    subject_lower = subject.lower()
    # Everything after the last '@' (or the whole sender when there is none).
    # Lower-cased because mail domains are case-insensitive, so e.g.
    # "user@GMAIL.COM" must match the same rules as "user@gmail.com".
    domain = sender_lower.split('@')[-1] if '@' in sender_lower else sender_lower

    # === BUSINESS OPERATIONS ===

    # MYOB/Accounting
    if 'apps.myob.com' in domain or 'myob' in subject_lower:
        return ('Business Operations', 'MYOB Invoices')

    # TPG/Telecom/Internet
    if 'tpgtelecom.com.au' in domain or 'aapt.com.au' in domain:
        if 'suspension' in subject_lower or 'overdue' in subject_lower:
            return ('Business Operations', 'Telecom - Urgent/Overdue')
        if 'novation' in subject_lower:
            return ('Business Operations', 'Telecom - Contract Changes')
        if 'nbn' in subject_lower:
            return ('Business Operations', 'Telecom - NBN')
        return ('Business Operations', 'Telecom - General')

    # DocuSign (Contracts)
    if 'docusign' in domain or 'docusign' in subject_lower:
        return ('Business Operations', 'DocuSign Contracts')

    # === CLIENT WORK ===

    # Green Output / Energy Avengers (App Development Client)
    if 'greenoutput.com.au' in domain or 'energyavengers' in domain:
        return ('Client Work', 'Energy Avengers Project')

    # Brighter Access (Client)
    if 'brighteraccess' in domain or 'Brighter Access' in subject:
        return ('Client Work', 'Brighter Access')

    # Waterfall Way Designs (Business Partner)
    if 'waterfallwaydesigns' in domain:
        return ('Client Work', 'Waterfall Way Designs')

    # Target Impact
    if 'targetimpact.com.au' in domain:
        return ('Client Work', 'Target Impact')

    # MerlinFX
    if 'merlinfx.com.au' in domain:
        return ('Client Work', 'MerlinFX')

    # Solar/Energy related (Energy Avengers ecosystem)
    if any(name in domain for name in _ENERGY_ECOSYSTEM_DOMAINS):
        return ('Client Work', 'Energy Avengers Ecosystem')

    # MYP Corp (Disability Services Software)
    if '1myp.com' in domain or 'mypcorp' in domain or 'MYP' in subject:
        return ('Business Operations', 'MYP Software')

    # === MICROSOFT SERVICES ===

    # Microsoft Support Cases
    if (re.search(r'\[Case.*#|Case #|TrackingID', subject, re.I)
            or 'support.microsoft.com' in domain):
        return ('Microsoft', 'Support Cases')

    # Microsoft Billing/Invoices
    if 'Microsoft invoice' in subject or 'credit card was declined' in subject:
        return ('Microsoft', 'Billing')

    # Microsoft Subscriptions
    if 'subscription' in subject_lower and 'microsoft' in sender_lower:
        return ('Microsoft', 'Subscriptions')

    # SharePoint/Teams
    if 'sharepointonline.com' in domain or 'Teams' in subject:
        return ('Microsoft', 'SharePoint/Teams')

    # O365 Service Updates
    if 'o365su' in sender_lower or ('digest' in subject_lower and 'microsoft' in sender_lower):
        return ('Microsoft', 'Service Updates')

    # General Microsoft
    if 'microsoft.com' in domain:
        return ('Microsoft', 'General')

    # === DEVELOPER TOOLS ===

    # GitHub CI/CD
    if re.search(r'\[FSSCoding', subject):
        return ('Developer', 'GitHub CI/CD Failures')

    # GitHub Issues/PRs
    if 'github.com' in domain:
        if 'linuxmint' in subject or 'cinnamon' in subject:
            return ('Developer', 'Open Source Contributions')
        if 'Pheromind' in subject or 'ChrisRoyse' in subject:
            return ('Developer', 'GitHub Collaborations')
        return ('Developer', 'GitHub Notifications')

    # Neo4j
    if 'neo4j.com' in domain:
        if 'webinar' in subject_lower:
            return ('Developer', 'Neo4j Webinars')
        if 'NODES' in subject or 'GraphTalk' in subject:
            return ('Developer', 'Neo4j Conference')
        return ('Developer', 'Neo4j')

    # Cursor (AI IDE)
    if 'cursor.com' in domain or 'cursor.so' in domain or 'Cursor' in subject:
        return ('Developer', 'Cursor IDE')

    # Tailscale
    if 'tailscale.com' in domain:
        return ('Developer', 'Tailscale')

    # Hugging Face
    if 'huggingface' in domain or 'Hugging Face' in subject:
        return ('Developer', 'Hugging Face')

    # Stripe (Payment Failures)
    if 'stripe.com' in domain:
        return ('Billing', 'Stripe Payments')

    # Contabo (Hosting)
    if 'contabo.com' in domain:
        return ('Developer', 'Contabo Hosting')

    # SendGrid
    if 'sendgrid' in subject_lower:
        return ('Developer', 'SendGrid')

    # Twilio
    if 'twilio.com' in domain:
        return ('Developer', 'Twilio')

    # Brave Search API
    if 'brave.com' in domain:
        return ('Developer', 'Brave Search API')

    # PyPI
    if 'pypi' in subject_lower or 'pypi.org' in domain:
        return ('Developer', 'PyPI')

    # NVIDIA/CUDA
    if 'CUDA' in subject or 'nvidia' in domain:
        return ('Developer', 'NVIDIA/CUDA')

    # Inception Labs / AI Tools
    if 'inceptionlabs.ai' in domain:
        return ('Developer', 'AI Tools')

    # === LEARNING ===

    # Computer Enhance (Casey Muratori) / Substack
    if 'computerenhance' in sender_lower or 'substack.com' in domain:
        return ('Learning', 'Substack/Newsletters')

    # Odoo
    if 'odoo.com' in domain:
        return ('Learning', 'Odoo ERP')

    # Mozilla Firefox
    if 'mozilla.org' in domain:
        return ('Developer', 'Mozilla Firefox')

    # === PERSONAL / COMMUNITY ===

    # Grandfather Gatherings (Personal Community)
    if 'Grandfather Gather' in subject:
        return ('Personal', 'Grandfather Gatherings')

    # Mailchimp newsletters (often personal)
    if 'mailchimpapp.com' in domain:
        return ('Personal', 'Personal Newsletters')

    # Community Events
    if 'Community Working Bee' in subject:
        return ('Personal', 'Community Events')

    # Personal emails (Gmail/Hotmail)
    if any(name in domain for name in _PERSONAL_DOMAINS):
        return ('Personal', 'Personal Contacts')

    # FSS Internal
    if 'foxsoftwaresolutions.com.au' in domain:
        return ('Business Operations', 'FSS Internal')

    # === FINANCIAL ===

    # eToro
    if 'etoro.com' in domain:
        return ('Financial', 'eToro Trading')

    # Dell
    if 'dell.com' in domain or 'Dell' in subject:
        return ('Business Operations', 'Dell Hardware')

    # Insurance (the case-insensitive test also covers "KT Insurance")
    if 'insurance' in subject_lower:
        return ('Business Operations', 'Insurance')

    # SBSCH Payments
    if 'SBSCH' in subject:
        return ('Business Operations', 'SBSCH Payments')

    # iCare NSW
    if 'icare.nsw.gov.au' in domain:
        return ('Business Operations', 'iCare NSW')

    # Vodafone
    if 'vodafone.com.au' in domain:
        return ('Business Operations', 'Telecom - Vodafone')

    # === MISC ===

    # Undeliverable/Bounces
    if 'Undeliverable' in subject:
        return ('System', 'Email Bounces')

    # Security
    if re.search(r'Security Alert|Login detected|security code|Verify', subject, re.I):
        return ('Security', 'Security Alerts')

    # Password Reset
    if 'password' in subject_lower:
        return ('Security', 'Password')

    # Calendly
    if 'calendly.com' in domain:
        return ('Business Operations', 'Calendly')

    # Trello
    if 'trello.com' in domain:
        return ('Business Operations', 'Trello')

    # Scorptec
    if 'scorptec' in domain:
        return ('Business Operations', 'Hardware Vendor')

    # Webcentral
    if 'webcentral.com.au' in domain:
        return ('Business Operations', 'Web Hosting')

    # Bluetti (Hardware)
    if 'bluettipower.com' in domain:
        return ('Business Operations', 'Hardware - Power')

    # ABS Surveys
    if 'abs.gov.au' in domain:
        return ('Business Operations', 'Government - ABS')

    # Qualtrics/Surveys
    if 'qualtrics' in domain:
        return ('Business Operations', 'Surveys')

    return ('Uncategorized', 'Unknown')


def extract_case_ids(emails):
    """Extract Microsoft support case IDs and tracking IDs from emails.

    Each subject is searched with every pattern in ``_CASE_PATTERNS``. An
    email is recorded at most once per ID, even when several patterns match
    the same ID (the plain and the bracketed "Case" patterns overlap).

    Args:
        emails: Iterable of objects exposing ``subject``, ``date`` and
            ``sender`` attributes.

    Returns:
        dict mapping each ID string to a list of occurrence dicts with the
        keys ``type``, ``subject``, ``date`` (``str(email.date)`` or ``None``)
        and ``sender``.
    """
    cases = defaultdict(list)

    for email in emails:
        subject = email.subject or ""
        # IDs already recorded for THIS email; prevents double counting when
        # overlapping patterns capture the same ID.
        seen_ids = set()
        for pattern, case_type in _CASE_PATTERNS:
            match = pattern.search(subject)
            if not match:
                continue
            case_id = match.group(1)
            if case_id in seen_ids:
                continue
            seen_ids.add(case_id)
            cases[case_id].append({
                'type': case_type,
                'subject': subject,
                'date': str(email.date) if email.date else None,
                'sender': email.sender
            })

    return dict(cases)


def analyze_time_distribution(emails):
    """Analyze email distribution over time.

    Emails with a falsy ``date`` are ignored. A ``date`` that does not behave
    like a ``datetime`` (missing ``year``/``month``/``weekday`` or yielding
    unusable values) is skipped as a whole, so it never contributes to only
    some of the counters.

    Args:
        emails: Iterable of objects exposing a ``date`` attribute.

    Returns:
        dict with three keys:
        ``by_year`` (year -> count, most frequent first),
        ``by_month`` ("YYYY-MM" -> count, sorted by key) and
        ``by_day_of_week`` ('Mon'..'Sun' -> count, always all seven days).
    """
    by_year = Counter()
    by_month = Counter()
    by_day_of_week = Counter()

    for email in emails:
        date = email.date
        if not date:
            continue
        try:
            # Derive every key first; only count once all of them succeeded.
            year = date.year
            month_key = f"{date.year}-{date.month:02d}"
            day_name = _DAY_NAMES[date.weekday()]
        except (AttributeError, TypeError, ValueError, IndexError):
            # Best effort: a malformed date must not abort the whole report.
            continue
        by_year[year] += 1
        by_month[month_key] += 1
        by_day_of_week[day_name] += 1

    return {
        'by_year': dict(by_year.most_common()),
        'by_month': dict(sorted(by_month.items())),
        'by_day_of_week': {d: by_day_of_week.get(d, 0) for d in _DAY_NAMES}
    }


def _percent(part, total):
    """Return *part* as a percentage of *total*, or 0.0 when *total* is zero."""
    return part / total * 100 if total else 0.0


def main(email_dir=DEFAULT_EMAIL_DIR, output_dir=None):
    """Parse the dataset, print the console report and write the JSON analysis.

    Args:
        email_dir: Directory handed to ``LocalFileParser``. Defaults to the
            brett-microsoft dataset location.
        output_dir: Directory that receives ``brett_microsoft_analysis.json``.
            Defaults to ``<parent of this file's directory>/data``. It is
            created if missing.

    An empty dataset produces a report with all percentages at 0.0 instead of
    failing with a division by zero.
    """
    # Imported lazily so that importing this module does not require the
    # project tree; the sys.path tweak at module level makes it resolvable.
    from src.calibration.local_file_parser import LocalFileParser

    if output_dir is None:
        output_dir = Path(__file__).parent.parent / "data"
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print("="*70)
    print("BRETT MICROSOFT (OUTLOOK) DATASET ANALYSIS")
    print("="*70)
    print(f"\nSource: {email_dir}")
    print(f"Output: {output_dir}")

    # Parse emails
    print("\nParsing emails...")
    parser = LocalFileParser(email_dir)
    emails = parser.parse_emails()
    total = len(emails)
    print(f"Total emails: {total}")

    # Date range
    dates = sorted(e.date for e in emails if e.date)
    if dates:
        print(f"Date range: {dates[0].strftime('%Y-%m-%d')} to {dates[-1].strftime('%Y-%m-%d')}")

    # Classify all emails (each email is classified exactly once)
    print("\nClassifying emails...")
    category_counts = Counter()
    subcategory_counts = Counter()
    subcats_by_category = defaultdict(Counter)

    for email in emails:
        category, subcategory = classify_email(email)
        category_counts[category] += 1
        subcategory_counts[f"{category}: {subcategory}"] += 1
        subcats_by_category[category][subcategory] += 1

    # Print category summary
    print("\n" + "="*70)
    print("TOP-LEVEL CATEGORY SUMMARY")
    print("="*70)

    for category, count in category_counts.most_common():
        pct = _percent(count, total)
        bar = "█" * int(pct / 2)
        print(f"\n{category} ({count} emails, {pct:.1f}%)")
        print(f" {bar}")

        # Show subcategories
        for subcat, subcount in subcats_by_category[category].most_common():
            print(f" - {subcat}: {subcount}")

    # Analyze senders
    print("\n" + "="*70)
    print("TOP SENDERS BY VOLUME")
    print("="*70)

    sender_counts = Counter(e.sender for e in emails)
    for sender, count in sender_counts.most_common(15):
        pct = _percent(count, total)
        print(f" {count:4d} ({pct:4.1f}%) {sender}")

    # Time analysis
    print("\n" + "="*70)
    print("TIME DISTRIBUTION")
    print("="*70)

    time_dist = analyze_time_distribution(emails)

    print("\nBy Year:")
    for year, count in sorted(time_dist['by_year'].items()):
        bar = "█" * (count // 10)
        print(f" {year}: {count:4d} {bar}")

    print("\nBy Day of Week:")
    for day, count in time_dist['by_day_of_week'].items():
        bar = "█" * (count // 5)
        print(f" {day}: {count:3d} {bar}")

    # Extract case IDs
    print("\n" + "="*70)
    print("MICROSOFT SUPPORT CASES TRACKED")
    print("="*70)

    cases = extract_case_ids(emails)
    if cases:
        for case_id, occurrences in sorted(cases.items()):
            print(f"\n Case/Tracking: {case_id} ({len(occurrences)} emails)")
            for occ in occurrences[:3]:
                print(f" - {occ['date']}: {occ['subject'][:50]}...")
    else:
        print(" No case IDs detected")

    # Actionable insights
    print("\n" + "="*70)
    print("INBOX CHARACTER ASSESSMENT")
    print("="*70)

    professional_count = (
        category_counts.get('Business Operations', 0)
        + category_counts.get('Client Work', 0)
        + category_counts.get('Developer', 0)
    )
    business_pct = _percent(professional_count, total)
    personal_pct = _percent(category_counts.get('Personal', 0), total)

    print(f"\n Business/Professional: {business_pct:.1f}%")
    print(f" Personal: {personal_pct:.1f}%")
    print(f"\n ASSESSMENT: This is a {'BUSINESS' if business_pct > 50 else 'MIXED'} inbox")

    # Save analysis data
    uncategorized = category_counts.get('Uncategorized', 0)
    categorized = total - uncategorized
    analysis_data = {
        'metadata': {
            'total_emails': total,
            'inbox_type': 'microsoft',
            'inbox_character': 'business' if business_pct > 50 else 'mixed',
            'date_range': {
                'start': str(dates[0]) if dates else None,
                'end': str(dates[-1]) if dates else None
            },
            'analyzed_at': datetime.now().isoformat()
        },
        'categories': dict(category_counts),
        'subcategories': dict(subcategory_counts),
        'top_senders': dict(sender_counts.most_common(50)),
        'time_distribution': time_dist,
        'support_cases': cases,
        'classification_accuracy': {
            'categorized': categorized,
            'uncategorized': uncategorized,
            'accuracy_pct': _percent(categorized, total)
        }
    }

    output_file = output_dir / "brett_microsoft_analysis.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(analysis_data, f, indent=2)

    print(f"\n\nAnalysis saved to: {output_file}")
    print("\n" + "="*70)
    print(f"CLASSIFICATION ACCURACY: {analysis_data['classification_accuracy']['accuracy_pct']:.1f}%")
    print(f"({analysis_data['classification_accuracy']['categorized']} categorized, "
          f"{analysis_data['classification_accuracy']['uncategorized']} uncategorized)")
    print("="*70)


if __name__ == '__main__':
    main()