email-sorter/tools/brett_microsoft_analyzer.py
FSSCoding 8f25e30f52 Rewrite CLAUDE.md and clean project structure
- Rewrote CLAUDE.md with comprehensive development guide
- Archived 20 old docs to docs/archive/
- Added PROJECT_ROADMAP_2025.md with research learnings
- Added CLASSIFICATION_METHODS_COMPARISON.md
- Added SESSION_HANDOVER_20251128.md
- Added tools for analysis (brett_gmail/microsoft analyzers)
- Updated .gitignore for archive folders
- Config changes for local vLLM endpoint
2025-11-28 13:07:27 +11:00

501 lines
16 KiB
Python

#!/usr/bin/env python3
"""
Brett Microsoft (Outlook) Dataset Analyzer
==========================================
CUSTOM script for analyzing the brett-microsoft email dataset.
NOT portable to other datasets without modification.
Usage:
python tools/brett_microsoft_analyzer.py
Output:
- Console report with comprehensive statistics
- data/brett_microsoft_analysis.json with full analysis data
"""
import json
import re
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path
# Add parent to path for imports
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.calibration.local_file_parser import LocalFileParser
# =============================================================================
# CLASSIFICATION RULES - CUSTOM FOR BRETT'S MICROSOFT/OUTLOOK INBOX
# =============================================================================
def classify_email(email):
    """
    Classify an email into a (category, subcategory) pair.

    Rules are hand-written for this one business inbox and are evaluated
    top to bottom; the FIRST rule that matches wins, so the order of the
    checks below is significant.

    Priority: Sender domain > Subject keywords > Business context

    Args:
        email: Any object exposing ``sender`` and ``subject`` attributes
            (either may be None or empty).

    Returns:
        A ``(category, subcategory)`` tuple of strings. Emails matching no
        rule yield ``('Uncategorized', 'Unknown')``.
    """
    sender = email.sender or ""
    subject = email.subject or ""

    # Normalise once. Mail domains are case-insensitive, so all sender and
    # domain matching is done on lowercased text; subject checks that were
    # deliberately case-sensitive (e.g. 'MYP', 'Teams') still use `subject`.
    sender_lc = sender.lower()
    subject_lc = subject.lower()
    # Text after the last '@' (the whole sender when there is no '@').
    # This may include trailing characters such as '>' from a display-name
    # form, which is why rules use substring matching rather than equality.
    domain = sender_lc.rpartition('@')[2]

    def in_domain(*fragments):
        """Return True when any fragment occurs in the sender domain."""
        return any(fragment in domain for fragment in fragments)

    # === BUSINESS OPERATIONS ===
    # MYOB/Accounting
    if in_domain('apps.myob.com') or 'myob' in subject_lc:
        return ('Business Operations', 'MYOB Invoices')

    # TPG/Telecom/Internet
    if in_domain('tpgtelecom.com.au', 'aapt.com.au'):
        if 'suspension' in subject_lc or 'overdue' in subject_lc:
            return ('Business Operations', 'Telecom - Urgent/Overdue')
        if 'novation' in subject_lc:
            return ('Business Operations', 'Telecom - Contract Changes')
        if 'nbn' in subject_lc:
            return ('Business Operations', 'Telecom - NBN')
        return ('Business Operations', 'Telecom - General')

    # DocuSign (Contracts)
    if in_domain('docusign') or 'docusign' in subject_lc:
        return ('Business Operations', 'DocuSign Contracts')

    # === CLIENT WORK ===
    # Green Output / Energy Avengers (App Development Client)
    if in_domain('greenoutput.com.au', 'energyavengers'):
        return ('Client Work', 'Energy Avengers Project')

    # Brighter Access (Client)
    if in_domain('brighteraccess') or 'Brighter Access' in subject:
        return ('Client Work', 'Brighter Access')

    # Waterfall Way Designs (Business Partner)
    if in_domain('waterfallwaydesigns'):
        return ('Client Work', 'Waterfall Way Designs')

    # Target Impact
    if in_domain('targetimpact.com.au'):
        return ('Client Work', 'Target Impact')

    # MerlinFX
    if in_domain('merlinfx.com.au'):
        return ('Client Work', 'MerlinFX')

    # Solar/Energy related (Energy Avengers ecosystem)
    if in_domain(
        'solarairenergy.com.au',
        'solarconnected.com.au',
        'eonadvisory.com.au',
        'australianpowerbrokers.com.au',
        'fyconsulting.com.au',
        'convergedesign.com.au',
    ):
        return ('Client Work', 'Energy Avengers Ecosystem')

    # MYP Corp (Disability Services Software)
    if in_domain('1myp.com', 'mypcorp') or 'MYP' in subject:
        return ('Business Operations', 'MYP Software')

    # === MICROSOFT SERVICES ===
    # Microsoft Support Cases
    if (re.search(r'\[Case.*#|Case #|TrackingID', subject, re.I)
            or in_domain('support.microsoft.com')):
        return ('Microsoft', 'Support Cases')

    # Microsoft Billing/Invoices
    if 'Microsoft invoice' in subject or 'credit card was declined' in subject:
        return ('Microsoft', 'Billing')

    # Microsoft Subscriptions
    if 'subscription' in subject_lc and 'microsoft' in sender_lc:
        return ('Microsoft', 'Subscriptions')

    # SharePoint/Teams
    if in_domain('sharepointonline.com') or 'Teams' in subject:
        return ('Microsoft', 'SharePoint/Teams')

    # O365 Service Updates
    if 'o365su' in sender_lc or ('digest' in subject_lc and 'microsoft' in sender_lc):
        return ('Microsoft', 'Service Updates')

    # General Microsoft
    if in_domain('microsoft.com'):
        return ('Microsoft', 'General')

    # === DEVELOPER TOOLS ===
    # GitHub CI/CD
    if re.search(r'\[FSSCoding', subject):
        return ('Developer', 'GitHub CI/CD Failures')

    # GitHub Issues/PRs
    if in_domain('github.com'):
        if 'linuxmint' in subject or 'cinnamon' in subject:
            return ('Developer', 'Open Source Contributions')
        if 'Pheromind' in subject or 'ChrisRoyse' in subject:
            return ('Developer', 'GitHub Collaborations')
        return ('Developer', 'GitHub Notifications')

    # Neo4j
    if in_domain('neo4j.com'):
        if 'webinar' in subject_lc:
            return ('Developer', 'Neo4j Webinars')
        if 'NODES' in subject or 'GraphTalk' in subject:
            return ('Developer', 'Neo4j Conference')
        return ('Developer', 'Neo4j')

    # Cursor (AI IDE)
    if in_domain('cursor.com', 'cursor.so') or 'Cursor' in subject:
        return ('Developer', 'Cursor IDE')

    # Tailscale
    if in_domain('tailscale.com'):
        return ('Developer', 'Tailscale')

    # Hugging Face
    if in_domain('huggingface') or 'Hugging Face' in subject:
        return ('Developer', 'Hugging Face')

    # Stripe (Payment Failures)
    if in_domain('stripe.com'):
        return ('Billing', 'Stripe Payments')

    # Contabo (Hosting)
    if in_domain('contabo.com'):
        return ('Developer', 'Contabo Hosting')

    # SendGrid
    if 'sendgrid' in subject_lc:
        return ('Developer', 'SendGrid')

    # Twilio
    if in_domain('twilio.com'):
        return ('Developer', 'Twilio')

    # Brave Search API
    if in_domain('brave.com'):
        return ('Developer', 'Brave Search API')

    # PyPI
    if 'pypi' in subject_lc or in_domain('pypi.org'):
        return ('Developer', 'PyPI')

    # NVIDIA/CUDA
    if 'CUDA' in subject or in_domain('nvidia'):
        return ('Developer', 'NVIDIA/CUDA')

    # Inception Labs / AI Tools
    if in_domain('inceptionlabs.ai'):
        return ('Developer', 'AI Tools')

    # === LEARNING ===
    # Computer Enhance (Casey Muratori) / Substack
    if 'computerenhance' in sender_lc or in_domain('substack.com'):
        return ('Learning', 'Substack/Newsletters')

    # Odoo
    if in_domain('odoo.com'):
        return ('Learning', 'Odoo ERP')

    # Mozilla Firefox
    if in_domain('mozilla.org'):
        return ('Developer', 'Mozilla Firefox')

    # === PERSONAL / COMMUNITY ===
    # Grandfather Gatherings (Personal Community)
    if 'Grandfather Gather' in subject:
        return ('Personal', 'Grandfather Gatherings')

    # Mailchimp newsletters (often personal)
    if in_domain('mailchimpapp.com'):
        return ('Personal', 'Personal Newsletters')

    # Community Events
    if 'Community Working Bee' in subject:
        return ('Personal', 'Community Events')

    # Personal emails (Gmail/Hotmail)
    if in_domain('gmail.com', 'hotmail.com', 'bigpond.com'):
        return ('Personal', 'Personal Contacts')

    # FSS Internal
    if in_domain('foxsoftwaresolutions.com.au'):
        return ('Business Operations', 'FSS Internal')

    # === FINANCIAL ===
    # eToro
    if in_domain('etoro.com'):
        return ('Financial', 'eToro Trading')

    # Dell
    if in_domain('dell.com') or 'Dell' in subject:
        return ('Business Operations', 'Dell Hardware')

    # Insurance ('KT Insurance' is already covered by the lowercase test)
    if 'insurance' in subject_lc:
        return ('Business Operations', 'Insurance')

    # SBSCH Payments
    if 'SBSCH' in subject:
        return ('Business Operations', 'SBSCH Payments')

    # iCare NSW
    if in_domain('icare.nsw.gov.au'):
        return ('Business Operations', 'iCare NSW')

    # Vodafone
    if in_domain('vodafone.com.au'):
        return ('Business Operations', 'Telecom - Vodafone')

    # === MISC ===
    # Undeliverable/Bounces
    if 'Undeliverable' in subject:
        return ('System', 'Email Bounces')

    # Security
    if re.search(r'Security Alert|Login detected|security code|Verify', subject, re.I):
        return ('Security', 'Security Alerts')

    # Password Reset
    if 'password' in subject_lc:
        return ('Security', 'Password')

    # Calendly
    if in_domain('calendly.com'):
        return ('Business Operations', 'Calendly')

    # Trello
    if in_domain('trello.com'):
        return ('Business Operations', 'Trello')

    # Scorptec
    if in_domain('scorptec'):
        return ('Business Operations', 'Hardware Vendor')

    # Webcentral
    if in_domain('webcentral.com.au'):
        return ('Business Operations', 'Web Hosting')

    # Bluetti (Hardware)
    if in_domain('bluettipower.com'):
        return ('Business Operations', 'Hardware - Power')

    # ABS Surveys
    if in_domain('abs.gov.au'):
        return ('Business Operations', 'Government - ABS')

    # Qualtrics/Surveys
    if in_domain('qualtrics'):
        return ('Business Operations', 'Surveys')

    return ('Uncategorized', 'Unknown')
def extract_case_ids(emails):
    """
    Extract Microsoft support case IDs and tracking IDs from email subjects.

    Each subject is searched (case-insensitively) with every pattern below.
    The bare ``Case #`` pattern also matches inside the bracketed
    ``[Case #...]`` form, so an email is recorded at most once per case ID
    to avoid inflating the per-case email count.

    Args:
        emails: Iterable of objects exposing ``subject``, ``date`` and
            ``sender`` attributes.

    Returns:
        A plain dict mapping case/tracking ID (str) to a list of
        occurrence dicts with keys ``type``, ``subject``, ``date``
        (stringified, or None when the email has no date) and ``sender``.
    """
    case_patterns = [
        (r'Case\s*#?\s*:?\s*(\d{8})', 'Microsoft Case'),
        (r'\[Case\s*#?\s*:?\s*(\d{8})\]', 'Microsoft Case'),
        (r'TrackingID#(\d{16})', 'Tracking ID'),
    ]
    cases = defaultdict(list)
    for email in emails:
        subject = email.subject or ""
        # IDs already recorded for THIS email; prevents double-counting when
        # overlapping patterns capture the same ID.
        seen_ids = set()
        for pattern, case_type in case_patterns:
            match = re.search(pattern, subject, re.I)
            if not match:
                continue
            case_id = match.group(1)
            if case_id in seen_ids:
                continue
            seen_ids.add(case_id)
            cases[case_id].append({
                'type': case_type,
                'subject': subject,
                'date': str(email.date) if email.date else None,
                'sender': email.sender,
            })
    return dict(cases)
def analyze_time_distribution(emails):
    """
    Count emails per year, per year-month and per day of week.

    Emails without a date are skipped. An email whose ``date`` does not
    behave like a date/datetime (missing ``year``/``month``/``weekday``)
    is skipped as a whole, so the three counters always stay in step.

    Args:
        emails: Iterable of objects exposing a ``date`` attribute.

    Returns:
        A dict with three keys:
          - ``by_year``: {year: count}, ordered by descending count.
          - ``by_month``: {"YYYY-MM": count}, ordered by key.
          - ``by_day_of_week``: {"Mon".."Sun": count}, always all 7 days.
    """
    by_year = Counter()
    by_month = Counter()
    by_day_of_week = Counter()
    day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    for email in emails:
        date = email.date
        if not date:
            continue
        # Derive every key BEFORE touching a counter, so a malformed date
        # cannot be counted in one breakdown but not the others.
        try:
            year = date.year
            month_key = f"{date.year}-{date.month:02d}"
            day_key = day_names[date.weekday()]
        except (AttributeError, TypeError, ValueError, IndexError):
            # Best-effort: ignore values that are not usable dates.
            continue
        by_year[year] += 1
        by_month[month_key] += 1
        by_day_of_week[day_key] += 1

    return {
        'by_year': dict(by_year.most_common()),
        'by_month': dict(sorted(by_month.items())),
        'by_day_of_week': {d: by_day_of_week.get(d, 0) for d in day_names},
    }
def main(email_dir="/home/bob/Documents/Email Manager/emails/brett-microsoft"):
    """
    Run the full analysis: parse, classify, print a report and save JSON.

    The report (category summary, top senders, time distribution, support
    cases, inbox character) is printed to stdout, and the same data is
    written to ``data/brett_microsoft_analysis.json`` next to the project
    root (two levels above this file).

    Args:
        email_dir: Directory handed to ``LocalFileParser``. Defaults to the
            original hard-coded dataset location.
    """
    output_dir = Path(__file__).parent.parent / "data"
    output_dir.mkdir(exist_ok=True)

    print("="*70)
    print("BRETT MICROSOFT (OUTLOOK) DATASET ANALYSIS")
    print("="*70)
    print(f"\nSource: {email_dir}")
    print(f"Output: {output_dir}")

    # Parse emails
    print("\nParsing emails...")
    parser = LocalFileParser(email_dir)
    emails = parser.parse_emails()
    total = len(emails)
    print(f"Total emails: {total}")

    def pct_of(count):
        """Return count as a percentage of all emails (0.0 when empty)."""
        return count / total * 100 if total else 0.0

    # Glyph used for the text histograms below.
    bar_char = "#"

    # Date range
    dates = [e.date for e in emails if e.date]
    if dates:
        dates.sort()
        print(f"Date range: {dates[0].strftime('%Y-%m-%d')} to {dates[-1].strftime('%Y-%m-%d')}")

    # Classify all emails (single pass; subcategory tallies are kept per
    # category so nothing has to be classified twice for the report).
    print("\nClassifying emails...")
    category_counts = Counter()
    subcategory_counts = Counter()
    subcats_by_category = defaultdict(Counter)
    for email in emails:
        category, subcategory = classify_email(email)
        category_counts[category] += 1
        subcategory_counts[f"{category}: {subcategory}"] += 1
        subcats_by_category[category][subcategory] += 1

    # Print category summary
    print("\n" + "="*70)
    print("TOP-LEVEL CATEGORY SUMMARY")
    print("="*70)
    for category, count in category_counts.most_common():
        pct = pct_of(count)
        bar = bar_char * int(pct / 2)
        print(f"\n{category} ({count} emails, {pct:.1f}%)")
        print(f" {bar}")
        # Show subcategories
        for subcat, subcount in subcats_by_category[category].most_common():
            print(f" - {subcat}: {subcount}")

    # Analyze senders
    print("\n" + "="*70)
    print("TOP SENDERS BY VOLUME")
    print("="*70)
    sender_counts = Counter(e.sender for e in emails)
    for sender, count in sender_counts.most_common(15):
        pct = pct_of(count)
        print(f" {count:4d} ({pct:4.1f}%) {sender}")

    # Time analysis
    print("\n" + "="*70)
    print("TIME DISTRIBUTION")
    print("="*70)
    time_dist = analyze_time_distribution(emails)
    print("\nBy Year:")
    for year, count in sorted(time_dist['by_year'].items()):
        bar = bar_char * (count // 10)
        print(f" {year}: {count:4d} {bar}")
    print("\nBy Day of Week:")
    for day, count in time_dist['by_day_of_week'].items():
        bar = bar_char * (count // 5)
        print(f" {day}: {count:3d} {bar}")

    # Extract case IDs
    print("\n" + "="*70)
    print("MICROSOFT SUPPORT CASES TRACKED")
    print("="*70)
    cases = extract_case_ids(emails)
    if cases:
        for case_id, occurrences in sorted(cases.items()):
            print(f"\n Case/Tracking: {case_id} ({len(occurrences)} emails)")
            # Only the first three occurrences are listed per case.
            for occ in occurrences[:3]:
                print(f" - {occ['date']}: {occ['subject'][:50]}...")
    else:
        print(" No case IDs detected")

    # Actionable insights
    print("\n" + "="*70)
    print("INBOX CHARACTER ASSESSMENT")
    print("="*70)
    business_pct = pct_of(
        category_counts.get('Business Operations', 0)
        + category_counts.get('Client Work', 0)
        + category_counts.get('Developer', 0)
    )
    personal_pct = pct_of(category_counts.get('Personal', 0))
    print(f"\n Business/Professional: {business_pct:.1f}%")
    print(f" Personal: {personal_pct:.1f}%")
    print(f"\n ASSESSMENT: This is a {'BUSINESS' if business_pct > 50 else 'MIXED'} inbox")

    # Save analysis data
    uncategorized = category_counts.get('Uncategorized', 0)
    categorized = total - uncategorized
    analysis_data = {
        'metadata': {
            'total_emails': total,
            'inbox_type': 'microsoft',
            'inbox_character': 'business' if business_pct > 50 else 'mixed',
            'date_range': {
                'start': str(dates[0]) if dates else None,
                'end': str(dates[-1]) if dates else None
            },
            'analyzed_at': datetime.now().isoformat()
        },
        'categories': dict(category_counts),
        'subcategories': dict(subcategory_counts),
        'top_senders': dict(sender_counts.most_common(50)),
        'time_distribution': time_dist,
        'support_cases': cases,
        # "Accuracy" here means the share of emails that matched any rule.
        'classification_accuracy': {
            'categorized': categorized,
            'uncategorized': uncategorized,
            'accuracy_pct': pct_of(categorized)
        }
    }
    output_file = output_dir / "brett_microsoft_analysis.json"
    # Explicit encoding so the output does not depend on the platform locale.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(analysis_data, f, indent=2)
    print(f"\n\nAnalysis saved to: {output_file}")

    print("\n" + "="*70)
    print(f"CLASSIFICATION ACCURACY: {analysis_data['classification_accuracy']['accuracy_pct']:.1f}%")
    print(f"({analysis_data['classification_accuracy']['categorized']} categorized, "
          f"{analysis_data['classification_accuracy']['uncategorized']} uncategorized)")
    print("="*70)
# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()