- Rewrote CLAUDE.md with comprehensive development guide - Archived 20 old docs to docs/archive/ - Added PROJECT_ROADMAP_2025.md with research learnings - Added CLASSIFICATION_METHODS_COMPARISON.md - Added SESSION_HANDOVER_20251128.md - Added tools for analysis (brett_gmail/microsoft analyzers) - Updated .gitignore for archive folders - Config changes for local vLLM endpoint
501 lines
16 KiB
Python
501 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Brett Microsoft (Outlook) Dataset Analyzer
|
|
==========================================
|
|
CUSTOM script for analyzing the brett-microsoft email dataset.
|
|
NOT portable to other datasets without modification.
|
|
|
|
Usage:
|
|
python tools/brett_microsoft_analyzer.py
|
|
|
|
Output:
|
|
- Console report with comprehensive statistics
|
|
- data/brett_microsoft_analysis.json with full analysis data
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
from collections import Counter, defaultdict
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# Add parent to path for imports
|
|
import sys
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from src.calibration.local_file_parser import LocalFileParser
|
|
|
|
|
|
# =============================================================================
# CLASSIFICATION RULES - CUSTOM FOR BRETT'S MICROSOFT/OUTLOOK INBOX
# =============================================================================
|
|
|
|
def classify_email(email):
    """
    Classify an email into a ``(category, subcategory)`` pair.

    This is a BUSINESS inbox - different approach than personal Gmail.
    Priority: Sender domain > Subject keywords > Business context

    Rules are evaluated top to bottom and the FIRST match wins, so the order
    of the checks below is significant (e.g. any subject mentioning "myob"
    is filed under MYOB before any other rule is consulted).

    Args:
        email: Object exposing ``sender`` and ``subject`` attributes. Either
            may be ``None``/empty, in which case it is treated as ``""``.

    Returns:
        A 2-tuple of strings ``(category, subcategory)``;
        ``('Uncategorized', 'Unknown')`` when no rule matches.
    """
    sender = email.sender or ""
    subject = email.subject or ""

    # Lower-case once up front instead of on every rule.
    sender_lower = sender.lower()
    subject_lower = subject.lower()

    # Everything after the last '@' (or the whole sender when there is no
    # '@'). Lower-cased because domain names are case-insensitive; all domain
    # rules below are substring checks against this value.
    domain = sender_lower.rsplit('@', 1)[-1]

    def from_domain(*needles):
        """Return True if any of *needles* occurs in the sender domain."""
        return any(needle in domain for needle in needles)

    # === BUSINESS OPERATIONS ===

    # MYOB/Accounting
    if from_domain('apps.myob.com') or 'myob' in subject_lower:
        return ('Business Operations', 'MYOB Invoices')

    # TPG/Telecom/Internet
    if from_domain('tpgtelecom.com.au', 'aapt.com.au'):
        if 'suspension' in subject_lower or 'overdue' in subject_lower:
            return ('Business Operations', 'Telecom - Urgent/Overdue')
        if 'novation' in subject_lower:
            return ('Business Operations', 'Telecom - Contract Changes')
        if 'nbn' in subject_lower:
            return ('Business Operations', 'Telecom - NBN')
        return ('Business Operations', 'Telecom - General')

    # DocuSign (Contracts)
    if from_domain('docusign') or 'docusign' in subject_lower:
        return ('Business Operations', 'DocuSign Contracts')

    # === CLIENT WORK ===

    # Green Output / Energy Avengers (App Development Client)
    if from_domain('greenoutput.com.au', 'energyavengers'):
        return ('Client Work', 'Energy Avengers Project')

    # Brighter Access (Client)
    if from_domain('brighteraccess') or 'Brighter Access' in subject:
        return ('Client Work', 'Brighter Access')

    # Waterfall Way Designs (Business Partner)
    if from_domain('waterfallwaydesigns'):
        return ('Client Work', 'Waterfall Way Designs')

    # Target Impact
    if from_domain('targetimpact.com.au'):
        return ('Client Work', 'Target Impact')

    # MerlinFX
    if from_domain('merlinfx.com.au'):
        return ('Client Work', 'MerlinFX')

    # Solar/Energy related (Energy Avengers ecosystem)
    if from_domain(
        'solarairenergy.com.au',
        'solarconnected.com.au',
        'eonadvisory.com.au',
        'australianpowerbrokers.com.au',
        'fyconsulting.com.au',
        'convergedesign.com.au',
    ):
        return ('Client Work', 'Energy Avengers Ecosystem')

    # MYP Corp (Disability Services Software)
    # 'MYP' is matched case-sensitively on purpose (it is an acronym).
    if from_domain('1myp.com', 'mypcorp') or 'MYP' in subject:
        return ('Business Operations', 'MYP Software')

    # === MICROSOFT SERVICES ===

    # Microsoft Support Cases
    if (re.search(r'\[Case.*#|Case #|TrackingID', subject, re.I)
            or from_domain('support.microsoft.com')):
        return ('Microsoft', 'Support Cases')

    # Microsoft Billing/Invoices
    if 'Microsoft invoice' in subject or 'credit card was declined' in subject:
        return ('Microsoft', 'Billing')

    # Microsoft Subscriptions
    if 'subscription' in subject_lower and 'microsoft' in sender_lower:
        return ('Microsoft', 'Subscriptions')

    # SharePoint/Teams (capitalised 'Teams' = the product name)
    if from_domain('sharepointonline.com') or 'Teams' in subject:
        return ('Microsoft', 'SharePoint/Teams')

    # O365 Service Updates
    if 'o365su' in sender_lower or (
            'digest' in subject_lower and 'microsoft' in sender_lower):
        return ('Microsoft', 'Service Updates')

    # General Microsoft
    if from_domain('microsoft.com'):
        return ('Microsoft', 'General')

    # === DEVELOPER TOOLS ===

    # GitHub CI/CD
    if re.search(r'\[FSSCoding', subject):
        return ('Developer', 'GitHub CI/CD Failures')

    # GitHub Issues/PRs
    if from_domain('github.com'):
        if 'linuxmint' in subject or 'cinnamon' in subject:
            return ('Developer', 'Open Source Contributions')
        if 'Pheromind' in subject or 'ChrisRoyse' in subject:
            return ('Developer', 'GitHub Collaborations')
        return ('Developer', 'GitHub Notifications')

    # Neo4j
    if from_domain('neo4j.com'):
        if 'webinar' in subject_lower:
            return ('Developer', 'Neo4j Webinars')
        if 'NODES' in subject or 'GraphTalk' in subject:
            return ('Developer', 'Neo4j Conference')
        return ('Developer', 'Neo4j')

    # Cursor (AI IDE)
    if from_domain('cursor.com', 'cursor.so') or 'Cursor' in subject:
        return ('Developer', 'Cursor IDE')

    # Tailscale
    if from_domain('tailscale.com'):
        return ('Developer', 'Tailscale')

    # Hugging Face
    if from_domain('huggingface') or 'Hugging Face' in subject:
        return ('Developer', 'Hugging Face')

    # Stripe (Payment Failures)
    if from_domain('stripe.com'):
        return ('Billing', 'Stripe Payments')

    # Contabo (Hosting)
    if from_domain('contabo.com'):
        return ('Developer', 'Contabo Hosting')

    # SendGrid
    if 'sendgrid' in subject_lower:
        return ('Developer', 'SendGrid')

    # Twilio
    if from_domain('twilio.com'):
        return ('Developer', 'Twilio')

    # Brave Search API
    if from_domain('brave.com'):
        return ('Developer', 'Brave Search API')

    # PyPI
    if 'pypi' in subject_lower or from_domain('pypi.org'):
        return ('Developer', 'PyPI')

    # NVIDIA/CUDA
    if 'CUDA' in subject or from_domain('nvidia'):
        return ('Developer', 'NVIDIA/CUDA')

    # Inception Labs / AI Tools
    if from_domain('inceptionlabs.ai'):
        return ('Developer', 'AI Tools')

    # === LEARNING ===

    # Computer Enhance (Casey Muratori) / Substack
    if 'computerenhance' in sender_lower or from_domain('substack.com'):
        return ('Learning', 'Substack/Newsletters')

    # Odoo
    if from_domain('odoo.com'):
        return ('Learning', 'Odoo ERP')

    # Mozilla Firefox
    if from_domain('mozilla.org'):
        return ('Developer', 'Mozilla Firefox')

    # === PERSONAL / COMMUNITY ===

    # Grandfather Gatherings (Personal Community)
    if 'Grandfather Gather' in subject:
        return ('Personal', 'Grandfather Gatherings')

    # Mailchimp newsletters (often personal)
    if from_domain('mailchimpapp.com'):
        return ('Personal', 'Personal Newsletters')

    # Community Events
    if 'Community Working Bee' in subject:
        return ('Personal', 'Community Events')

    # Personal emails (Gmail/Hotmail)
    if from_domain('gmail.com', 'hotmail.com', 'bigpond.com'):
        return ('Personal', 'Personal Contacts')

    # FSS Internal
    if from_domain('foxsoftwaresolutions.com.au'):
        return ('Business Operations', 'FSS Internal')

    # === FINANCIAL ===

    # eToro
    if from_domain('etoro.com'):
        return ('Financial', 'eToro Trading')

    # Dell
    if from_domain('dell.com') or 'Dell' in subject:
        return ('Business Operations', 'Dell Hardware')

    # Insurance (also covers "KT Insurance" subjects)
    if 'insurance' in subject_lower:
        return ('Business Operations', 'Insurance')

    # SBSCH Payments
    if 'SBSCH' in subject:
        return ('Business Operations', 'SBSCH Payments')

    # iCare NSW
    if from_domain('icare.nsw.gov.au'):
        return ('Business Operations', 'iCare NSW')

    # Vodafone
    if from_domain('vodafone.com.au'):
        return ('Business Operations', 'Telecom - Vodafone')

    # === MISC ===

    # Undeliverable/Bounces
    if 'Undeliverable' in subject:
        return ('System', 'Email Bounces')

    # Security
    if re.search(r'Security Alert|Login detected|security code|Verify',
                 subject, re.I):
        return ('Security', 'Security Alerts')

    # Password Reset
    if 'password' in subject_lower:
        return ('Security', 'Password')

    # Calendly
    if from_domain('calendly.com'):
        return ('Business Operations', 'Calendly')

    # Trello
    if from_domain('trello.com'):
        return ('Business Operations', 'Trello')

    # Scorptec
    if from_domain('scorptec'):
        return ('Business Operations', 'Hardware Vendor')

    # Webcentral
    if from_domain('webcentral.com.au'):
        return ('Business Operations', 'Web Hosting')

    # Bluetti (Hardware)
    if from_domain('bluettipower.com'):
        return ('Business Operations', 'Hardware - Power')

    # ABS Surveys
    if from_domain('abs.gov.au'):
        return ('Business Operations', 'Government - ABS')

    # Qualtrics/Surveys
    if from_domain('qualtrics'):
        return ('Business Operations', 'Surveys')

    return ('Uncategorized', 'Unknown')
|
|
|
|
|
|
def extract_case_ids(emails):
    """
    Extract Microsoft support case IDs and tracking IDs from email subjects.

    Args:
        emails: Iterable of objects exposing ``subject``, ``date`` and
            ``sender`` attributes (``subject``/``date`` may be ``None``).

    Returns:
        dict mapping each ID string to a list of occurrence dicts with keys
        ``'type'``, ``'subject'``, ``'date'`` (stringified, or ``None``) and
        ``'sender'``. Each email contributes at most one occurrence per ID.
    """
    case_patterns = [
        (r'Case\s*#?\s*:?\s*(\d{8})', 'Microsoft Case'),
        (r'\[Case\s*#?\s*:?\s*(\d{8})\]', 'Microsoft Case'),
        (r'TrackingID#(\d{16})', 'Tracking ID'),
    ]

    cases = defaultdict(list)
    for email in emails:
        subject = email.subject or ""
        # A bracketed subject such as "[Case #12345678]" satisfies both of
        # the first two patterns; remember the IDs already recorded for this
        # email so it is not counted twice under the same case.
        seen_ids = set()
        for pattern, case_type in case_patterns:
            match = re.search(pattern, subject, re.I)
            if not match:
                continue
            case_id = match.group(1)
            if case_id in seen_ids:
                continue
            seen_ids.add(case_id)
            cases[case_id].append({
                'type': case_type,
                'subject': subject,
                'date': str(email.date) if email.date else None,
                'sender': email.sender,
            })
    return dict(cases)
|
|
|
|
|
|
def analyze_time_distribution(emails):
    """
    Analyze email distribution over time.

    Emails whose ``date`` is falsy, or is not a datetime-like object, are
    skipped entirely so the three tallies always describe the same set of
    emails.

    Args:
        emails: Iterable of objects exposing a ``date`` attribute that
            provides ``year``, ``month`` and ``weekday()``.

    Returns:
        dict with three keys:
            ``'by_year'``        - {year: count}, most frequent year first
            ``'by_month'``       - {"YYYY-MM": count}, sorted by key
            ``'by_day_of_week'`` - {"Mon".."Sun": count}, always all 7 days
    """
    by_year = Counter()
    by_month = Counter()
    by_day_of_week = Counter()

    # Index matches datetime.weekday(): Monday == 0 ... Sunday == 6.
    day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    for email in emails:
        date = email.date
        if not date:
            continue

        # Derive every key before touching a counter so a malformed date
        # cannot leave the tallies partially updated.
        try:
            year = date.year
            month_key = f"{year}-{date.month:02d}"
            day_name = day_names[date.weekday()]
        except (AttributeError, TypeError, ValueError, IndexError):
            # Best effort: ignore values that are not usable dates.
            continue

        by_year[year] += 1
        by_month[month_key] += 1
        by_day_of_week[day_name] += 1

    return {
        'by_year': dict(by_year.most_common()),
        'by_month': dict(sorted(by_month.items())),
        'by_day_of_week': {d: by_day_of_week.get(d, 0) for d in day_names},
    }
|
|
|
|
|
|
def main(email_dir="/home/bob/Documents/Email Manager/emails/brett-microsoft"):
    """
    Run the full analysis: parse, classify, report to console, save JSON.

    Args:
        email_dir: Directory handed to ``LocalFileParser``. Defaults to the
            original hard-coded brett-microsoft export location, so calling
            ``main()`` with no arguments behaves as before.

    Side effects:
        Prints the report to stdout and writes
        ``<repo>/data/brett_microsoft_analysis.json``. When the parser yields
        no emails, a notice is printed and nothing is written (the percentage
        calculations would otherwise divide by zero).
    """
    output_dir = Path(__file__).parent.parent / "data"
    output_dir.mkdir(exist_ok=True)

    print("="*70)
    print("BRETT MICROSOFT (OUTLOOK) DATASET ANALYSIS")
    print("="*70)
    print(f"\nSource: {email_dir}")
    print(f"Output: {output_dir}")

    # Parse emails
    print("\nParsing emails...")
    parser = LocalFileParser(email_dir)
    emails = parser.parse_emails()
    total = len(emails)
    print(f"Total emails: {total}")

    if not total:
        print("\nNo emails found - nothing to analyze.")
        return

    # Date range
    dates = sorted(e.date for e in emails if e.date)
    if dates:
        print(f"Date range: {dates[0].strftime('%Y-%m-%d')} to {dates[-1].strftime('%Y-%m-%d')}")

    # Classify all emails (once; the per-category subcategory tallies are
    # collected in the same pass instead of re-classifying later).
    print("\nClassifying emails...")

    category_counts = Counter()
    subcategory_counts = Counter()
    subcats_by_category = defaultdict(Counter)

    for email in emails:
        category, subcategory = classify_email(email)
        category_counts[category] += 1
        subcategory_counts[f"{category}: {subcategory}"] += 1
        subcats_by_category[category][subcategory] += 1

    # Print category summary
    print("\n" + "="*70)
    print("TOP-LEVEL CATEGORY SUMMARY")
    print("="*70)

    for category, count in category_counts.most_common():
        pct = count / total * 100
        bar = "█" * int(pct / 2)  # one block per 2%
        print(f"\n{category} ({count} emails, {pct:.1f}%)")
        print(f" {bar}")

        # Show subcategories
        for subcat, subcount in subcats_by_category[category].most_common():
            print(f" - {subcat}: {subcount}")

    # Analyze senders
    print("\n" + "="*70)
    print("TOP SENDERS BY VOLUME")
    print("="*70)

    sender_counts = Counter(e.sender for e in emails)
    for sender, count in sender_counts.most_common(15):
        pct = count / total * 100
        print(f" {count:4d} ({pct:4.1f}%) {sender}")

    # Time analysis
    print("\n" + "="*70)
    print("TIME DISTRIBUTION")
    print("="*70)

    time_dist = analyze_time_distribution(emails)

    print("\nBy Year:")
    for year, count in sorted(time_dist['by_year'].items()):
        bar = "█" * (count // 10)
        print(f" {year}: {count:4d} {bar}")

    print("\nBy Day of Week:")
    for day, count in time_dist['by_day_of_week'].items():
        bar = "█" * (count // 5)
        print(f" {day}: {count:3d} {bar}")

    # Extract case IDs
    print("\n" + "="*70)
    print("MICROSOFT SUPPORT CASES TRACKED")
    print("="*70)

    cases = extract_case_ids(emails)
    if cases:
        for case_id, occurrences in sorted(cases.items()):
            print(f"\n Case/Tracking: {case_id} ({len(occurrences)} emails)")
            # Only the first three occurrences are shown per case.
            for occ in occurrences[:3]:
                print(f" - {occ['date']}: {occ['subject'][:50]}...")
    else:
        print(" No case IDs detected")

    # Actionable insights
    print("\n" + "="*70)
    print("INBOX CHARACTER ASSESSMENT")
    print("="*70)

    business_count = (category_counts.get('Business Operations', 0) +
                      category_counts.get('Client Work', 0) +
                      category_counts.get('Developer', 0))
    business_pct = business_count / total * 100
    personal_pct = category_counts.get('Personal', 0) / total * 100
    is_business = business_pct > 50

    print(f"\n Business/Professional: {business_pct:.1f}%")
    print(f" Personal: {personal_pct:.1f}%")
    print(f"\n ASSESSMENT: This is a {'BUSINESS' if is_business else 'MIXED'} inbox")

    # Save analysis data
    uncategorized = category_counts.get('Uncategorized', 0)
    categorized = total - uncategorized

    analysis_data = {
        'metadata': {
            'total_emails': total,
            'inbox_type': 'microsoft',
            'inbox_character': 'business' if is_business else 'mixed',
            'date_range': {
                'start': str(dates[0]) if dates else None,
                'end': str(dates[-1]) if dates else None,
            },
            'analyzed_at': datetime.now().isoformat(),
        },
        'categories': dict(category_counts),
        'subcategories': dict(subcategory_counts),
        'top_senders': dict(sender_counts.most_common(50)),
        'time_distribution': time_dist,
        'support_cases': cases,
        # "Accuracy" here means coverage: the share of emails that matched
        # any rule, not agreement with ground-truth labels.
        'classification_accuracy': {
            'categorized': categorized,
            'uncategorized': uncategorized,
            'accuracy_pct': categorized / total * 100,
        },
    }

    output_file = output_dir / "brett_microsoft_analysis.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(analysis_data, f, indent=2)

    print(f"\n\nAnalysis saved to: {output_file}")
    print("\n" + "="*70)
    print(f"CLASSIFICATION ACCURACY: {analysis_data['classification_accuracy']['accuracy_pct']:.1f}%")
    print(f"({analysis_data['classification_accuracy']['categorized']} categorized, "
          f"{analysis_data['classification_accuracy']['uncategorized']} uncategorized)")
    print("="*70)
|
|
|
|
|
|
# Script entry point: run the analysis only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|