email-sorter/tools/generate_html_report.py
FSSCoding 8f25e30f52 Rewrite CLAUDE.md and clean project structure
- Rewrote CLAUDE.md with comprehensive development guide
- Archived 20 old docs to docs/archive/
- Added PROJECT_ROADMAP_2025.md with research learnings
- Added CLASSIFICATION_METHODS_COMPARISON.md
- Added SESSION_HANDOVER_20251128.md
- Added tools for analysis (brett_gmail/microsoft analyzers)
- Updated .gitignore for archive folders
- Config changes for local vLLM endpoint
2025-11-28 13:07:27 +11:00

643 lines
20 KiB
Python

#!/usr/bin/env python3
"""
Generate interactive HTML report from email classification results.
Usage:
python tools/generate_html_report.py --input results.json --output report.html
"""
import argparse
import json
from pathlib import Path
from datetime import datetime
from collections import Counter, defaultdict
from html import escape
def load_results(input_path: str) -> dict:
    """Load classification results from a JSON file.

    Args:
        input_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON interchange is UTF-8; do not depend on the platform default
    # encoding, which is not UTF-8 on every system.
    with open(input_path, encoding="utf-8") as f:
        return json.load(f)
def extract_domain(sender: str) -> str:
    """Extract the lower-cased domain from a sender string.

    Args:
        sender: An e-mail address, optionally in ``Name <user@host>`` form.

    Returns:
        The text after the last ``@`` in lower case. When there is no ``@``
        the whole (trimmed) sender is returned in lower case. ``"unknown"``
        is returned when the sender is empty or no domain text remains.
    """
    if not sender:
        return "unknown"

    # Trim whitespace and angle brackets so "Name <user@host>" yields "host"
    # rather than "host>".
    cleaned = sender.strip().strip("<>").strip()
    _, at_sign, domain = cleaned.rpartition("@")
    result = domain if at_sign else cleaned
    return result.lower() or "unknown"
def format_date(date_str: str) -> str:
    """Format an ISO-8601 date string for display.

    Args:
        date_str: The date value to format. A trailing ``Z`` is accepted.

    Returns:
        ``"N/A"`` for an empty value, ``YYYY-MM-DD HH:MM`` when the value
        parses as an ISO date, otherwise the first 16 characters of the
        value's text.
    """
    if not date_str:
        return "N/A"

    # Non-string values (e.g. numeric timestamps) are shown as text instead
    # of raising from inside the fallback path.
    text = str(date_str)
    try:
        # Older Python versions do not accept the "Z" suffix directly.
        parsed = datetime.fromisoformat(text.replace("Z", "+00:00"))
    except ValueError:
        return text[:16]
    return parsed.strftime("%Y-%m-%d %H:%M")
def truncate(text: str, max_len: int = 60) -> str:
    """Shorten ``text`` to at most ``max_len`` characters.

    Text longer than ``max_len`` is cut to ``max_len - 3`` characters and
    suffixed with ``...``. Empty or missing text yields an empty string.
    """
    if not text:
        return ""
    fits = len(text) <= max_len
    return text if fits else f"{text[:max_len - 3]}..."
# Stylesheet embedded verbatim in the report. Kept outside the f-string
# template so the CSS braces do not need to be doubled.
_REPORT_STYLE = """
        :root {
            --bg-primary: #1a1a2e;
            --bg-secondary: #16213e;
            --bg-card: #0f3460;
            --text-primary: #eee;
            --text-secondary: #aaa;
            --accent: #e94560;
            --accent-hover: #ff6b6b;
            --success: #00d9a5;
            --warning: #ffc107;
            --border: #2a2a4a;
        }
        * {
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, sans-serif;
            background: var(--bg-primary);
            color: var(--text-primary);
            line-height: 1.6;
        }
        .container {
            max-width: 1400px;
            margin: 0 auto;
            padding: 20px;
        }
        header {
            background: var(--bg-secondary);
            padding: 30px;
            border-radius: 12px;
            margin-bottom: 30px;
            border: 1px solid var(--border);
        }
        header h1 {
            font-size: 2rem;
            margin-bottom: 10px;
            color: var(--accent);
        }
        .meta-info {
            display: flex;
            flex-wrap: wrap;
            gap: 20px;
            margin-top: 15px;
            color: var(--text-secondary);
            font-size: 0.9rem;
        }
        .meta-info span {
            background: var(--bg-card);
            padding: 5px 12px;
            border-radius: 20px;
        }
        .stats-grid {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 20px;
            margin-bottom: 30px;
        }
        .stat-card {
            background: var(--bg-secondary);
            padding: 20px;
            border-radius: 12px;
            border: 1px solid var(--border);
            text-align: center;
        }
        .stat-card .value {
            font-size: 2.5rem;
            font-weight: bold;
            color: var(--accent);
        }
        .stat-card .label {
            color: var(--text-secondary);
            font-size: 0.9rem;
            margin-top: 5px;
        }
        .tabs {
            display: flex;
            flex-wrap: wrap;
            gap: 10px;
            margin-bottom: 20px;
            border-bottom: 2px solid var(--border);
            padding-bottom: 10px;
        }
        .tab {
            padding: 10px 20px;
            background: var(--bg-secondary);
            border: 1px solid var(--border);
            border-radius: 8px 8px 0 0;
            cursor: pointer;
            transition: all 0.2s;
            color: var(--text-secondary);
        }
        .tab:hover {
            background: var(--bg-card);
            color: var(--text-primary);
        }
        .tab.active {
            background: var(--accent);
            color: white;
            border-color: var(--accent);
        }
        .tab .count {
            background: rgba(255,255,255,0.2);
            padding: 2px 8px;
            border-radius: 10px;
            font-size: 0.8rem;
            margin-left: 8px;
        }
        .tab-content {
            display: none;
        }
        .tab-content.active {
            display: block;
        }
        .email-table {
            width: 100%;
            border-collapse: collapse;
            background: var(--bg-secondary);
            border-radius: 12px;
            overflow: hidden;
        }
        .email-table th {
            background: var(--bg-card);
            padding: 15px;
            text-align: left;
            font-weight: 600;
            color: var(--text-primary);
            position: sticky;
            top: 0;
        }
        .email-table td {
            padding: 12px 15px;
            border-bottom: 1px solid var(--border);
            color: var(--text-secondary);
        }
        .email-table tr:hover td {
            background: var(--bg-card);
            color: var(--text-primary);
        }
        .email-table .subject {
            max-width: 400px;
            color: var(--text-primary);
        }
        .email-table .sender {
            max-width: 250px;
        }
        .confidence {
            display: inline-block;
            padding: 3px 10px;
            border-radius: 12px;
            font-size: 0.85rem;
            font-weight: 500;
        }
        .confidence.high {
            background: rgba(0, 217, 165, 0.2);
            color: var(--success);
        }
        .confidence.medium {
            background: rgba(255, 193, 7, 0.2);
            color: var(--warning);
        }
        .confidence.low {
            background: rgba(233, 69, 96, 0.2);
            color: var(--accent);
        }
        .method-badge {
            display: inline-block;
            padding: 3px 8px;
            border-radius: 4px;
            font-size: 0.75rem;
            text-transform: uppercase;
        }
        .method-ml {
            background: rgba(0, 217, 165, 0.2);
            color: var(--success);
        }
        .method-rule {
            background: rgba(100, 149, 237, 0.2);
            color: cornflowerblue;
        }
        .method-llm {
            background: rgba(255, 193, 7, 0.2);
            color: var(--warning);
        }
        .section {
            background: var(--bg-secondary);
            padding: 25px;
            border-radius: 12px;
            margin-bottom: 30px;
            border: 1px solid var(--border);
        }
        .section h2 {
            margin-bottom: 20px;
            color: var(--accent);
            font-size: 1.3rem;
        }
        .chart-bar {
            display: flex;
            align-items: center;
            margin-bottom: 10px;
        }
        .chart-bar .label {
            width: 150px;
            font-size: 0.9rem;
            color: var(--text-secondary);
        }
        .chart-bar .bar-container {
            flex: 1;
            height: 24px;
            background: var(--bg-card);
            border-radius: 4px;
            overflow: hidden;
            margin: 0 15px;
        }
        .chart-bar .bar {
            height: 100%;
            background: linear-gradient(90deg, var(--accent), var(--accent-hover));
            transition: width 0.5s ease;
        }
        .chart-bar .value {
            width: 80px;
            text-align: right;
            font-size: 0.9rem;
        }
        .sender-list {
            display: grid;
            grid-template-columns: repeat(auto-fill, minmax(300px, 1fr));
            gap: 10px;
        }
        .sender-item {
            display: flex;
            justify-content: space-between;
            padding: 10px 15px;
            background: var(--bg-card);
            border-radius: 8px;
            font-size: 0.9rem;
        }
        .sender-item .email {
            color: var(--text-secondary);
            overflow: hidden;
            text-overflow: ellipsis;
            white-space: nowrap;
            max-width: 220px;
        }
        .sender-item .count {
            color: var(--accent);
            font-weight: bold;
        }
        .search-box {
            width: 100%;
            padding: 12px 20px;
            background: var(--bg-card);
            border: 1px solid var(--border);
            border-radius: 8px;
            color: var(--text-primary);
            font-size: 1rem;
            margin-bottom: 20px;
        }
        .search-box:focus {
            outline: none;
            border-color: var(--accent);
        }
        .table-container {
            max-height: 600px;
            overflow-y: auto;
            border-radius: 12px;
        }
        .attachment-icon {
            color: var(--warning);
        }
        footer {
            text-align: center;
            padding: 20px;
            color: var(--text-secondary);
            font-size: 0.85rem;
        }
"""

# Client-side behaviour embedded verbatim in the report.
_REPORT_SCRIPT = """
        function showTab(tabId, tabEl) {
            // Hide all tabs
            document.querySelectorAll('.tab-content').forEach(el => el.classList.remove('active'));
            document.querySelectorAll('.tab').forEach(el => el.classList.remove('active'));
            // Show selected tab. The clicked tab element is passed in explicitly:
            // event.target may be the inner count <span> rather than the tab.
            document.getElementById('tab-' + tabId).classList.add('active');
            tabEl.classList.add('active');
            // Re-apply the current search so the newly shown table is filtered too
            filterTable(document.querySelector('.search-box').value);
        }
        function filterTable(query) {
            query = query.toLowerCase();
            document.querySelectorAll('.tab-content.active tbody tr').forEach(row => {
                // data-search holds the untruncated subject and sender
                const text = ((row.dataset.search || '') + ' ' + row.textContent).toLowerCase();
                row.style.display = text.includes(query) ? '' : 'none';
            });
        }
"""


def _percentage(part: int, whole: int) -> float:
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is zero."""
    return part / whole * 100 if whole else 0.0


def _field_label(record: dict, key: str) -> str:
    """Return ``record[key]`` as text, or ``"unknown"`` when missing or empty."""
    return str(record.get(key) or "unknown")


def _confidence_value(record: dict) -> float:
    """Return the record's numeric confidence; missing or non-numeric counts as 0."""
    value = record.get("confidence")
    return value if isinstance(value, (int, float)) else 0


def _chart_bar(label: str, count: int, total: int, bar_style: str = "") -> str:
    """Render one chart bar; ``label`` must already be HTML-safe."""
    pct = _percentage(count, total)
    return f'''
            <div class="chart-bar">
                <div class="label">{label}</div>
                <div class="bar-container">
                    <div class="bar" style="width: {pct:.1f}%{bar_style}"></div>
                </div>
                <div class="value">{count:,} ({pct:.1f}%)</div>
            </div>
'''


def _tab_panel(tab_id: str, rows_html: str, show_category: bool,
               active: bool = False, table_id: str = "") -> str:
    """Render one tab's table panel around pre-rendered ``<tr>`` rows."""
    panel_class = "tab-content active" if active else "tab-content"
    id_attr = f' id="{table_id}"' if table_id else ""
    category_header = (
        "\n                                <th>Category</th>" if show_category else ""
    )
    return f'''
            <div id="tab-{tab_id}" class="{panel_class}">
                <div class="table-container">
                    <table class="email-table"{id_attr}>
                        <thead>
                            <tr>
                                <th>Date</th>
                                <th>Subject</th>
                                <th>Sender</th>{category_header}
                                <th>Confidence</th>
                                <th>Method</th>
                            </tr>
                        </thead>
                        <tbody>
{rows_html}
                        </tbody>
                    </table>
                </div>
            </div>
'''


def generate_html_report(results: dict, output_path: str):
    """Generate an interactive HTML report and write it to ``output_path``.

    The report contains summary cards, distribution charts (category,
    classification method, confidence), the 20 most frequent senders and a
    tabbed, searchable table of e-mails (one "All" tab plus one per category).

    Args:
        results: Mapping with an optional ``"metadata"`` dict (its
            ``"source"`` is shown in the header) and a ``"classifications"``
            list of per-e-mail dicts. The keys read from each record here are
            ``category``, ``method``, ``sender``, ``confidence`` and ``date``;
            missing values fall back to ``"unknown"`` / 0.
        output_path: Destination file; it is written as UTF-8.

    Raises:
        OSError: If ``output_path`` cannot be written.
    """
    metadata = results.get("metadata") or {}
    classifications = results.get("classifications") or []

    # Calculate statistics
    total = len(classifications)
    categories = Counter(_field_label(c, "category") for c in classifications)
    methods = Counter(_field_label(c, "method") for c in classifications)

    # Group by category
    by_category = defaultdict(list)
    for c in classifications:
        by_category[_field_label(c, "category")].append(c)

    # Categories ordered by descending count (ties keep first-seen order)
    sorted_categories = [cat for cat, _ in categories.most_common()]

    # Sender statistics
    sender_domains = Counter(extract_domain(c.get("sender") or "") for c in classifications)
    top_senders = Counter(
        str(c.get("sender") or "unknown") for c in classifications
    ).most_common(20)

    # Confidence distribution
    confidences = [_confidence_value(c) for c in classifications]
    high_conf = sum(1 for v in confidences if v >= 0.7)
    med_conf = sum(1 for v in confidences if 0.5 <= v < 0.7)
    low_conf = sum(1 for v in confidences if v < 0.5)

    def render_rows(records, show_category):
        """Render table rows for ``records``, newest date first."""
        newest_first = sorted(records, key=lambda r: str(r.get("date") or ""), reverse=True)
        return "".join(generate_email_row(r, show_category=show_category) for r in newest_first)

    # All fragments are built before the page template: nesting same-quote
    # f-strings inside an f-string is a syntax error before Python 3.12.
    category_bars = "".join(
        _chart_bar(escape(cat), categories[cat], total) for cat in sorted_categories
    )
    method_bars = "".join(
        _chart_bar(escape(method.upper()), methods[method], total) for method in sorted(methods)
    )
    confidence_bars = "".join([
        _chart_bar("High (&ge;70%)", high_conf, total,
                   "; background: linear-gradient(90deg, #00d9a5, #00ffcc);"),
        _chart_bar("Medium (50-70%)", med_conf, total,
                   "; background: linear-gradient(90deg, #ffc107, #ffdb58);"),
        _chart_bar("Low (&lt;50%)", low_conf, total,
                   "; background: linear-gradient(90deg, #e94560, #ff6b6b);"),
    ])
    sender_items = "".join(
        f'''
                <div class="sender-item">
                    <span class="email" title="{escape(sender)}">{escape(truncate(sender, 35))}</span>
                    <span class="count">{count}</span>
                </div>
'''
        for sender, count in top_senders
    )

    # Tabs are addressed by position, never by category text: a category
    # containing an apostrophe would otherwise break the inline JS string, and
    # a category literally named "all" would collide with the "All" tab id.
    tab_ids = {cat: f"cat-{index}" for index, cat in enumerate(sorted_categories)}
    category_tabs = "".join(
        f"""<div class="tab" onclick="showTab('{tab_ids[cat]}', this)">{escape(cat)}<span class="count">{categories[cat]}</span></div>"""
        for cat in sorted_categories
    )
    all_panel = _tab_panel(
        "all",
        render_rows(classifications, show_category=True),
        show_category=True,
        active=True,
        table_id="email-table-all",
    )
    category_panels = "".join(
        _tab_panel(tab_ids[cat], render_rows(by_category[cat], show_category=False),
                   show_category=False)
        for cat in sorted_categories
    )

    source_label = escape(str(metadata.get("source") or "unknown"))
    # Single timestamp so header and footer cannot disagree.
    generated_at = datetime.now()
    generated_short = generated_at.strftime("%Y-%m-%d %H:%M")
    generated_full = generated_at.strftime("%Y-%m-%d %H:%M:%S")
    category_count = len(categories)
    domain_count = len(sender_domains)

    # Generate HTML
    html = f'''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Email Classification Report</title>
    <style>{_REPORT_STYLE}    </style>
</head>
<body>
    <div class="container">
        <header>
            <h1>Email Classification Report</h1>
            <p>Automated analysis of email inbox</p>
            <div class="meta-info">
                <span>Generated: {generated_short}</span>
                <span>Source: {source_label}</span>
                <span>Total Emails: {total:,}</span>
            </div>
        </header>
        <div class="stats-grid">
            <div class="stat-card">
                <div class="value">{total:,}</div>
                <div class="label">Total Emails</div>
            </div>
            <div class="stat-card">
                <div class="value">{category_count}</div>
                <div class="label">Categories</div>
            </div>
            <div class="stat-card">
                <div class="value">{high_conf}</div>
                <div class="label">High Confidence (&ge;70%)</div>
            </div>
            <div class="stat-card">
                <div class="value">{domain_count}</div>
                <div class="label">Unique Domains</div>
            </div>
        </div>
        <div class="section">
            <h2>Category Distribution</h2>
{category_bars}
        </div>
        <div class="section">
            <h2>Classification Methods</h2>
{method_bars}
        </div>
        <div class="section">
            <h2>Confidence Distribution</h2>
{confidence_bars}
        </div>
        <div class="section">
            <h2>Top Senders</h2>
            <div class="sender-list">
{sender_items}
            </div>
        </div>
        <div class="section">
            <h2>Emails by Category</h2>
            <div class="tabs">
                <div class="tab active" onclick="showTab('all', this)">All<span class="count">{total}</span></div>
                {category_tabs}
            </div>
            <input type="text" class="search-box" placeholder="Search by subject, sender..." onkeyup="filterTable(this.value)">
{all_panel}
{category_panels}
        </div>
        <footer>
            Generated by Email Sorter | {generated_full}
        </footer>
    </div>
    <script>{_REPORT_SCRIPT}    </script>
</body>
</html>
'''

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html)

    print(f"Report generated: {output_path}")
    print(f" Total emails: {total:,}")
    print(f" Categories: {len(categories)}")
    # An empty result set has no top category to report.
    if sorted_categories:
        top_category = sorted_categories[0]
        print(f" Top category: {top_category} ({categories[top_category]:,})")
def generate_email_row(c: dict, show_category: bool = True) -> str:
    """Generate the HTML table row for one classified e-mail.

    Args:
        c: Classification record. Keys read: ``confidence``, ``method``,
            ``has_attachments``, ``category``, ``subject``, ``sender``,
            ``sender_name`` and ``date``. Missing or null values fall back to
            defaults instead of raising.
        show_category: When true, include the category cell.

    Returns:
        A ``<tr>`` fragment; every record-derived value is HTML-escaped.
    """
    raw_conf = c.get("confidence")
    # A null or non-numeric confidence is treated as 0 (lowest bucket).
    conf = raw_conf if isinstance(raw_conf, (int, float)) else 0
    conf_class = "high" if conf >= 0.7 else "medium" if conf >= 0.5 else "low"

    method = str(c.get("method") or "unknown")
    # Only class-name-safe characters go into the class attribute; the
    # displayed method text is escaped separately below.
    method_token = "".join(ch for ch in method if ch.isalnum() or ch in "-_")
    method_class = f"method-{method_token}"

    subject = str(c.get("subject") or "")
    sender = str(c.get("sender") or "")
    sender_label = str(c.get("sender_name") or sender)

    attachment_icon = (
        '<span class="attachment-icon" title="Has attachments">📎</span> '
        if c.get("has_attachments")
        else ""
    )
    category_col = (
        f'<td>{escape(str(c.get("category") or "unknown"))}</td>' if show_category else ""
    )
    # format_date returns the raw input text when it cannot parse it, so its
    # result is untrusted and must be escaped too.
    date_cell = escape(str(format_date(c.get("date"))))

    return f'''
        <tr data-search="{escape(subject)} {escape(sender)}">
            <td>{date_cell}</td>
            <td class="subject">{attachment_icon}{escape(truncate(subject or "No subject", 70))}</td>
            <td class="sender" title="{escape(sender)}">{escape(truncate(sender_label, 35))}</td>
            {category_col}
            <td><span class="confidence {conf_class}">{conf*100:.0f}%</span></td>
            <td><span class="method-badge {method_class}">{escape(method)}</span></td>
        </tr>
'''
def main() -> int:
    """Command-line entry point: read results JSON and write the HTML report.

    Returns:
        0 on success, 1 when the input file is missing or cannot be read or
        decoded.
    """
    import sys  # local import: only needed for stderr diagnostics

    parser = argparse.ArgumentParser(description="Generate HTML report from classification results")
    parser.add_argument("--input", "-i", required=True, help="Path to results.json")
    parser.add_argument("--output", "-o", default=None, help="Output HTML file path")
    args = parser.parse_args()

    input_path = Path(args.input)
    if not input_path.exists():
        # Diagnostics go to stderr so stdout stays clean for the success summary.
        print(f"Error: Input file not found: {input_path}", file=sys.stderr)
        return 1

    # Default: write report.html next to the input file.
    output_path = args.output or str(input_path.parent / "report.html")

    try:
        results = load_results(args.input)
    except (OSError, ValueError) as err:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error: Could not read results from {input_path}: {err}", file=sys.stderr)
        return 1

    generate_html_report(results, output_path)
    return 0
if __name__ == "__main__":
    # The builtin exit() is injected by the site module for interactive use
    # and may be absent (e.g. under `python -S`); raising SystemExit always
    # works and passes main()'s return value as the process exit status.
    raise SystemExit(main())