#!/usr/bin/env python3
"""
Generate interactive HTML report from email classification results.

Usage:
    python tools/generate_html_report.py --input results.json --output report.html
"""

import argparse
import json
from pathlib import Path
from datetime import datetime
from collections import Counter, defaultdict
from html import escape


def load_results(input_path: str) -> dict:
    """Load classification results from a JSON file.

    Args:
        input_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's
    # default locale encoding (e.g. cp1252 on Windows).
    with open(input_path, encoding="utf-8") as f:
        return json.load(f)


def extract_domain(sender: str) -> str:
    """Extract the lower-cased domain from an email address.

    Args:
        sender: An address such as ``user@example.com`` or
            ``Name <user@example.com>``.

    Returns:
        ``"unknown"`` for an empty/missing sender, the part after the last
        ``@`` when there is one, otherwise the whole sender lower-cased.
    """
    if not sender:
        return "unknown"
    if "@" in sender:
        domain = sender.rsplit("@", 1)[-1]
        # "Name <user@example.com>" leaves a trailing ">" on the domain,
        # which would otherwise be counted as a distinct domain.
        return domain.strip().rstrip(">").rstrip().lower()
    return sender.lower()


def format_date(date_str: str) -> str:
    """Format an ISO-8601 date string as ``YYYY-MM-DD HH:MM`` for display.

    Args:
        date_str: ISO date string; a trailing ``Z`` is accepted as UTC.

    Returns:
        ``"N/A"`` for an empty/missing value, the formatted timestamp when
        the value parses, otherwise the raw input cut to 16 characters.
    """
    if not date_str:
        return "N/A"
    try:
        # Older fromisoformat() implementations reject the "Z" suffix.
        dt = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
    except ValueError:
        # Unparseable date: show (at most) the first 16 characters as-is.
        return date_str[:16]
    return dt.strftime("%Y-%m-%d %H:%M")


def truncate(text: str, max_len: int = 60) -> str:
    """Shorten *text* to at most *max_len* characters, ending with ``...``.

    Args:
        text: Text to shorten; ``None`` or empty yields ``""``.
        max_len: Maximum length of the returned string.

    Returns:
        *text* unchanged when it already fits, otherwise a shortened copy.
    """
    if not text:
        return ""
    if len(text) <= max_len:
        return text
    if max_len < 3:
        # No room for the ellipsis; a negative slice bound here would
        # return a string longer than max_len.
        return text[:max(max_len, 0)]
    return text[:max_len - 3] + "..."
def _percent(count: int, total: int) -> float:
    """Return *count* as a percentage of *total* (0.0 when *total* is 0)."""
    return count / total * 100 if total else 0.0


def _field(record: dict, key: str) -> str:
    """Return ``record[key]`` as text, or ``"unknown"`` when missing/None."""
    value = record.get(key)
    return "unknown" if value is None else str(value)


def _newest_first(records: list) -> list:
    """Return *records* sorted by their ``date`` value, newest first."""
    # Missing/None dates sort as "" so they end up last.
    return sorted(records, key=lambda r: r.get("date") or "", reverse=True)


def generate_html_report(results: dict, output_path: str) -> None:
    """Generate the report from classification results and write it to disk.

    Args:
        results: Decoded results document. ``results["metadata"]`` and
            ``results["classifications"]`` are read; both are optional.
        output_path: Path of the file to write (UTF-8).

    Raises:
        OSError: If *output_path* cannot be written.
    """
    metadata = results.get("metadata", {})
    classifications = results.get("classifications", [])

    # Calculate statistics
    total = len(classifications)
    categories = Counter(_field(c, "category") for c in classifications)
    methods = Counter(_field(c, "method") for c in classifications)

    # Group by category
    by_category = defaultdict(list)
    for c in classifications:
        by_category[_field(c, "category")].append(c)

    # Sort categories by count (ties keep first-seen order)
    sorted_categories = [cat for cat, _ in categories.most_common()]

    # Sender statistics
    sender_domains = Counter(
        extract_domain(c.get("sender", "")) for c in classifications
    )
    top_senders = Counter(
        c.get("sender", "unknown") for c in classifications
    ).most_common(20)

    # Confidence distribution (a missing or None confidence counts as 0)
    confidences = [c.get("confidence") or 0 for c in classifications]
    high_conf = sum(1 for v in confidences if v >= 0.7)
    med_conf = sum(1 for v in confidences if 0.5 <= v < 0.7)
    low_conf = sum(1 for v in confidences if v < 0.5)

    # Build the repeated fragments first. Nesting triple-quoted f-strings
    # inside the main template only parses on Python 3.12+ (PEP 701).
    category_stats = "".join(
        f"\n\n{escape(cat)}\n\n"
        f"{categories[cat]:,} ({_percent(categories[cat], total):.1f}%)\n\n"
        for cat in sorted_categories
    )
    method_stats = "".join(
        f"\n\n{escape(method.upper())}\n\n"
        f"{methods[method]:,} ({_percent(methods[method], total):.1f}%)\n\n"
        for method in sorted(methods)
    )
    sender_rows = "".join(
        f"\n\n{escape(truncate(sender, 35))} {count}\n\n"
        for sender, count in top_senders
    )
    category_tabs = "".join(
        f"\n\n{escape(cat)}{categories[cat]}\n\n" for cat in sorted_categories
    )
    all_rows = "".join(
        generate_email_row(c) for c in _newest_first(classifications)
    )
    category_tables = "".join(
        "\n\n"
        + "".join(
            generate_email_row(c, show_category=False)
            for c in _newest_first(by_category[cat])
        )
        + "\n\nDate\tSubject\tSender\tConfidence\tMethod\n\n"
        for cat in sorted_categories
    )

    generated_at = datetime.now().strftime("%Y-%m-%d %H:%M")
    source_label = escape(str(metadata.get("source", "unknown")))

    # Generate HTML
    html = f''' Email Classification Report

Email Classification Report

Automated analysis of email inbox

Generated: {generated_at} Source: {source_label} Total Emails: {total:,}

{total:,}

Total Emails

{len(categories)}

Categories

{high_conf}

High Confidence (≥70%)

{len(sender_domains)}

Unique Domains

Category Distribution

{category_stats}

Classification Methods

{method_stats}

Confidence Distribution

High (≥70%)

{high_conf:,} ({_percent(high_conf, total):.1f}%)

Medium (50-70%)

{med_conf:,} ({_percent(med_conf, total):.1f}%)

Low (<50%)

{low_conf:,} ({_percent(low_conf, total):.1f}%)

Top Senders

{sender_rows}

Emails by Category

All{total}

{category_tabs}

{all_rows}

Date\tSubject\tSender\tCategory\tConfidence\tMethod

{category_tables}

'''

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html)

    print(f"Report generated: {output_path}")
    print(f" Total emails: {total:,}")
    print(f" Categories: {len(categories)}")
    if sorted_categories:
        # Skipped for an empty result set: there is no top category.
        top = sorted_categories[0]
        print(f" Top category: {top} ({categories[top]:,})")


def generate_email_row(c: dict, show_category: bool = True) -> str:
    """Generate the report row for a single email.

    Args:
        c: One classification record. Reads ``date``, ``subject``,
            ``sender_name``, ``sender``, ``category``, ``confidence``,
            ``method`` and ``has_attachments``; all are optional.
        show_category: Include the category column when true.

    Returns:
        The row text, with every record-derived value HTML-escaped.
    """
    conf = c.get("confidence") or 0
    # format_date() returns the raw input when it cannot parse it, so the
    # result must be escaped like any other record-derived value.
    date_cell = escape(format_date(c.get("date")))
    subject_cell = escape(truncate(c.get("subject") or "No subject", 70))
    sender_cell = escape(
        truncate(c.get("sender_name") or c.get("sender", ""), 35)
    )
    method_cell = escape(_field(c, "method"))
    attachment_icon = '📎 ' if c.get("has_attachments") else ""
    category_col = escape(_field(c, "category")) if show_category else ""

    return (
        f" {date_cell} {attachment_icon}{subject_cell} {sender_cell}"
        f" {category_col} {conf * 100:.0f}% {method_cell} "
    )


def main() -> int:
    """Parse command-line arguments and generate the report.

    Returns:
        Process exit status: 0 on success, 1 when the input file is
        missing or is not valid JSON.
    """
    import sys  # local import: only needed for error output

    parser = argparse.ArgumentParser(
        description="Generate HTML report from classification results"
    )
    parser.add_argument(
        "--input", "-i", required=True, help="Path to results.json"
    )
    parser.add_argument(
        "--output", "-o", default=None, help="Output HTML file path"
    )
    args = parser.parse_args()

    input_path = Path(args.input)
    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}", file=sys.stderr)
        return 1

    # Default: write report.html next to the input file.
    output_path = args.output or str(input_path.parent / "report.html")

    try:
        results = load_results(args.input)
    except json.JSONDecodeError as exc:
        print(f"Error: Invalid JSON in {input_path}: {exc}", file=sys.stderr)
        return 1

    generate_html_report(results, output_path)
    return 0


if __name__ == "__main__":
    # SystemExit works even where the site-provided exit() is unavailable.
    raise SystemExit(main())