Add stratified 100k Enron email sampler

Creates a diverse, representative sample:
- Samples from the full maildir (150 users, 2768 folders)
- Proportional stratified sampling
- Minimum of 100 emails per folder for representation
- Reproducible with seed 42
- Generated: 102,152 stratified emails ready for calibration
2025-10-23 16:15:58 +11:00 · 2025-10-23 16:15:58 +11:00 · a29d7d1401
commit a29d7d1401
parent fa09d14e52
1 changed files with 189 additions and 0 deletions
--- a/create_stratified_sample.py
+++ b/create_stratified_sample.py
@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+"""
+Create stratified 100k sample from Enron dataset for calibration.
+
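+Stratifies by mailbox folder (user/folder) so the sample stays diverse across:
+- Different mailboxes (users)
+- Different folders (sent, inbox, etc.)
+
+Time periods and email sizes are not stratified explicitly; they vary only
+as a by-product of sampling across users and folders.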
+"""
+
+import os
+import random
+import json
+from pathlib import Path
+from collections import defaultdict
+from typing import List, Dict
+import logging
+
+# Configure the root logger to emit bare messages (no level or logger-name
+# prefix) at INFO level and above.
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+# Module-level logger used by the functions in this file.
+logger = logging.getLogger(__name__)
+
+
+def get_enron_structure(maildir_path: str = "maildir") -> Dict[str, List[Path]]:
+    """
+    Map every "user/folder" stratum of an Enron maildir to its email files.
+
+    Expected layout: maildir/user/folder/email_file. Only regular files that
+    sit directly inside a folder are collected; nested sub-folders are not
+    descended into, and folders without any regular file get no entry.
+
+    Directory listings are sorted because Path.iterdir() yields entries in
+    arbitrary, filesystem-dependent order. A stable order of both the keys
+    and the per-folder lists is what lets a seeded sampler pick the same
+    emails on every machine.
+
+    Args:
+        maildir_path: Root directory that contains one sub-directory per user.
+
+    Returns:
+        Dict of {"user/folder": [email_paths]}, with keys and paths in sorted
+        order. An empty dict if maildir_path is missing or not a directory
+        (an error is logged in that case).
+    """
+    base_path = Path(maildir_path)
+
+    # is_dir() is False for a missing path as well as for a plain file, so a
+    # bad argument is reported here instead of crashing inside iterdir().
+    if not base_path.is_dir():
+        logger.error(f"Maildir not found: {maildir_path}")
+        return {}
+
+    structure = defaultdict(list)
+
+    # Iterate through users
+    for user_dir in sorted(base_path.iterdir()):
+        if not user_dir.is_dir():
+            continue
+
+        # Iterate through folders within user
+        for folder in sorted(user_dir.iterdir()):
+            if not folder.is_dir():
+                continue
+
+            # Collect emails in folder (regular files only)
+            emails = sorted(p for p in folder.iterdir() if p.is_file())
+            if emails:
+                structure[f"{user_dir.name}/{folder.name}"] = emails
+
+    return structure
+
+
+def _plan_sample_sizes(
+    folder_stats: Dict[str, int],
+    target_size: int,
+    min_per_folder: int = 100
+) -> Dict[str, int]:
+    """
+    Decide how many emails to draw from each folder.
+
+    Each folder first gets its proportional share of target_size, raised to
+    a floor of min(min_per_folder, folder size) and capped at the folder
+    size. Any remaining gap to target_size is then closed greedily, largest
+    folders first, never going below a folder's floor or above its size.
+
+    The returned total can still differ from target_size when the floors
+    alone add up to more than the target.
+
+    Args:
+        folder_stats: Number of available emails per folder.
+        target_size: Desired total number of sampled emails.
+        min_per_folder: Minimum draw per folder (or the whole folder if it
+            holds fewer emails than this).
+
+    Returns:
+        Dict of {folder: number of emails to sample}.
+    """
+    total_emails = sum(folder_stats.values())
+    if total_emails <= 0:
+        return {folder: 0 for folder in folder_stats}
+
+    # A folder smaller than min_per_folder can only contribute what it has.
+    floors = {
+        folder: min(min_per_folder, count)
+        for folder, count in folder_stats.items()
+    }
+
+    plan = {}
+    for folder, count in folder_stats.items():
+        proportional = int(count / total_emails * target_size)
+        plan[folder] = min(max(proportional, floors[folder]), count)
+
+    # Close the gap to the target, touching the largest folders first.
+    diff = target_size - sum(plan.values())
+    largest_first = sorted(folder_stats.items(), key=lambda x: x[1], reverse=True)
+
+    for folder, count in largest_first:
+        if diff == 0:
+            break
+        if diff > 0:  # Need more: take what the folder still has spare
+            step = min(diff, count - plan[folder])
+        else:  # Need fewer: give back only what lies above the floor
+            # Measuring against the folder's own floor (not the raw
+            # min_per_folder) keeps this non-negative for small folders;
+            # a negative value here would grow the plan instead.
+            removable = max(0, plan[folder] - floors[folder])
+            step = -min(-diff, removable)
+        plan[folder] += step
+        diff -= step
+
+    return plan
+
+
+def create_stratified_sample(
+    maildir_path: str = "arnold-j",
+    target_size: int = 100000,
+    output_file: str = "enron_100k_sample.json"
+) -> Dict:
+    """
+    Create stratified sample ensuring diversity across folders.
+
+    Strategy:
+    1. Sample proportionally from each folder
+    2. Ensure minimum representation from small folders
+    3. Randomize within each stratum
+    4. Save sample metadata for reproducibility
+
+    Side effects: seeds the global `random` module with 42, writes
+    output_file as JSON, and logs progress.
+
+    Args:
+        maildir_path: Directory handed to get_enron_structure(), which
+            expects a user/folder/email_file layout beneath it.
+        target_size: Desired number of emails; lowered to the number of
+            available emails when fewer exist.
+        output_file: Path of the JSON file to write.
+
+    Returns:
+        The dict that was written to output_file (keys: version,
+        target_size, actual_size, maildir_path, sample_plan, folder_stats,
+        emails), or an empty dict when no emails were found.
+    """
+    logger.info(f"Creating stratified sample of {target_size:,} emails from {maildir_path}")
+
+    # Get dataset structure
+    structure = get_enron_structure(maildir_path)
+
+    if not structure:
+        logger.error("No emails found!")
+        return {}
+
+    # Calculate folder sizes
+    folder_stats = {folder: len(emails) for folder, emails in structure.items()}
+    for folder, count in folder_stats.items():
+        logger.info(f"  {folder}: {count:,} emails")
+
+    total_emails = sum(folder_stats.values())
+    logger.info(f"\nTotal emails available: {total_emails:,}")
+
+    if total_emails < target_size:
+        logger.warning(f"Only {total_emails:,} emails available, using all")
+        target_size = total_emails
+
+    # Calculate per-folder sample sizes
+    sample_plan = _plan_sample_sizes(folder_stats, target_size)
+    planned_total = sum(sample_plan.values())
+
+    logger.info(f"\nSample Plan (total: {planned_total:,}):")
+    for folder, count in sorted(sample_plan.items(), key=lambda x: x[1], reverse=True):
+        pct = (count / planned_total) * 100 if planned_total else 0.0
+        logger.info(f"  {folder}: {count:,} ({pct:.1f}%)")
+
+    if planned_total != target_size:
+        # Happens when the per-folder minimums alone exceed the target.
+        logger.warning(
+            f"Sample plan totals {planned_total:,} emails, not the requested "
+            f"{target_size:,}: per-folder minimums leave no room to adjust"
+        )
+
+    # Execute sampling
+    random.seed(42)  # Reproducibility
+    all_sampled = []
+
+    for folder, target_count in sample_plan.items():
+        emails = structure[folder]
+        for path in random.sample(emails, min(target_count, len(emails))):
+            all_sampled.append({
+                'path': str(path),
+                'folder': folder
+            })
+
+    # Shuffle so the saved list is not grouped by folder
+    random.shuffle(all_sampled)
+
+    # Save sample metadata
+    output_data = {
+        'version': '1.0',
+        'target_size': target_size,
+        'actual_size': len(all_sampled),
+        'maildir_path': maildir_path,
+        'sample_plan': sample_plan,
+        'folder_stats': folder_stats,
+        'emails': all_sampled
+    }
+
+    with open(output_file, 'w', encoding='utf-8') as f:
+        json.dump(output_data, f, indent=2)
+
+    logger.info(f"\n✅ Sample created: {len(all_sampled):,} emails")
+    logger.info(f"📁 Saved to: {output_file}")
+    logger.info("🎲 Random seed: 42 (reproducible)")
+
+    return output_data
+
+
+if __name__ == "__main__":
+    import sys
+
+    # Positional CLI arguments: [maildir] [target_size] [output_file].
+    # Missing trailing arguments fall back to these defaults; extras are ignored.
+    fallbacks = ["arnold-j", "100000", "enron_100k_sample.json"]
+    given = sys.argv[1:]
+    maildir, raw_target, output = (given + fallbacks[len(given):])[:3]
+    target = int(raw_target)
+
+    create_stratified_sample(maildir, target, output)