#!/usr/bin/env python3
"""
Create stratified 100k sample from Enron dataset for calibration.

Ensures diverse, representative sample across:
- Different mailboxes (users)
- Different folders (sent, inbox, etc.)
- Time periods
- Email sizes
"""

import os
import random
import json
from pathlib import Path
from collections import defaultdict
from typing import List, Dict
import logging

logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__name__)


def get_enron_structure(maildir_path: str = "maildir") -> Dict[str, List[Path]]:
    """
    Analyze Enron dataset structure.

    Expected layout: maildir/user/folder/email_file

    Args:
        maildir_path: Root directory of the maildir tree.

    Returns:
        Dict of {"user/folder": [email_paths]}; empty dict if the root
        directory does not exist. Non-directory entries at the user and
        folder levels are skipped; only plain files count as emails.
    """
    base_path = Path(maildir_path)
    if not base_path.exists():
        logger.error(f"Maildir not found: {maildir_path}")
        return {}

    structure = defaultdict(list)

    # Iterate through users
    for user_dir in base_path.iterdir():
        if not user_dir.is_dir():
            continue
        user_name = user_dir.name

        # Iterate through folders within user
        for folder in user_dir.iterdir():
            if not folder.is_dir():
                continue
            folder_name = f"{user_name}/{folder.name}"

            # Collect emails in folder
            for email_file in folder.iterdir():
                if email_file.is_file():
                    structure[folder_name].append(email_file)

    return structure


def _build_sample_plan(
    folder_stats: Dict[str, int],
    total_emails: int,
    target_size: int,
    min_per_folder: int,
) -> Dict[str, int]:
    """
    Compute per-folder sample counts: proportional allocation with a
    minimum floor per folder, then adjust toward the exact target.

    Args:
        folder_stats: {folder: email_count} for every folder.
        total_emails: Sum of all folder counts (must be > 0).
        target_size: Desired total number of sampled emails.
        min_per_folder: Minimum representation per folder (capped at the
            folder's actual size).

    Returns:
        {folder: planned_count} with every count in
        [min(min_per_folder, size), size]. The total may exceed
        target_size when the per-folder floors make the target
        unreachable.
    """
    sample_plan: Dict[str, int] = {}
    for folder, count in folder_stats.items():
        # Proportional allocation, floored at the per-folder minimum
        # (or the whole folder, if it is smaller than the minimum).
        proportion = count / total_emails
        allocated = int(proportion * target_size)
        allocated = max(allocated, min(min_per_folder, count))
        sample_plan[folder] = min(allocated, count)

    # Adjust to hit the exact target, spreading the difference across
    # the largest folders first.
    diff = target_size - sum(sample_plan.values())
    if diff != 0:
        sorted_folders = sorted(folder_stats.items(), key=lambda x: x[1], reverse=True)
        for folder, count in sorted_folders:
            if diff == 0:
                break
            if diff > 0:
                # Need more: top up from whatever the folder still has.
                available = count - sample_plan[folder]
                add = min(diff, available)
                sample_plan[folder] += add
                diff -= add
            else:
                # Need fewer: shrink toward this folder's true floor.
                # BUG FIX: the floor is min(min_per_folder, count) — a
                # folder smaller than min_per_folder is already at its
                # floor. The old code used min_per_folder directly,
                # producing a NEGATIVE `removable` for small folders,
                # which *inflated* the plan past the folder size and
                # pushed diff further from zero.
                floor = min(min_per_folder, count)
                removable = max(0, sample_plan[folder] - floor)
                remove = min(-diff, removable)
                sample_plan[folder] -= remove
                diff += remove
    return sample_plan


def create_stratified_sample(
    maildir_path: str = "arnold-j",
    target_size: int = 100000,
    output_file: str = "enron_100k_sample.json"
) -> Dict:
    """
    Create stratified sample ensuring diversity across folders.

    Strategy:
    1. Sample proportionally from each folder
    2. Ensure minimum representation from small folders
    3. Randomize within each stratum
    4. Save sample metadata for reproducibility

    Args:
        maildir_path: Root of the maildir tree to sample from.
        target_size: Desired number of sampled emails (clamped to the
            number available).
        output_file: Path of the JSON file to write the sample to.

    Returns:
        The metadata dict that was written to output_file, or {} when no
        emails were found. Seeds the global `random` module with 42 for
        reproducibility (side effect).
    """
    logger.info(f"Creating stratified sample of {target_size:,} emails from {maildir_path}")

    # Get dataset structure
    structure = get_enron_structure(maildir_path)
    if not structure:
        logger.error("No emails found!")
        return {}

    # Calculate folder sizes
    folder_stats: Dict[str, int] = {}
    total_emails = 0
    for folder, emails in structure.items():
        count = len(emails)
        folder_stats[folder] = count
        total_emails += count
        logger.info(f"  {folder}: {count:,} emails")

    logger.info(f"\nTotal emails available: {total_emails:,}")

    if total_emails < target_size:
        logger.warning(f"Only {total_emails:,} emails available, using all")
        target_size = total_emails

    # Calculate proportional sample sizes
    min_per_folder = 100  # Ensure minimum representation
    sample_plan = _build_sample_plan(folder_stats, total_emails, target_size, min_per_folder)

    # Hoisted: total is loop-invariant (was re-summed per folder).
    planned_total = sum(sample_plan.values())
    logger.info(f"\nSample Plan (total: {planned_total:,}):")
    for folder, count in sorted(sample_plan.items(), key=lambda x: x[1], reverse=True):
        pct = (count / planned_total) * 100
        logger.info(f"  {folder}: {count:,} ({pct:.1f}%)")

    # Execute sampling
    random.seed(42)  # Reproducibility
    sample: Dict[str, List[str]] = {}
    for folder, target_count in sample_plan.items():
        emails = structure[folder]
        # Clamp defensively; the plan never exceeds the folder size.
        sampled = random.sample(emails, min(target_count, len(emails)))
        sample[folder] = [str(p) for p in sampled]

    # Flatten and save
    all_sampled = []
    for folder, paths in sample.items():
        for path in paths:
            all_sampled.append({
                'path': path,
                'folder': folder
            })

    # Shuffle for randomness
    random.shuffle(all_sampled)

    # Save sample metadata
    output_data = {
        'version': '1.0',
        'target_size': target_size,
        'actual_size': len(all_sampled),
        'maildir_path': maildir_path,
        'sample_plan': sample_plan,
        'folder_stats': folder_stats,
        'emails': all_sampled
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2)

    logger.info(f"\nāœ… Sample created: {len(all_sampled):,} emails")
    logger.info(f"šŸ“ Saved to: {output_file}")
    logger.info(f"šŸŽ² Random seed: 42 (reproducible)")

    return output_data


if __name__ == "__main__":
    import sys

    maildir = sys.argv[1] if len(sys.argv) > 1 else "arnold-j"
    target = int(sys.argv[2]) if len(sys.argv) > 2 else 100000
    output = sys.argv[3] if len(sys.argv) > 3 else "enron_100k_sample.json"

    create_stratified_sample(maildir, target, output)