email-sorter/create_stratified_sample.py
FSSCoding a29d7d1401 Add stratified 100k Enron email sampler
Creates diverse, representative sample:
- Samples from full maildir (150 users, 2768 folders)
- Proportional stratified sampling
- Minimum 100 emails per folder for representation
- Reproducible with seed 42
- Generated: 102,152 stratified emails ready for calibration
2025-10-23 16:15:58 +11:00

190 lines
5.6 KiB
Python

#!/usr/bin/env python3
"""
Create stratified 100k sample from Enron dataset for calibration.
Ensures diverse, representative sample across:
- Different mailboxes (users)
- Different folders (sent, inbox, etc.)
- Time periods
- Email sizes
"""
import os
import random
import json
from pathlib import Path
from collections import defaultdict
from typing import List, Dict
import logging
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__name__)
def get_enron_structure(maildir_path: str = "maildir") -> Dict[str, List[Path]]:
    """Scan an Enron-style maildir and index email files by user folder.

    Expected layout: ``maildir/<user>/<folder>/<email_file>`` (one level of
    folders per user; deeper nesting is not descended into).

    Returns a mapping of ``"<user>/<folder>"`` to the list of regular-file
    Paths directly inside that folder.  Folders containing no files produce
    no entry.  Logs an error and returns an empty dict when the root
    directory does not exist.
    """
    root = Path(maildir_path)
    if not root.exists():
        logger.error(f"Maildir not found: {maildir_path}")
        return {}

    structure = defaultdict(list)
    for user_dir in root.iterdir():
        if not user_dir.is_dir():
            continue  # skip stray files at the maildir root
        for folder_dir in user_dir.iterdir():
            if not folder_dir.is_dir():
                continue  # skip stray files at the user level
            key = f"{user_dir.name}/{folder_dir.name}"
            emails = [entry for entry in folder_dir.iterdir() if entry.is_file()]
            # Only create a key when the folder actually holds files, matching
            # the append-per-file behavior (empty folders are invisible).
            if emails:
                structure[key].extend(emails)
    return structure
def _build_sample_plan(
    folder_stats: Dict[str, int],
    total_emails: int,
    target_size: int,
    min_per_folder: int,
) -> Dict[str, int]:
    """Compute per-folder sample counts: proportional allocation with a
    per-folder floor, then adjusted toward the exact target.

    Each folder is floored at min(min_per_folder, folder size) so small
    folders stay represented without ever over-allocating beyond their size.
    """
    sample_plan = {}
    for folder, count in folder_stats.items():
        # Proportional allocation, floored at the folder's effective minimum,
        # capped at the folder's actual size.
        allocated = int(count / total_emails * target_size)
        allocated = max(allocated, min(min_per_folder, count))
        sample_plan[folder] = min(allocated, count)

    # Adjust toward the exact target, visiting largest folders first.
    diff = target_size - sum(sample_plan.values())
    for folder, count in sorted(folder_stats.items(), key=lambda x: x[1], reverse=True):
        if diff == 0:
            break
        if diff > 0:  # need more: top up from remaining capacity
            add = min(diff, count - sample_plan[folder])
            sample_plan[folder] += add
            diff -= add
        else:  # need fewer: shave down to the folder's floor
            # BUG FIX: the floor must be min(min_per_folder, count).  Using a
            # bare min_per_folder made "removable" negative for folders smaller
            # than the minimum, which *added* emails and pushed diff further
            # from zero.  Clamp at 0 so we never remove a negative amount.
            floor = min(min_per_folder, count)
            remove = min(-diff, max(0, sample_plan[folder] - floor))
            sample_plan[folder] -= remove
            diff += remove
    # NOTE: diff may still be nonzero in pathological cases (e.g. floors alone
    # exceed the target); the caller reports the actual achieved total.
    return sample_plan


def create_stratified_sample(
    maildir_path: str = "arnold-j",
    target_size: int = 100000,
    output_file: str = "enron_100k_sample.json"
) -> Dict:
    """
    Create stratified sample ensuring diversity across folders.

    Strategy:
    1. Sample proportionally from each folder
    2. Ensure minimum representation from small folders
    3. Randomize within each stratum (seeded for reproducibility)
    4. Save sample metadata for reproducibility

    Args:
        maildir_path: Root of the maildir tree to sample from.
            NOTE(review): default "arnold-j" (a single user) disagrees with
            the module docs, which describe sampling the full "maildir" —
            looks like a debug leftover; kept for backward compatibility.
        target_size: Desired number of sampled emails.
        output_file: JSON file the sample manifest is written to.

    Returns:
        The manifest dict that was written (plan, stats, email paths), or
        an empty dict when no emails were found.
    """
    logger.info(f"Creating stratified sample of {target_size:,} emails from {maildir_path}")

    # Get dataset structure
    structure = get_enron_structure(maildir_path)
    if not structure:
        logger.error("No emails found!")
        return {}

    # Calculate folder sizes
    folder_stats = {folder: len(emails) for folder, emails in structure.items()}
    total_emails = sum(folder_stats.values())
    for folder, count in folder_stats.items():
        logger.info(f" {folder}: {count:,} emails")
    logger.info(f"\nTotal emails available: {total_emails:,}")

    if total_emails < target_size:
        logger.warning(f"Only {total_emails:,} emails available, using all")
        target_size = total_emails

    min_per_folder = 100  # Ensure minimum representation
    sample_plan = _build_sample_plan(folder_stats, total_emails, target_size, min_per_folder)

    # Hoisted: the original recomputed sum(sample_plan.values()) per log line.
    plan_total = sum(sample_plan.values())
    logger.info(f"\nSample Plan (total: {plan_total:,}):")
    for folder, count in sorted(sample_plan.items(), key=lambda x: x[1], reverse=True):
        pct = (count / plan_total) * 100
        logger.info(f" {folder}: {count:,} ({pct:.1f}%)")

    # Execute sampling
    random.seed(42)  # Reproducibility
    sample = {}
    for folder, target_count in sample_plan.items():
        emails = structure[folder]
        sampled = random.sample(emails, min(target_count, len(emails)))
        sample[folder] = [str(p) for p in sampled]

    # Flatten, then shuffle so the manifest is not grouped by folder
    all_sampled = [
        {'path': path, 'folder': folder}
        for folder, paths in sample.items()
        for path in paths
    ]
    random.shuffle(all_sampled)

    # Save sample metadata
    output_data = {
        'version': '1.0',
        'target_size': target_size,
        'actual_size': len(all_sampled),
        'maildir_path': maildir_path,
        'sample_plan': sample_plan,
        'folder_stats': folder_stats,
        'emails': all_sampled
    }
    with open(output_file, 'w') as f:
        json.dump(output_data, f, indent=2)

    logger.info(f"\n✅ Sample created: {len(all_sampled):,} emails")
    logger.info(f"📁 Saved to: {output_file}")
    logger.info(f"🎲 Random seed: 42 (reproducible)")
    return output_data
if __name__ == "__main__":
    import sys

    # CLI: create_stratified_sample.py [maildir] [target_size] [output_file]
    cli_args = sys.argv[1:]
    maildir_arg = cli_args[0] if len(cli_args) > 0 else "arnold-j"
    target_arg = int(cli_args[1]) if len(cli_args) > 1 else 100000
    output_arg = cli_args[2] if len(cli_args) > 2 else "enron_100k_sample.json"
    create_stratified_sample(maildir_arg, target_arg, output_arg)