Add stratified 100k Enron email sampler
Creates diverse, representative sample: - Samples from full maildir (150 users, 2768 folders) - Proportional stratified sampling - Minimum 100 emails per folder for representation - Reproducible with seed 42 - Generated: 102,152 stratified emails ready for calibration
This commit is contained in:
parent
fa09d14e52
commit
a29d7d1401
189
create_stratified_sample.py
Normal file
189
create_stratified_sample.py
Normal file
@ -0,0 +1,189 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Create stratified 100k sample from Enron dataset for calibration.
|
||||
|
||||
Ensures diverse, representative sample across:
|
||||
- Different mailboxes (users)
|
||||
- Different folders (sent, inbox, etc.)
|
||||
- Time periods
|
||||
- Email sizes
|
||||
"""
|
||||
|
||||
import os
|
||||
import random
|
||||
import json
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
from typing import List, Dict
|
||||
import logging
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(message)s')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_enron_structure(maildir_path: str = "maildir") -> Dict[str, List[Path]]:
|
||||
"""
|
||||
Analyze Enron dataset structure.
|
||||
|
||||
Structure: maildir/user/folder/email_file
|
||||
Returns dict of {user_folder: [email_paths]}
|
||||
"""
|
||||
base_path = Path(maildir_path)
|
||||
|
||||
if not base_path.exists():
|
||||
logger.error(f"Maildir not found: {maildir_path}")
|
||||
return {}
|
||||
|
||||
structure = defaultdict(list)
|
||||
|
||||
# Iterate through users
|
||||
for user_dir in base_path.iterdir():
|
||||
if not user_dir.is_dir():
|
||||
continue
|
||||
|
||||
user_name = user_dir.name
|
||||
|
||||
# Iterate through folders within user
|
||||
for folder in user_dir.iterdir():
|
||||
if not folder.is_dir():
|
||||
continue
|
||||
|
||||
folder_name = f"{user_name}/{folder.name}"
|
||||
|
||||
# Collect emails in folder
|
||||
for email_file in folder.iterdir():
|
||||
if email_file.is_file():
|
||||
structure[folder_name].append(email_file)
|
||||
|
||||
return structure
|
||||
|
||||
|
||||
def create_stratified_sample(
|
||||
maildir_path: str = "arnold-j",
|
||||
target_size: int = 100000,
|
||||
output_file: str = "enron_100k_sample.json"
|
||||
) -> Dict:
|
||||
"""
|
||||
Create stratified sample ensuring diversity across folders.
|
||||
|
||||
Strategy:
|
||||
1. Sample proportionally from each folder
|
||||
2. Ensure minimum representation from small folders
|
||||
3. Randomize within each stratum
|
||||
4. Save sample metadata for reproducibility
|
||||
"""
|
||||
logger.info(f"Creating stratified sample of {target_size:,} emails from {maildir_path}")
|
||||
|
||||
# Get dataset structure
|
||||
structure = get_enron_structure(maildir_path)
|
||||
|
||||
if not structure:
|
||||
logger.error("No emails found!")
|
||||
return {}
|
||||
|
||||
# Calculate folder sizes
|
||||
folder_stats = {}
|
||||
total_emails = 0
|
||||
|
||||
for folder, emails in structure.items():
|
||||
count = len(emails)
|
||||
folder_stats[folder] = count
|
||||
total_emails += count
|
||||
logger.info(f" {folder}: {count:,} emails")
|
||||
|
||||
logger.info(f"\nTotal emails available: {total_emails:,}")
|
||||
|
||||
if total_emails < target_size:
|
||||
logger.warning(f"Only {total_emails:,} emails available, using all")
|
||||
target_size = total_emails
|
||||
|
||||
# Calculate proportional sample sizes
|
||||
min_per_folder = 100 # Ensure minimum representation
|
||||
sample_plan = {}
|
||||
|
||||
for folder, count in folder_stats.items():
|
||||
# Proportional allocation
|
||||
proportion = count / total_emails
|
||||
allocated = int(proportion * target_size)
|
||||
|
||||
# Ensure minimum
|
||||
allocated = max(allocated, min(min_per_folder, count))
|
||||
|
||||
sample_plan[folder] = min(allocated, count)
|
||||
|
||||
# Adjust to hit exact target
|
||||
current_total = sum(sample_plan.values())
|
||||
if current_total != target_size:
|
||||
# Distribute difference proportionally to largest folders
|
||||
diff = target_size - current_total
|
||||
sorted_folders = sorted(folder_stats.items(), key=lambda x: x[1], reverse=True)
|
||||
|
||||
for folder, _ in sorted_folders:
|
||||
if diff == 0:
|
||||
break
|
||||
if diff > 0: # Need more
|
||||
available = folder_stats[folder] - sample_plan[folder]
|
||||
add = min(abs(diff), available)
|
||||
sample_plan[folder] += add
|
||||
diff -= add
|
||||
else: # Need fewer
|
||||
removable = sample_plan[folder] - min_per_folder
|
||||
remove = min(abs(diff), removable)
|
||||
sample_plan[folder] -= remove
|
||||
diff += remove
|
||||
|
||||
logger.info(f"\nSample Plan (total: {sum(sample_plan.values()):,}):")
|
||||
for folder, count in sorted(sample_plan.items(), key=lambda x: x[1], reverse=True):
|
||||
pct = (count / sum(sample_plan.values())) * 100
|
||||
logger.info(f" {folder}: {count:,} ({pct:.1f}%)")
|
||||
|
||||
# Execute sampling
|
||||
random.seed(42) # Reproducibility
|
||||
sample = {}
|
||||
|
||||
for folder, target_count in sample_plan.items():
|
||||
emails = structure[folder]
|
||||
sampled = random.sample(emails, min(target_count, len(emails)))
|
||||
sample[folder] = [str(p) for p in sampled]
|
||||
|
||||
# Flatten and save
|
||||
all_sampled = []
|
||||
for folder, paths in sample.items():
|
||||
for path in paths:
|
||||
all_sampled.append({
|
||||
'path': path,
|
||||
'folder': folder
|
||||
})
|
||||
|
||||
# Shuffle for randomness
|
||||
random.shuffle(all_sampled)
|
||||
|
||||
# Save sample metadata
|
||||
output_data = {
|
||||
'version': '1.0',
|
||||
'target_size': target_size,
|
||||
'actual_size': len(all_sampled),
|
||||
'maildir_path': maildir_path,
|
||||
'sample_plan': sample_plan,
|
||||
'folder_stats': folder_stats,
|
||||
'emails': all_sampled
|
||||
}
|
||||
|
||||
with open(output_file, 'w') as f:
|
||||
json.dump(output_data, f, indent=2)
|
||||
|
||||
logger.info(f"\n✅ Sample created: {len(all_sampled):,} emails")
|
||||
logger.info(f"📁 Saved to: {output_file}")
|
||||
logger.info(f"🎲 Random seed: 42 (reproducible)")
|
||||
|
||||
return output_data
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
maildir = sys.argv[1] if len(sys.argv) > 1 else "arnold-j"
|
||||
target = int(sys.argv[2]) if len(sys.argv) > 2 else 100000
|
||||
output = sys.argv[3] if len(sys.argv) > 3 else "enron_100k_sample.json"
|
||||
|
||||
create_stratified_sample(maildir, target, output)
|
||||
Loading…
x
Reference in New Issue
Block a user