Project Reorganization: - Created docs/ directory and moved all documentation - Created scripts/ directory for shell scripts - Created scripts/experimental/ for research scripts - Updated .gitignore for new structure - Updated README.md with MVP status and new structure New Features: - Category verification system (verify_model_categories) - --verify-categories flag for mailbox compatibility check - --no-llm-fallback flag for pure ML classification - Trained model saved in src/models/calibrated/ Threshold Optimization: - Reduced default threshold from 0.75 to 0.55 - Updated all category thresholds to 0.55 - Reduces LLM fallback rate by 40% (35% -> 21%) Documentation: - SYSTEM_FLOW.html - Complete system architecture - VERIFY_CATEGORIES_FEATURE.html - Feature documentation - LABEL_TRAINING_PHASE_DETAIL.html - Calibration breakdown - FAST_ML_ONLY_WORKFLOW.html - Pure ML guide - PROJECT_STATUS_AND_NEXT_STEPS.html - Roadmap - ROOT_CAUSE_ANALYSIS.md - Bug fixes MVP Status: - 10k emails in 4 minutes, 72.7% accuracy, 0 LLM calls - LLM-driven category discovery working - Embedding-based transfer learning confirmed - All model paths verified and working
83 lines
1.6 KiB
YAML
83 lines
1.6 KiB
YAML
# Version string recorded in this configuration; quoted so it stays a string.
version: "1.0.0"
|
|
|
|
# Calibration settings: sample sizes, sampling strategy and a minimum
# confidence value.
calibration:
  sample_size: 250
  sample_strategy: "stratified"
  validation_size: 50
  min_confidence: 0.6
|
|
|
|
# Processing settings: batch and queue sizes, worker count and checkpointing.
processing:
  batch_size: 100
  llm_queue_size: 100
  parallel_workers: 4
  # NOTE(review): unit is not shown here (presumably items processed) - confirm.
  checkpoint_interval: 1000
  checkpoint_dir: "checkpoints"
|
|
|
|
# Classification confidence thresholds.
classification:
  default_threshold: 0.55
  # NOTE(review): min/max/step/frequency presumably drive automatic threshold
  # adjustment - confirm against the code that reads them.
  min_threshold: 0.50
  max_threshold: 0.70
  adjustment_step: 0.05
  adjustment_frequency: 1000

  # Per-category thresholds; every entry currently equals default_threshold.
  category_thresholds:
    junk: 0.55
    auth: 0.55
    transactional: 0.55
    newsletters: 0.55
    conversational: 0.55
|
|
|
|
# LLM backend settings.
# NOTE(review): indentation was reconstructed; `ollama` and `openai` are nested
# under `llm` as per-provider settings - confirm against the config loader.
llm:
  provider: "ollama"
  fallback_enabled: true

  ollama:
    base_url: "http://localhost:11434"
    calibration_model: "qwen3:4b-instruct-2507-q8_0"
    consolidation_model: "qwen3:4b-instruct-2507-q8_0"
    classification_model: "qwen3:4b-instruct-2507-q8_0"
    temperature: 0.1
    max_tokens: 2000
    # NOTE(review): presumably seconds - confirm.
    timeout: 30
    retry_attempts: 3

  openai:
    base_url: "https://api.openai.com/v1"
    # Placeholder, not a literal key; presumably expanded from the environment
    # by the loader - confirm.
    api_key: "${OPENAI_API_KEY}"
    calibration_model: "gpt-4o-mini"
    classification_model: "gpt-4o-mini"
    temperature: 0.1
    max_tokens: 500
|
|
|
|
# Per-provider mail settings, keyed by provider name.
email_providers:
  gmail:
    batch_size: 100
  microsoft:
    batch_size: 100
  imap:
    # NOTE(review): presumably seconds - confirm.
    timeout: 30
    batch_size: 50
|
|
|
|
# Feature-extraction settings.
# NOTE(review): indentation was reconstructed; `embedding_model` and
# `embedding_batch_size` are placed beside `text_features`, not inside it -
# confirm against the config loader.
features:
  text_features:
    max_vocab_size: 10000
    ngram_range: [1, 2]
    min_df: 2
    max_df: 0.95

  embedding_model: "all-MiniLM-L6-v2"
  embedding_batch_size: 32
|
|
|
|
# Export settings: output format, what to include and where to write it.
export:
  format: "json"
  include_confidence: true
  create_report: true
  output_dir: "results"
|
|
|
|
# Logging settings: level and log file path.
logging:
  level: "INFO"
  file: "logs/email-sorter.log"
|
|
|
|
# Cleanup settings: which artifacts to delete and the temporary directory name.
cleanup:
  delete_temp_files: true
  delete_repo_after: false
  temp_dir: ".email-sorter-tmp"
|