"""Tests for feature extraction module."""

from datetime import datetime

import numpy as np
import pytest

from src.classification.feature_extractor import FeatureExtractor
from src.email_providers.base import Email, Attachment


def test_feature_extractor_init():
    """Test feature extractor initialization.

    The extractor must construct and expose an ``embedder`` attribute.
    The attribute's value is deliberately not checked: it may be ``None``
    if the embedder failed to load.
    """
    extractor = FeatureExtractor()

    assert extractor is not None
    # Only the presence of the attribute is required; None is acceptable.
    assert hasattr(extractor, 'embedder')


def test_extract_structural_features(sample_email):
    """Test structural feature extraction.

    NOTE(review): ``sample_email`` is a fixture defined outside this file;
    the value assertions below presume it carries exactly one attachment.
    """
    extractor = FeatureExtractor()
    features = extractor._extract_structural(sample_email)

    expected_keys = (
        'has_attachments',
        'attachment_count',
        'body_length',
        'subject_length',
        'time_of_day',
    )
    for key in expected_keys:
        assert key in features

    assert features['has_attachments'] is True
    assert features['attachment_count'] == 1


def test_extract_sender_features(sample_email):
    """Test sender feature extraction.

    NOTE(review): presumes the ``sample_email`` fixture (defined outside
    this file) has a sender at the ``company.com`` domain.
    """
    extractor = FeatureExtractor()
    features = extractor._extract_sender(sample_email)

    for key in ('sender_domain', 'sender_domain_type', 'is_noreply'):
        assert key in features

    assert features['sender_domain'] == 'company.com'
    assert features['sender_domain_type'] in [
        'freemail', 'corporate', 'noreply', 'unknown',
    ]


def test_extract_patterns(sample_email):
    """Test pattern extraction.

    Checks that the expected pattern keys are present and that every
    returned value is a bool or an int.
    """
    extractor = FeatureExtractor()
    features = extractor._extract_patterns(sample_email)

    for key in ('has_otp_pattern', 'has_invoice_pattern', 'has_meeting'):
        assert key in features

    assert all(isinstance(v, (bool, int)) for v in features.values())


def test_pattern_detection_otp():
    """Test OTP pattern detection."""
    email = Email(
        id='otp-test',
        subject='Verify your identity',
        sender='bank@example.com',
        body='Your OTP is 456789',
    )
    extractor = FeatureExtractor()
    features = extractor._extract_patterns(email)

    assert features.get('has_otp_pattern') is True


def test_pattern_detection_invoice():
    """Test invoice pattern detection."""
    email = Email(
        id='invoice-test',
        subject='Invoice #INV-2024-12345',
        sender='billing@vendor.com',
        body='Please pay for invoice #INV-2024-12345',
    )
    extractor = FeatureExtractor()
    features = extractor._extract_patterns(email)

    assert features.get('has_invoice_pattern') is True


def test_full_extraction(sample_email):
    """Test full feature extraction.

    Verifies the top-level keys of the result and, when the embedding
    exposes a ``shape`` attribute, that it is one-dimensional.
    """
    extractor = FeatureExtractor()
    features = extractor.extract(sample_email)

    assert features is not None
    for key in ('embedding', 'subject', 'body_snippet'):
        assert key in features

    # Check embedding is array; objects without a shape are not checked.
    embedding = features['embedding']
    if hasattr(embedding, 'shape'):
        assert len(embedding.shape) == 1


def test_batch_extraction(sample_emails):
    """Test batch feature extraction.

    Skipped when batch extraction raises ``ImportError`` (pandas not
    available) or returns ``None``; otherwise the result must contain one
    row per input email.
    """
    extractor = FeatureExtractor()

    # Only test if pandas available. Keep the try body minimal so that an
    # ImportError can only originate from the extraction call itself.
    try:
        df = extractor.extract_batch(sample_emails)
    except ImportError:
        pytest.skip("pandas not available")

    # Report a skip rather than passing silently with nothing verified.
    if df is None:
        pytest.skip("extract_batch returned None")

    assert len(df) == len(sample_emails)
    assert df.shape[0] == len(sample_emails)


def test_freemail_detection():
    """Test freemail domain detection."""
    email = Email(
        id='freemail-test',
        subject='Hello',
        sender='user@gmail.com',
        body='Test',
    )
    extractor = FeatureExtractor()
    features = extractor._extract_sender(email)

    assert features.get('sender_domain_type') == 'freemail'


def test_noreply_detection():
    """Test noreply sender detection."""
    email = Email(
        id='noreply-test',
        subject='Alert',
        sender='noreply@system.com',
        body='Automated alert',
    )
    extractor = FeatureExtractor()
    features = extractor._extract_sender(email)

    assert features.get('is_noreply') is True
    assert features.get('sender_domain_type') == 'noreply'