import {
  FrontmatterGenerator,
  FrontmatterOptions,
  DeterministicFields,
  LLMEnrichment,
  OutputMode,
  DEFAULTS
} from '../src';
import * as yaml from 'js-yaml';

/**
 * Parses a frontmatter string produced by the generator into a plain object.
 *
 * Strips the leading `---\n` and trailing `\n---` delimiters and hands the
 * remainder to `yaml.load`. The result is asserted to be a string-keyed record
 * so individual fields can be passed to `expect(...)`; no schema validation is
 * performed here.
 *
 * @param frontmatter - Full frontmatter text, including the `---` delimiters.
 * @returns The parsed YAML mapping.
 */
function parseFrontmatter(frontmatter: string): Record<string, unknown> {
  const yamlContent = frontmatter.replace(/^---\n/, '').replace(/\n---$/, '');
  return yaml.load(yamlContent) as Record<string, unknown>;
}

/**
 * Test suite for FrontmatterGenerator: covers generate() and
 * generateMarkdown() across the 'none', 'balanced' and 'complete' output
 * modes, plus per-parser metadata scenarios and edge cases.
 */
describe('FrontmatterGenerator', () => {
  // Minimal set of options shared by most tests; individual tests spread and
  // override it as needed.
  const baseOptions: FrontmatterOptions = {
    generator: 'fss-parse-pdf',
    version: '1.0.0',
    title: 'Test Document'
  };

  describe('generate()', () => {
    it('should return empty string for none mode', () => {
      const result = FrontmatterGenerator.generate(baseOptions, {}, undefined, 'none');

      expect(result).toBe('');
    });

    it('should generate valid YAML frontmatter', () => {
      const result = FrontmatterGenerator.generate(baseOptions);

      expect(result).toMatch(/^---\n/);
      expect(result).toMatch(/\n---$/);

      // Extract and parse YAML
      const parsed = parseFrontmatter(result);
      expect(parsed).toBeDefined();
    });

    it('should include core required fields', () => {
      const result = FrontmatterGenerator.generate(baseOptions);
      const parsed = parseFrontmatter(result);

      expect(parsed.profile).toBe(DEFAULTS.profile);
      expect(parsed.generator).toBe('fss-parse-pdf');
      expect(parsed.version).toBe('1.0.0');
      expect(parsed.title).toBe('Test Document');
      expect(parsed.extraction_confidence).toBe(DEFAULTS.extractionConfidence);
      expect(parsed.content_quality).toBe(DEFAULTS.contentQuality);
      expect(parsed.created).toBeDefined();
    });

    it('should include source_file when provided', () => {
      const options: FrontmatterOptions = {
        ...baseOptions,
        sourcePath: '/path/to/file.pdf'
      };

      const result = FrontmatterGenerator.generate(options);
      const parsed = parseFrontmatter(result);

      expect(parsed.source_file).toBe('/path/to/file.pdf');
    });

    it('should use custom profile when provided', () => {
      const options: FrontmatterOptions = {
        ...baseOptions,
        profile: 'technical'
      };

      const result = FrontmatterGenerator.generate(options);
      const parsed = parseFrontmatter(result);

      expect(parsed.profile).toBe('technical');
    });

    it('should add LLM enrichment placeholders when no enrichment provided', () => {
      const result = FrontmatterGenerator.generate(baseOptions);
      const parsed = parseFrontmatter(result);

      expect(parsed.summary).toBe('');
      expect(parsed.tags).toEqual([]);
      expect(parsed.category).toBe('');
    });

    it('should include LLM enrichment fields when provided', () => {
      const enrichment: LLMEnrichment = {
        summary: 'Test summary',
        tags: ['tag1', 'tag2'],
        category: 'technical',
        audience: 'expert',
        doc_purpose: 'reference',
        complexity: 4,
        actionable: true,
        key_technologies: ['TypeScript', 'Node.js']
      };

      const result = FrontmatterGenerator.generate(baseOptions, {}, enrichment);
      const parsed = parseFrontmatter(result);

      expect(parsed.summary).toBe('Test summary');
      expect(parsed.tags).toEqual(['tag1', 'tag2']);
      expect(parsed.category).toBe('technical');
      expect(parsed.audience).toBe('expert');
      expect(parsed.doc_purpose).toBe('reference');
      expect(parsed.complexity).toBe(4);
      expect(parsed.actionable).toBe(true);
      expect(parsed.key_technologies).toEqual(['TypeScript', 'Node.js']);
    });
  });

  describe('balanced mode', () => {
    it('should include only balanced fields from deterministic', () => {
      const deterministic: DeterministicFields = {
        word_count: 1000,
        page_count: 5,
        character_count: 5000,
        has_tables: true,
        _internal_field: 'should be excluded',
        rare_field: 'should be excluded in balanced'
      };

      const result = FrontmatterGenerator.generate(baseOptions, deterministic, undefined, 'balanced');
      const parsed = parseFrontmatter(result);

      expect(parsed.word_count).toBe(1000);
      expect(parsed.page_count).toBe(5);
      expect(parsed.character_count).toBe(5000);
      expect(parsed.has_tables).toBe(true);
      expect(parsed._internal_field).toBeUndefined();
      expect(parsed.rare_field).toBeUndefined();
    });

    it('should exclude null and undefined balanced fields', () => {
      const deterministic: DeterministicFields = {
        word_count: 1000,
        // Cast is deliberate: this test feeds a null value to check that it is
        // dropped from the output.
        page_count: null as any,
        character_count: undefined
      };

      const result = FrontmatterGenerator.generate(baseOptions, deterministic, undefined, 'balanced');
      const parsed = parseFrontmatter(result);

      expect(parsed.word_count).toBe(1000);
      expect(parsed.page_count).toBeUndefined();
      expect(parsed.character_count).toBeUndefined();
    });
  });

  describe('complete mode', () => {
    it('should include all deterministic fields', () => {
      const deterministic: DeterministicFields = {
        word_count: 1000,
        page_count: 5,
        custom_field: 'included',
        nested: { deep: 'value' }
      };

      const result = FrontmatterGenerator.generate(baseOptions, deterministic, undefined, 'complete');
      const parsed = parseFrontmatter(result);

      expect(parsed.word_count).toBe(1000);
      expect(parsed.page_count).toBe(5);
      expect(parsed.custom_field).toBe('included');
      expect(parsed.nested).toEqual({ deep: 'value' });
    });

    it('should exclude fields starting with underscore', () => {
      const deterministic: DeterministicFields = {
        word_count: 1000,
        _private: 'excluded'
      };

      const result = FrontmatterGenerator.generate(baseOptions, deterministic, undefined, 'complete');
      const parsed = parseFrontmatter(result);

      expect(parsed.word_count).toBe(1000);
      expect(parsed._private).toBeUndefined();
    });

    it('should convert Date objects to ISO strings', () => {
      const testDate = new Date('2024-01-15T10:30:00Z');
      const deterministic: DeterministicFields = {
        modified: testDate
      };

      const result = FrontmatterGenerator.generate(baseOptions, deterministic, undefined, 'complete');
      const parsed = parseFrontmatter(result);

      expect(parsed.modified).toBe('2024-01-15T10:30:00.000Z');
    });
  });

  describe('generateMarkdown()', () => {
    it('should prepend frontmatter to content', () => {
      const content = '# My Document\n\nContent here';

      const result = FrontmatterGenerator.generateMarkdown(
        baseOptions,
        { word_count: 100 },
        content
      );

      expect(result).toMatch(/^---\n/);
      expect(result).toContain('# My Document');
      expect(result).toContain('Content here');
    });

    it('should return only content for none mode', () => {
      const content = '# My Document\n\nContent here';

      const result = FrontmatterGenerator.generateMarkdown(
        baseOptions,
        { word_count: 100 },
        content,
        undefined,
        'none'
      );

      expect(result).toBe(content);
    });

    it('should separate frontmatter and content with double newline', () => {
      const content = '# My Document';

      const result = FrontmatterGenerator.generateMarkdown(
        baseOptions,
        {},
        content
      );

      expect(result).toMatch(/---\n\n#/);
    });
  });

  describe('parser-specific scenarios', () => {
    it('should handle PDF metadata', () => {
      const pdfDeterministic: DeterministicFields = {
        word_count: 5000,
        page_count: 20,
        character_count: 25000,
        has_tables: true,
        has_images: true,
        table_count: 5,
        image_count: 10,
        has_toc: true,
        has_forms: false,
        encrypted: false,
        author: 'John Doe'
      };

      const result = FrontmatterGenerator.generate(
        { ...baseOptions, generator: 'fss-parse-pdf', profile: 'technical' },
        pdfDeterministic,
        undefined,
        'balanced'
      );
      const parsed = parseFrontmatter(result);

      expect(parsed.word_count).toBe(5000);
      expect(parsed.page_count).toBe(20);
      expect(parsed.has_tables).toBe(true);
      expect(parsed.table_count).toBe(5);
      expect(parsed.author).toBe('John Doe');
    });

    it('should handle email metadata', () => {
      const emailDeterministic: DeterministicFields = {
        from: 'sender@example.com',
        to: 'recipient@example.com',
        date: '2024-01-15T10:30:00Z',
        message_id: '',
        has_attachments: true,
        attachment_count: 3,
        word_count: 500,
        importance: 'high'
      };

      const result = FrontmatterGenerator.generate(
        { ...baseOptions, generator: 'fss-parse-email' },
        emailDeterministic,
        undefined,
        'balanced'
      );
      const parsed = parseFrontmatter(result);

      expect(parsed.from).toBe('sender@example.com');
      expect(parsed.to).toBe('recipient@example.com');
      expect(parsed.has_attachments).toBe(true);
      expect(parsed.attachment_count).toBe(3);
      expect(parsed.importance).toBe('high');
    });

    it('should handle audio metadata', () => {
      const audioDeterministic: DeterministicFields = {
        duration: 3600,
        duration_seconds: 3600,
        bitrate: 320,
        sample_rate: 44100,
        codec: 'mp3',
        has_transcript: true,
        speaker_count: 3,
        language: 'en'
      };

      const result = FrontmatterGenerator.generate(
        { ...baseOptions, generator: 'fss-parse-audio', profile: 'meeting' },
        audioDeterministic,
        undefined,
        'balanced'
      );
      const parsed = parseFrontmatter(result);

      expect(parsed.duration).toBe(3600);
      expect(parsed.bitrate).toBe(320);
      expect(parsed.has_transcript).toBe(true);
      expect(parsed.speaker_count).toBe(3);
    });

    it('should handle image metadata', () => {
      const imageDeterministic: DeterministicFields = {
        width: 1920,
        height: 1080,
        format: 'png',
        file_size: 2048000,
        channels: 4,
        has_alpha: true,
        ocr_confidence: 0.95
      };

      const result = FrontmatterGenerator.generate(
        { ...baseOptions, generator: 'fss-parse-image' },
        imageDeterministic,
        undefined,
        'balanced'
      );
      const parsed = parseFrontmatter(result);

      expect(parsed.width).toBe(1920);
      expect(parsed.height).toBe(1080);
      expect(parsed.format).toBe('png');
      expect(parsed.ocr_confidence).toBe(0.95);
    });

    it('should handle video metadata', () => {
      const videoDeterministic: DeterministicFields = {
        duration: 7200,
        width: 1920,
        height: 1080,
        fps: 30,
        aspect_ratio: '16:9',
        bitrate: 8000,
        video_codec: 'h264',
        audio_codec: 'aac'
      };

      const result = FrontmatterGenerator.generate(
        { ...baseOptions, generator: 'fss-parse-video', profile: 'meeting' },
        videoDeterministic,
        undefined,
        'balanced'
      );
      const parsed = parseFrontmatter(result);

      expect(parsed.duration).toBe(7200);
      expect(parsed.fps).toBe(30);
      expect(parsed.aspect_ratio).toBe('16:9');
    });

    it('should handle presentation metadata', () => {
      const presentationDeterministic: DeterministicFields = {
        slide_count: 25,
        total_slides: 25,
        word_count: 3000,
        has_images: true,
        image_count: 15,
        chart_count: 5,
        has_speaker_notes: true,
        author: 'Jane Smith'
      };

      const result = FrontmatterGenerator.generate(
        { ...baseOptions, generator: 'fss-parse-presentation', profile: 'technical' },
        presentationDeterministic,
        undefined,
        'balanced'
      );
      const parsed = parseFrontmatter(result);

      expect(parsed.slide_count).toBe(25);
      expect(parsed.chart_count).toBe(5);
      expect(parsed.has_speaker_notes).toBe(true);
    });

    it('should handle excel metadata', () => {
      const excelDeterministic: DeterministicFields = {
        sheet_count: 3,
        row_count: 1000,
        column_count: 20,
        author: 'Data Analyst'
      };

      const result = FrontmatterGenerator.generate(
        { ...baseOptions, generator: 'fss-parse-excel' },
        excelDeterministic,
        undefined,
        'balanced'
      );
      const parsed = parseFrontmatter(result);

      expect(parsed.sheet_count).toBe(3);
      expect(parsed.row_count).toBe(1000);
      expect(parsed.column_count).toBe(20);
    });

    it('should handle diagram metadata', () => {
      const diagramDeterministic: DeterministicFields = {
        diagram_count: 5,
        diagram_type: 'flowchart',
        valid_diagrams: 4,
        invalid_diagrams: 1,
        node_count: 20,
        edge_count: 25
      };

      const result = FrontmatterGenerator.generate(
        { ...baseOptions, generator: 'fss-parse-diagram', profile: 'schema' },
        diagramDeterministic,
        undefined,
        'balanced'
      );
      const parsed = parseFrontmatter(result);

      expect(parsed.diagram_count).toBe(5);
      expect(parsed.valid_diagrams).toBe(4);
      expect(parsed.node_count).toBe(20);
    });

    it('should handle data parser metadata', () => {
      const dataDeterministic: DeterministicFields = {
        record_count: 10000,
        format_detected: 'json',
        file_size: 5000000,
        column_count: 15
      };

      const result = FrontmatterGenerator.generate(
        { ...baseOptions, generator: 'fss-parse-data' },
        dataDeterministic,
        undefined,
        'balanced'
      );
      const parsed = parseFrontmatter(result);

      expect(parsed.record_count).toBe(10000);
      expect(parsed.format_detected).toBe('json');
    });
  });

  describe('edge cases', () => {
    it('should handle empty deterministic object', () => {
      const result = FrontmatterGenerator.generate(baseOptions, {});

      expect(result).toMatch(/^---\n/);
      expect(result).toMatch(/\n---$/);
    });

    it('should handle missing title with default', () => {
      const options: FrontmatterOptions = {
        generator: 'test',
        version: '1.0.0',
        title: ''
      };

      const result = FrontmatterGenerator.generate(options);
      const parsed = parseFrontmatter(result);

      expect(parsed.title).toBe('Untitled');
    });

    it('should handle arrays in deterministic fields', () => {
      const deterministic: DeterministicFields = {
        recipients: ['a@test.com', 'b@test.com']
      };

      const result = FrontmatterGenerator.generate(baseOptions, deterministic, undefined, 'complete');
      const parsed = parseFrontmatter(result);

      expect(parsed.recipients).toEqual(['a@test.com', 'b@test.com']);
    });

    it('should handle custom extraction confidence and quality', () => {
      const options: FrontmatterOptions = {
        ...baseOptions,
        extractionConfidence: 0.85,
        contentQuality: 1.2
      };

      const result = FrontmatterGenerator.generate(options);
      const parsed = parseFrontmatter(result);

      expect(parsed.extraction_confidence).toBe(0.85);
      expect(parsed.content_quality).toBe(1.2);
    });

    it('should handle zero values correctly', () => {
      // Zero is falsy but is still expected to appear in the output.
      const deterministic: DeterministicFields = {
        word_count: 0,
        page_count: 0
      };

      const result = FrontmatterGenerator.generate(baseOptions, deterministic);
      const parsed = parseFrontmatter(result);

      expect(parsed.word_count).toBe(0);
      expect(parsed.page_count).toBe(0);
    });

    it('should handle boolean false values', () => {
      // `false` is falsy but is still expected to appear in the output.
      const deterministic: DeterministicFields = {
        has_tables: false,
        encrypted: false
      };

      const result = FrontmatterGenerator.generate(baseOptions, deterministic);
      const parsed = parseFrontmatter(result);

      expect(parsed.has_tables).toBe(false);
      expect(parsed.encrypted).toBe(false);
    });
  });
});