parser-frontmatter-ts/tests/generator.test.ts

526 lines
18 KiB
TypeScript

import {
FrontmatterGenerator,
FrontmatterOptions,
DeterministicFields,
LLMEnrichment,
OutputMode,
DEFAULTS
} from '../src';
import * as yaml from 'js-yaml';
describe('FrontmatterGenerator', () => {
  // Minimal option set shared by the tests below; individual tests spread it
  // and override or add fields as needed.
  const baseOptions: FrontmatterOptions = {
    generator: 'fss-parse-pdf',
    version: '1.0.0',
    title: 'Test Document'
  };

  /**
   * Strips the opening and closing `---` fence lines from a frontmatter
   * string and parses what remains as YAML.
   *
   * If a fence is absent the corresponding `replace` is a no-op, so the
   * text is handed to the YAML parser unchanged.
   *
   * @param frontmatter - String returned by `FrontmatterGenerator.generate()`.
   * @returns The parsed document, typed as a string-keyed record so tests can
   *   read individual fields. The cast is unchecked; tests assert on values.
   */
  const parseFrontmatter = (frontmatter: string): Record<string, unknown> => {
    const yamlContent = frontmatter.replace(/^---\n/, '').replace(/\n---$/, '');
    return yaml.load(yamlContent) as Record<string, unknown>;
  };

  // Core output of generate(): fencing, required fields, option overrides and
  // LLM enrichment handling.
  describe('generate()', () => {
    it('should return empty string for none mode', () => {
      const result = FrontmatterGenerator.generate(baseOptions, {}, undefined, 'none');

      expect(result).toBe('');
    });

    it('should generate valid YAML frontmatter', () => {
      const result = FrontmatterGenerator.generate(baseOptions);

      expect(result).toMatch(/^---\n/);
      expect(result).toMatch(/\n---$/);

      // Extract and parse YAML. A bare scalar would also be "defined", so the
      // shape is checked explicitly: the body must be a YAML mapping.
      const parsed = parseFrontmatter(result);
      expect(parsed).toBeDefined();
      expect(parsed).not.toBeNull();
      expect(typeof parsed).toBe('object');
      expect(Array.isArray(parsed)).toBe(false);
    });

    it('should include core required fields', () => {
      const parsed = parseFrontmatter(FrontmatterGenerator.generate(baseOptions));

      expect(parsed.profile).toBe(DEFAULTS.profile);
      expect(parsed.generator).toBe('fss-parse-pdf');
      expect(parsed.version).toBe('1.0.0');
      expect(parsed.title).toBe('Test Document');
      expect(parsed.extraction_confidence).toBe(DEFAULTS.extractionConfidence);
      expect(parsed.content_quality).toBe(DEFAULTS.contentQuality);
      expect(parsed.created).toBeDefined();
    });

    it('should include source_file when provided', () => {
      const options: FrontmatterOptions = {
        ...baseOptions,
        sourcePath: '/path/to/file.pdf'
      };

      const parsed = parseFrontmatter(FrontmatterGenerator.generate(options));

      expect(parsed.source_file).toBe('/path/to/file.pdf');
    });

    it('should use custom profile when provided', () => {
      const options: FrontmatterOptions = {
        ...baseOptions,
        profile: 'technical'
      };

      const parsed = parseFrontmatter(FrontmatterGenerator.generate(options));

      expect(parsed.profile).toBe('technical');
    });

    it('should add LLM enrichment placeholders when no enrichment provided', () => {
      const parsed = parseFrontmatter(FrontmatterGenerator.generate(baseOptions));

      expect(parsed.summary).toBe('');
      expect(parsed.tags).toEqual([]);
      expect(parsed.category).toBe('');
    });

    it('should include LLM enrichment fields when provided', () => {
      const enrichment: LLMEnrichment = {
        summary: 'Test summary',
        tags: ['tag1', 'tag2'],
        category: 'technical',
        audience: 'expert',
        doc_purpose: 'reference',
        complexity: 4,
        actionable: true,
        key_technologies: ['TypeScript', 'Node.js']
      };

      const parsed = parseFrontmatter(
        FrontmatterGenerator.generate(baseOptions, {}, enrichment)
      );

      expect(parsed.summary).toBe('Test summary');
      expect(parsed.tags).toEqual(['tag1', 'tag2']);
      expect(parsed.category).toBe('technical');
      expect(parsed.audience).toBe('expert');
      expect(parsed.doc_purpose).toBe('reference');
      expect(parsed.complexity).toBe(4);
      expect(parsed.actionable).toBe(true);
      expect(parsed.key_technologies).toEqual(['TypeScript', 'Node.js']);
    });
  });

  // 'balanced' output mode: these tests expect unrecognised and
  // underscore-prefixed keys, as well as null/undefined values, to be absent.
  describe('balanced mode', () => {
    it('should include only balanced fields from deterministic', () => {
      const deterministic: DeterministicFields = {
        word_count: 1000,
        page_count: 5,
        character_count: 5000,
        has_tables: true,
        _internal_field: 'should be excluded',
        rare_field: 'should be excluded in balanced'
      };

      const parsed = parseFrontmatter(
        FrontmatterGenerator.generate(baseOptions, deterministic, undefined, 'balanced')
      );

      expect(parsed.word_count).toBe(1000);
      expect(parsed.page_count).toBe(5);
      expect(parsed.character_count).toBe(5000);
      expect(parsed.has_tables).toBe(true);
      expect(parsed._internal_field).toBeUndefined();
      expect(parsed.rare_field).toBeUndefined();
    });

    it('should exclude null and undefined balanced fields', () => {
      const deterministic: DeterministicFields = {
        word_count: 1000,
        // Cast deliberately feeds a null through the typed interface.
        page_count: null as any,
        character_count: undefined
      };

      const parsed = parseFrontmatter(
        FrontmatterGenerator.generate(baseOptions, deterministic, undefined, 'balanced')
      );

      expect(parsed.word_count).toBe(1000);
      expect(parsed.page_count).toBeUndefined();
      expect(parsed.character_count).toBeUndefined();
    });
  });

  // 'complete' output mode: these tests expect arbitrary deterministic keys to
  // pass through, except underscore-prefixed ones.
  describe('complete mode', () => {
    it('should include all deterministic fields', () => {
      const deterministic: DeterministicFields = {
        word_count: 1000,
        page_count: 5,
        custom_field: 'included',
        nested: { deep: 'value' }
      };

      const parsed = parseFrontmatter(
        FrontmatterGenerator.generate(baseOptions, deterministic, undefined, 'complete')
      );

      expect(parsed.word_count).toBe(1000);
      expect(parsed.page_count).toBe(5);
      expect(parsed.custom_field).toBe('included');
      expect(parsed.nested).toEqual({ deep: 'value' });
    });

    it('should exclude fields starting with underscore', () => {
      const deterministic: DeterministicFields = {
        word_count: 1000,
        _private: 'excluded'
      };

      const parsed = parseFrontmatter(
        FrontmatterGenerator.generate(baseOptions, deterministic, undefined, 'complete')
      );

      expect(parsed.word_count).toBe(1000);
      expect(parsed._private).toBeUndefined();
    });

    it('should convert Date objects to ISO strings', () => {
      const testDate = new Date('2024-01-15T10:30:00Z');
      const deterministic: DeterministicFields = {
        modified: testDate
      };

      const parsed = parseFrontmatter(
        FrontmatterGenerator.generate(baseOptions, deterministic, undefined, 'complete')
      );

      // toBe against a string: the value must round-trip through YAML as a
      // string, not be revived as a Date by the parser.
      expect(parsed.modified).toBe('2024-01-15T10:30:00.000Z');
    });
  });

  // generateMarkdown(): frontmatter followed by the document body.
  describe('generateMarkdown()', () => {
    it('should prepend frontmatter to content', () => {
      const content = '# My Document\n\nContent here';

      const result = FrontmatterGenerator.generateMarkdown(
        baseOptions,
        { word_count: 100 },
        content
      );

      expect(result).toMatch(/^---\n/);
      expect(result).toContain('# My Document');
      expect(result).toContain('Content here');
    });

    it('should return only content for none mode', () => {
      const content = '# My Document\n\nContent here';

      const result = FrontmatterGenerator.generateMarkdown(
        baseOptions,
        { word_count: 100 },
        content,
        undefined,
        'none'
      );

      expect(result).toBe(content);
    });

    it('should separate frontmatter and content with double newline', () => {
      const content = '# My Document';

      const result = FrontmatterGenerator.generateMarkdown(
        baseOptions,
        {},
        content
      );

      expect(result).toMatch(/---\n\n#/);
    });
  });

  // One representative metadata payload per parser type, all run in
  // 'balanced' mode, checking that the listed fields reach the output.
  describe('parser-specific scenarios', () => {
    it('should handle PDF metadata', () => {
      const pdfDeterministic: DeterministicFields = {
        word_count: 5000,
        page_count: 20,
        character_count: 25000,
        has_tables: true,
        has_images: true,
        table_count: 5,
        image_count: 10,
        has_toc: true,
        has_forms: false,
        encrypted: false,
        author: 'John Doe'
      };

      const result = FrontmatterGenerator.generate(
        { ...baseOptions, generator: 'fss-parse-pdf', profile: 'technical' },
        pdfDeterministic,
        undefined,
        'balanced'
      );
      const parsed = parseFrontmatter(result);

      expect(parsed.word_count).toBe(5000);
      expect(parsed.page_count).toBe(20);
      expect(parsed.has_tables).toBe(true);
      expect(parsed.table_count).toBe(5);
      expect(parsed.author).toBe('John Doe');
    });

    it('should handle email metadata', () => {
      const emailDeterministic: DeterministicFields = {
        from: 'sender@example.com',
        to: 'recipient@example.com',
        date: '2024-01-15T10:30:00Z',
        message_id: '<abc123@example.com>',
        has_attachments: true,
        attachment_count: 3,
        word_count: 500,
        importance: 'high'
      };

      const result = FrontmatterGenerator.generate(
        { ...baseOptions, generator: 'fss-parse-email' },
        emailDeterministic,
        undefined,
        'balanced'
      );
      const parsed = parseFrontmatter(result);

      expect(parsed.from).toBe('sender@example.com');
      expect(parsed.to).toBe('recipient@example.com');
      expect(parsed.has_attachments).toBe(true);
      expect(parsed.attachment_count).toBe(3);
      expect(parsed.importance).toBe('high');
    });

    it('should handle audio metadata', () => {
      const audioDeterministic: DeterministicFields = {
        duration: 3600,
        duration_seconds: 3600,
        bitrate: 320,
        sample_rate: 44100,
        codec: 'mp3',
        has_transcript: true,
        speaker_count: 3,
        language: 'en'
      };

      const result = FrontmatterGenerator.generate(
        { ...baseOptions, generator: 'fss-parse-audio', profile: 'meeting' },
        audioDeterministic,
        undefined,
        'balanced'
      );
      const parsed = parseFrontmatter(result);

      expect(parsed.duration).toBe(3600);
      expect(parsed.bitrate).toBe(320);
      expect(parsed.has_transcript).toBe(true);
      expect(parsed.speaker_count).toBe(3);
    });

    it('should handle image metadata', () => {
      const imageDeterministic: DeterministicFields = {
        width: 1920,
        height: 1080,
        format: 'png',
        file_size: 2048000,
        channels: 4,
        has_alpha: true,
        ocr_confidence: 0.95
      };

      const result = FrontmatterGenerator.generate(
        { ...baseOptions, generator: 'fss-parse-image' },
        imageDeterministic,
        undefined,
        'balanced'
      );
      const parsed = parseFrontmatter(result);

      expect(parsed.width).toBe(1920);
      expect(parsed.height).toBe(1080);
      expect(parsed.format).toBe('png');
      expect(parsed.ocr_confidence).toBe(0.95);
    });

    it('should handle video metadata', () => {
      const videoDeterministic: DeterministicFields = {
        duration: 7200,
        width: 1920,
        height: 1080,
        fps: 30,
        aspect_ratio: '16:9',
        bitrate: 8000,
        video_codec: 'h264',
        audio_codec: 'aac'
      };

      const result = FrontmatterGenerator.generate(
        { ...baseOptions, generator: 'fss-parse-video', profile: 'meeting' },
        videoDeterministic,
        undefined,
        'balanced'
      );
      const parsed = parseFrontmatter(result);

      expect(parsed.duration).toBe(7200);
      expect(parsed.fps).toBe(30);
      expect(parsed.aspect_ratio).toBe('16:9');
    });

    it('should handle presentation metadata', () => {
      const presentationDeterministic: DeterministicFields = {
        slide_count: 25,
        total_slides: 25,
        word_count: 3000,
        has_images: true,
        image_count: 15,
        chart_count: 5,
        has_speaker_notes: true,
        author: 'Jane Smith'
      };

      const result = FrontmatterGenerator.generate(
        { ...baseOptions, generator: 'fss-parse-presentation', profile: 'technical' },
        presentationDeterministic,
        undefined,
        'balanced'
      );
      const parsed = parseFrontmatter(result);

      expect(parsed.slide_count).toBe(25);
      expect(parsed.chart_count).toBe(5);
      expect(parsed.has_speaker_notes).toBe(true);
    });

    it('should handle excel metadata', () => {
      const excelDeterministic: DeterministicFields = {
        sheet_count: 3,
        row_count: 1000,
        column_count: 20,
        author: 'Data Analyst'
      };

      const result = FrontmatterGenerator.generate(
        { ...baseOptions, generator: 'fss-parse-excel' },
        excelDeterministic,
        undefined,
        'balanced'
      );
      const parsed = parseFrontmatter(result);

      expect(parsed.sheet_count).toBe(3);
      expect(parsed.row_count).toBe(1000);
      expect(parsed.column_count).toBe(20);
    });

    it('should handle diagram metadata', () => {
      const diagramDeterministic: DeterministicFields = {
        diagram_count: 5,
        diagram_type: 'flowchart',
        valid_diagrams: 4,
        invalid_diagrams: 1,
        node_count: 20,
        edge_count: 25
      };

      const result = FrontmatterGenerator.generate(
        { ...baseOptions, generator: 'fss-parse-diagram', profile: 'schema' },
        diagramDeterministic,
        undefined,
        'balanced'
      );
      const parsed = parseFrontmatter(result);

      expect(parsed.diagram_count).toBe(5);
      expect(parsed.valid_diagrams).toBe(4);
      expect(parsed.node_count).toBe(20);
    });

    it('should handle data parser metadata', () => {
      const dataDeterministic: DeterministicFields = {
        record_count: 10000,
        format_detected: 'json',
        file_size: 5000000,
        column_count: 15
      };

      const result = FrontmatterGenerator.generate(
        { ...baseOptions, generator: 'fss-parse-data' },
        dataDeterministic,
        undefined,
        'balanced'
      );
      const parsed = parseFrontmatter(result);

      expect(parsed.record_count).toBe(10000);
      expect(parsed.format_detected).toBe('json');
    });
  });

  // Boundary inputs: empty objects, empty title, arrays, and falsy values
  // (0 / false) that must not be mistaken for "missing".
  describe('edge cases', () => {
    it('should handle empty deterministic object', () => {
      const result = FrontmatterGenerator.generate(baseOptions, {});

      expect(result).toMatch(/^---\n/);
      expect(result).toMatch(/\n---$/);
    });

    it('should handle missing title with default', () => {
      // An empty string stands in for a "missing" title here.
      const options: FrontmatterOptions = {
        generator: 'test',
        version: '1.0.0',
        title: ''
      };

      const parsed = parseFrontmatter(FrontmatterGenerator.generate(options));

      expect(parsed.title).toBe('Untitled');
    });

    it('should handle arrays in deterministic fields', () => {
      const deterministic: DeterministicFields = {
        recipients: ['a@test.com', 'b@test.com']
      };

      const parsed = parseFrontmatter(
        FrontmatterGenerator.generate(baseOptions, deterministic, undefined, 'complete')
      );

      expect(parsed.recipients).toEqual(['a@test.com', 'b@test.com']);
    });

    it('should handle custom extraction confidence and quality', () => {
      const options: FrontmatterOptions = {
        ...baseOptions,
        extractionConfidence: 0.85,
        contentQuality: 1.2
      };

      const parsed = parseFrontmatter(FrontmatterGenerator.generate(options));

      expect(parsed.extraction_confidence).toBe(0.85);
      expect(parsed.content_quality).toBe(1.2);
    });

    it('should handle zero values correctly', () => {
      const deterministic: DeterministicFields = {
        word_count: 0,
        page_count: 0
      };

      // No mode argument: exercises whatever output mode generate() defaults to.
      const parsed = parseFrontmatter(
        FrontmatterGenerator.generate(baseOptions, deterministic)
      );

      expect(parsed.word_count).toBe(0);
      expect(parsed.page_count).toBe(0);
    });

    it('should handle boolean false values', () => {
      const deterministic: DeterministicFields = {
        has_tables: false,
        encrypted: false
      };

      // No mode argument: exercises whatever output mode generate() defaults to.
      const parsed = parseFrontmatter(
        FrontmatterGenerator.generate(baseOptions, deterministic)
      );

      expect(parsed.has_tables).toBe(false);
      expect(parsed.encrypted).toBe(false);
    });
  });
});