// Jest-style unit tests for FrontmatterGenerator (526 lines, 18 KiB, TypeScript).
import {
|
|
FrontmatterGenerator,
|
|
FrontmatterOptions,
|
|
DeterministicFields,
|
|
LLMEnrichment,
|
|
OutputMode,
|
|
DEFAULTS
|
|
} from '../src';
|
|
import * as yaml from 'js-yaml';
|
|
|
|
describe('FrontmatterGenerator', () => {
|
|
// Minimal options fixture reused by the tests below; individual tests spread it
// and override single fields (profile, generator, sourcePath, ...).
const baseOptions: FrontmatterOptions = {
  generator: 'fss-parse-pdf',
  version: '1.0.0',
  title: 'Test Document'
};
|
|
|
|
// Covers FrontmatterGenerator.generate(): fence format, core fields, optional
// fields, and LLM enrichment (both the placeholder and the populated case).
describe('generate()', () => {
  /**
   * Strips the leading `---\n` and trailing `\n---` fences from generated
   * frontmatter and parses the remaining text as YAML.
   */
  const parseFrontmatter = (frontmatter: string): Record<string, unknown> => {
    const yamlContent = frontmatter.replace(/^---\n/, '').replace(/\n---$/, '');
    return yaml.load(yamlContent) as Record<string, unknown>;
  };

  it('should return empty string for none mode', () => {
    const result = FrontmatterGenerator.generate(baseOptions, {}, undefined, 'none');
    expect(result).toBe('');
  });

  it('should generate valid YAML frontmatter', () => {
    const result = FrontmatterGenerator.generate(baseOptions);
    expect(result).toMatch(/^---\n/);
    expect(result).toMatch(/\n---$/);

    // Extract and parse YAML; a parse failure throws and fails the test.
    expect(parseFrontmatter(result)).toBeDefined();
  });

  it('should include core required fields', () => {
    const parsed = parseFrontmatter(FrontmatterGenerator.generate(baseOptions));

    expect(parsed.profile).toBe(DEFAULTS.profile);
    expect(parsed.generator).toBe('fss-parse-pdf');
    expect(parsed.version).toBe('1.0.0');
    expect(parsed.title).toBe('Test Document');
    expect(parsed.extraction_confidence).toBe(DEFAULTS.extractionConfidence);
    expect(parsed.content_quality).toBe(DEFAULTS.contentQuality);
    expect(parsed.created).toBeDefined();
  });

  it('should include source_file when provided', () => {
    const options: FrontmatterOptions = {
      ...baseOptions,
      sourcePath: '/path/to/file.pdf'
    };
    const parsed = parseFrontmatter(FrontmatterGenerator.generate(options));

    // The `sourcePath` option is expected under the `source_file` key.
    expect(parsed.source_file).toBe('/path/to/file.pdf');
  });

  it('should use custom profile when provided', () => {
    const options: FrontmatterOptions = {
      ...baseOptions,
      profile: 'technical'
    };
    const parsed = parseFrontmatter(FrontmatterGenerator.generate(options));

    expect(parsed.profile).toBe('technical');
  });

  it('should add LLM enrichment placeholders when no enrichment provided', () => {
    const parsed = parseFrontmatter(FrontmatterGenerator.generate(baseOptions));

    expect(parsed.summary).toBe('');
    expect(parsed.tags).toEqual([]);
    expect(parsed.category).toBe('');
  });

  it('should include LLM enrichment fields when provided', () => {
    const enrichment: LLMEnrichment = {
      summary: 'Test summary',
      tags: ['tag1', 'tag2'],
      category: 'technical',
      audience: 'expert',
      doc_purpose: 'reference',
      complexity: 4,
      actionable: true,
      key_technologies: ['TypeScript', 'Node.js']
    };
    const parsed = parseFrontmatter(
      FrontmatterGenerator.generate(baseOptions, {}, enrichment)
    );

    expect(parsed.summary).toBe('Test summary');
    expect(parsed.tags).toEqual(['tag1', 'tag2']);
    expect(parsed.category).toBe('technical');
    expect(parsed.audience).toBe('expert');
    expect(parsed.doc_purpose).toBe('reference');
    expect(parsed.complexity).toBe(4);
    expect(parsed.actionable).toBe(true);
    expect(parsed.key_technologies).toEqual(['TypeScript', 'Node.js']);
  });
});
|
|
|
|
// Covers the 'balanced' output mode: only a subset of deterministic fields is
// expected in the output, and null/undefined values are expected to be dropped.
describe('balanced mode', () => {
  /**
   * Generates frontmatter in 'balanced' mode from `baseOptions` and the given
   * deterministic fields, strips the `---` fences, and parses the YAML.
   */
  const generateBalanced = (deterministic: DeterministicFields): Record<string, unknown> => {
    const result = FrontmatterGenerator.generate(baseOptions, deterministic, undefined, 'balanced');
    const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
    return yaml.load(yamlContent) as Record<string, unknown>;
  };

  it('should include only balanced fields from deterministic', () => {
    const parsed = generateBalanced({
      word_count: 1000,
      page_count: 5,
      character_count: 5000,
      has_tables: true,
      _internal_field: 'should be excluded',
      rare_field: 'should be excluded in balanced'
    });

    expect(parsed.word_count).toBe(1000);
    expect(parsed.page_count).toBe(5);
    expect(parsed.character_count).toBe(5000);
    expect(parsed.has_tables).toBe(true);
    expect(parsed._internal_field).toBeUndefined();
    expect(parsed.rare_field).toBeUndefined();
  });

  it('should exclude null and undefined balanced fields', () => {
    const parsed = generateBalanced({
      word_count: 1000,
      // Deliberately bypasses the declared field type to exercise null handling.
      page_count: null as any,
      character_count: undefined
    });

    expect(parsed.word_count).toBe(1000);
    expect(parsed.page_count).toBeUndefined();
    expect(parsed.character_count).toBeUndefined();
  });
});
|
|
|
|
// Covers the 'complete' output mode: arbitrary deterministic fields are
// expected to pass through, except keys that start with an underscore.
describe('complete mode', () => {
  /**
   * Generates frontmatter in 'complete' mode from `baseOptions` and the given
   * deterministic fields, strips the `---` fences, and parses the YAML.
   */
  const generateComplete = (deterministic: DeterministicFields): Record<string, unknown> => {
    const result = FrontmatterGenerator.generate(baseOptions, deterministic, undefined, 'complete');
    const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
    return yaml.load(yamlContent) as Record<string, unknown>;
  };

  it('should include all deterministic fields', () => {
    const parsed = generateComplete({
      word_count: 1000,
      page_count: 5,
      custom_field: 'included',
      nested: { deep: 'value' }
    });

    expect(parsed.word_count).toBe(1000);
    expect(parsed.page_count).toBe(5);
    expect(parsed.custom_field).toBe('included');
    expect(parsed.nested).toEqual({ deep: 'value' });
  });

  it('should exclude fields starting with underscore', () => {
    const parsed = generateComplete({
      word_count: 1000,
      _private: 'excluded'
    });

    expect(parsed.word_count).toBe(1000);
    expect(parsed._private).toBeUndefined();
  });

  it('should convert Date objects to ISO strings', () => {
    const testDate = new Date('2024-01-15T10:30:00Z');
    const parsed = generateComplete({ modified: testDate });

    // After the YAML round-trip the value must be the ISO string, not a Date.
    expect(parsed.modified).toBe('2024-01-15T10:30:00.000Z');
  });
});
|
|
|
|
// Covers FrontmatterGenerator.generateMarkdown(), which combines generated
// frontmatter with a markdown body.
describe('generateMarkdown()', () => {
  it('should prepend frontmatter to content', () => {
    const content = '# My Document\n\nContent here';
    const result = FrontmatterGenerator.generateMarkdown(
      baseOptions,
      { word_count: 100 },
      content
    );

    expect(result).toMatch(/^---\n/);
    expect(result).toContain('# My Document');
    expect(result).toContain('Content here');
  });

  it('should return only content for none mode', () => {
    const content = '# My Document\n\nContent here';
    const result = FrontmatterGenerator.generateMarkdown(
      baseOptions,
      { word_count: 100 },
      content,
      undefined,
      'none'
    );

    // With 'none' the body must come back untouched, with no fences or padding.
    expect(result).toBe(content);
  });

  it('should separate frontmatter and content with double newline', () => {
    const content = '# My Document';
    const result = FrontmatterGenerator.generateMarkdown(
      baseOptions,
      {},
      content
    );

    // Closing fence, one blank line, then the heading.
    expect(result).toMatch(/---\n\n#/);
  });
});
|
|
|
|
// Feeds representative deterministic metadata for each parser type through
// 'balanced' mode and checks that the asserted fields survive into the output.
describe('parser-specific scenarios', () => {
  /**
   * Generates 'balanced' frontmatter for `baseOptions` merged with `overrides`,
   * strips the `---` fences, and parses the YAML.
   */
  const generateBalanced = (
    overrides: Partial<FrontmatterOptions>,
    deterministic: DeterministicFields
  ): Record<string, unknown> => {
    const result = FrontmatterGenerator.generate(
      { ...baseOptions, ...overrides },
      deterministic,
      undefined,
      'balanced'
    );
    const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
    return yaml.load(yamlContent) as Record<string, unknown>;
  };

  it('should handle PDF metadata', () => {
    const pdfDeterministic: DeterministicFields = {
      word_count: 5000,
      page_count: 20,
      character_count: 25000,
      has_tables: true,
      has_images: true,
      table_count: 5,
      image_count: 10,
      has_toc: true,
      has_forms: false,
      encrypted: false,
      author: 'John Doe'
    };
    const parsed = generateBalanced(
      { generator: 'fss-parse-pdf', profile: 'technical' },
      pdfDeterministic
    );

    expect(parsed.word_count).toBe(5000);
    expect(parsed.page_count).toBe(20);
    expect(parsed.has_tables).toBe(true);
    expect(parsed.table_count).toBe(5);
    expect(parsed.author).toBe('John Doe');
  });

  it('should handle email metadata', () => {
    const emailDeterministic: DeterministicFields = {
      from: 'sender@example.com',
      to: 'recipient@example.com',
      date: '2024-01-15T10:30:00Z',
      message_id: '<abc123@example.com>',
      has_attachments: true,
      attachment_count: 3,
      word_count: 500,
      importance: 'high'
    };
    const parsed = generateBalanced({ generator: 'fss-parse-email' }, emailDeterministic);

    expect(parsed.from).toBe('sender@example.com');
    expect(parsed.to).toBe('recipient@example.com');
    expect(parsed.has_attachments).toBe(true);
    expect(parsed.attachment_count).toBe(3);
    expect(parsed.importance).toBe('high');
  });

  it('should handle audio metadata', () => {
    const audioDeterministic: DeterministicFields = {
      duration: 3600,
      duration_seconds: 3600,
      bitrate: 320,
      sample_rate: 44100,
      codec: 'mp3',
      has_transcript: true,
      speaker_count: 3,
      language: 'en'
    };
    const parsed = generateBalanced(
      { generator: 'fss-parse-audio', profile: 'meeting' },
      audioDeterministic
    );

    expect(parsed.duration).toBe(3600);
    expect(parsed.bitrate).toBe(320);
    expect(parsed.has_transcript).toBe(true);
    expect(parsed.speaker_count).toBe(3);
  });

  it('should handle image metadata', () => {
    const imageDeterministic: DeterministicFields = {
      width: 1920,
      height: 1080,
      format: 'png',
      file_size: 2048000,
      channels: 4,
      has_alpha: true,
      ocr_confidence: 0.95
    };
    const parsed = generateBalanced({ generator: 'fss-parse-image' }, imageDeterministic);

    expect(parsed.width).toBe(1920);
    expect(parsed.height).toBe(1080);
    expect(parsed.format).toBe('png');
    expect(parsed.ocr_confidence).toBe(0.95);
  });

  it('should handle video metadata', () => {
    const videoDeterministic: DeterministicFields = {
      duration: 7200,
      width: 1920,
      height: 1080,
      fps: 30,
      aspect_ratio: '16:9',
      bitrate: 8000,
      video_codec: 'h264',
      audio_codec: 'aac'
    };
    const parsed = generateBalanced(
      { generator: 'fss-parse-video', profile: 'meeting' },
      videoDeterministic
    );

    expect(parsed.duration).toBe(7200);
    expect(parsed.fps).toBe(30);
    expect(parsed.aspect_ratio).toBe('16:9');
  });

  it('should handle presentation metadata', () => {
    const presentationDeterministic: DeterministicFields = {
      slide_count: 25,
      total_slides: 25,
      word_count: 3000,
      has_images: true,
      image_count: 15,
      chart_count: 5,
      has_speaker_notes: true,
      author: 'Jane Smith'
    };
    const parsed = generateBalanced(
      { generator: 'fss-parse-presentation', profile: 'technical' },
      presentationDeterministic
    );

    expect(parsed.slide_count).toBe(25);
    expect(parsed.chart_count).toBe(5);
    expect(parsed.has_speaker_notes).toBe(true);
  });

  it('should handle excel metadata', () => {
    const excelDeterministic: DeterministicFields = {
      sheet_count: 3,
      row_count: 1000,
      column_count: 20,
      author: 'Data Analyst'
    };
    const parsed = generateBalanced({ generator: 'fss-parse-excel' }, excelDeterministic);

    expect(parsed.sheet_count).toBe(3);
    expect(parsed.row_count).toBe(1000);
    expect(parsed.column_count).toBe(20);
  });

  it('should handle diagram metadata', () => {
    const diagramDeterministic: DeterministicFields = {
      diagram_count: 5,
      diagram_type: 'flowchart',
      valid_diagrams: 4,
      invalid_diagrams: 1,
      node_count: 20,
      edge_count: 25
    };
    const parsed = generateBalanced(
      { generator: 'fss-parse-diagram', profile: 'schema' },
      diagramDeterministic
    );

    expect(parsed.diagram_count).toBe(5);
    expect(parsed.valid_diagrams).toBe(4);
    expect(parsed.node_count).toBe(20);
  });

  it('should handle data parser metadata', () => {
    const dataDeterministic: DeterministicFields = {
      record_count: 10000,
      format_detected: 'json',
      file_size: 5000000,
      column_count: 15
    };
    const parsed = generateBalanced({ generator: 'fss-parse-data' }, dataDeterministic);

    expect(parsed.record_count).toBe(10000);
    expect(parsed.format_detected).toBe('json');
  });
});
|
|
|
|
// Boundary inputs: empty objects, empty title, arrays, custom scores, and
// falsy-but-valid values (0 and false) that must not be dropped.
describe('edge cases', () => {
  /**
   * Strips the leading `---\n` and trailing `\n---` fences from generated
   * frontmatter and parses the remaining text as YAML.
   */
  const parseFrontmatter = (frontmatter: string): Record<string, unknown> => {
    const yamlContent = frontmatter.replace(/^---\n/, '').replace(/\n---$/, '');
    return yaml.load(yamlContent) as Record<string, unknown>;
  };

  it('should handle empty deterministic object', () => {
    const result = FrontmatterGenerator.generate(baseOptions, {});
    expect(result).toMatch(/^---\n/);
    expect(result).toMatch(/\n---$/);
  });

  it('should handle missing title with default', () => {
    // "Missing" here means an empty string; the expected fallback is 'Untitled'.
    const options: FrontmatterOptions = {
      generator: 'test',
      version: '1.0.0',
      title: ''
    };
    const parsed = parseFrontmatter(FrontmatterGenerator.generate(options));

    expect(parsed.title).toBe('Untitled');
  });

  it('should handle arrays in deterministic fields', () => {
    const deterministic: DeterministicFields = {
      recipients: ['a@test.com', 'b@test.com']
    };
    const parsed = parseFrontmatter(
      FrontmatterGenerator.generate(baseOptions, deterministic, undefined, 'complete')
    );

    expect(parsed.recipients).toEqual(['a@test.com', 'b@test.com']);
  });

  it('should handle custom extraction confidence and quality', () => {
    const options: FrontmatterOptions = {
      ...baseOptions,
      extractionConfidence: 0.85,
      contentQuality: 1.2
    };
    const parsed = parseFrontmatter(FrontmatterGenerator.generate(options));

    expect(parsed.extraction_confidence).toBe(0.85);
    expect(parsed.content_quality).toBe(1.2);
  });

  it('should handle zero values correctly', () => {
    const deterministic: DeterministicFields = {
      word_count: 0,
      page_count: 0
    };
    // No mode argument: exercises whatever output mode generate() defaults to.
    const parsed = parseFrontmatter(
      FrontmatterGenerator.generate(baseOptions, deterministic)
    );

    expect(parsed.word_count).toBe(0);
    expect(parsed.page_count).toBe(0);
  });

  it('should handle boolean false values', () => {
    const deterministic: DeterministicFields = {
      has_tables: false,
      encrypted: false
    };
    // No mode argument: exercises whatever output mode generate() defaults to.
    const parsed = parseFrontmatter(
      FrontmatterGenerator.generate(baseOptions, deterministic)
    );

    expect(parsed.has_tables).toBe(false);
    expect(parsed.encrypted).toBe(false);
  });
});
|
|
});
|