Initial commit: @bobai/frontmatter v1.1.0 - BOBAI Markdown Standard frontmatter generator
This commit is contained in:
commit
12193b2398
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
node_modules/
|
||||||
|
coverage/
|
||||||
|
*.log
|
||||||
|
.DS_Store
|
||||||
310
COMPLETION_SPEC.md
Normal file
310
COMPLETION_SPEC.md
Normal file
@ -0,0 +1,310 @@
|
|||||||
|
# @bobai/frontmatter - Completion Specification
|
||||||
|
|
||||||
|
## Package Overview
|
||||||
|
|
||||||
|
| Field | Value |
|
||||||
|
|-------|-------|
|
||||||
|
| Package Name | `@bobai/frontmatter` |
|
||||||
|
| Version | 1.1.0 |
|
||||||
|
| Standard | BOBAI Markdown Standard v1.1 |
|
||||||
|
| Language | TypeScript |
|
||||||
|
| Node.js | >= 18.0.0 |
|
||||||
|
| License | MIT |
|
||||||
|
|
||||||
|
## Implementation Status
|
||||||
|
|
||||||
|
### Core Features
|
||||||
|
|
||||||
|
| Feature | Status | Notes |
|
||||||
|
|---------|--------|-------|
|
||||||
|
| FrontmatterGenerator class | Complete | Static methods for generation |
|
||||||
|
| Output modes (none/balanced/complete) | Complete | All three modes implemented |
|
||||||
|
| YAML serialization | Complete | Uses js-yaml with proper formatting |
|
||||||
|
| Type definitions | Complete | Full TypeScript interfaces |
|
||||||
|
| Constants & defaults | Complete | Comprehensive coverage |
|
||||||
|
| LLM enrichment prompts | Complete | Prompt templates included |
|
||||||
|
| Parser profiles | Complete | All 10 parsers mapped |
|
||||||
|
|
||||||
|
### Test Coverage
|
||||||
|
|
||||||
|
| Test Suite | Tests | Status |
|
||||||
|
|------------|-------|--------|
|
||||||
|
| generator.test.ts | 35 | Passing |
|
||||||
|
| constants.test.ts | 16 | Passing |
|
||||||
|
| prompts.test.ts | 12 | Passing |
|
||||||
|
| **Total** | **63** | **All Passing** |
|
||||||
|
|
||||||
|
## File Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
bobai-frontmatter/
|
||||||
|
├── src/
|
||||||
|
│ ├── index.ts # Main exports (27 lines)
|
||||||
|
│ ├── generator.ts # FrontmatterGenerator class (123 lines)
|
||||||
|
│ ├── types.ts # TypeScript interfaces (47 lines)
|
||||||
|
│ ├── constants.ts # Enums, defaults, balanced fields (130 lines)
|
||||||
|
│ └── prompts.ts # LLM enrichment prompts (43 lines)
|
||||||
|
├── tests/
|
||||||
|
│ ├── generator.test.ts # Generator tests (470 lines)
|
||||||
|
│ ├── constants.test.ts # Constants tests (140 lines)
|
||||||
|
│ └── prompts.test.ts # Prompt tests (80 lines)
|
||||||
|
├── dist/ # Compiled JavaScript + type definitions
|
||||||
|
├── package.json # NPM configuration with Jest
|
||||||
|
├── tsconfig.json # TypeScript configuration
|
||||||
|
├── README.md # Comprehensive documentation
|
||||||
|
├── COMPLETION_SPEC.md # This document
|
||||||
|
└── IMPLEMENTATION_BLUEPRINT.md # Original blueprint
|
||||||
|
```
|
||||||
|
|
||||||
|
## Exports
|
||||||
|
|
||||||
|
### Types
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
export type OutputMode = 'none' | 'balanced' | 'complete';
|
||||||
|
export type AudienceLevel = 'all' | 'beginner' | 'intermediate' | 'expert';
|
||||||
|
export type DocPurpose = 'reference' | 'tutorial' | 'troubleshooting' | 'conceptual' | 'guide' | 'specification';
|
||||||
|
export type ProfileType = 'scraped' | 'research' | 'technical' | 'code' | 'data' | 'changelog' | 'legal' | 'test' | 'schema' | 'troubleshoot' | 'meeting' | 'faq' | 'config';
|
||||||
|
|
||||||
|
export interface FrontmatterOptions { ... }
|
||||||
|
export interface DeterministicFields { ... }
|
||||||
|
export interface LLMEnrichment { ... }
|
||||||
|
```
|
||||||
|
|
||||||
|
### Constants
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
export const AUDIENCE_VALUES: AudienceLevel[]; // 4 values
|
||||||
|
export const DOC_PURPOSE_VALUES: DocPurpose[]; // 6 values
|
||||||
|
export const PROFILE_VALUES: ProfileType[]; // 13 values
|
||||||
|
export const DEFAULTS: { ... }; // 5 defaults
|
||||||
|
export const BALANCED_FIELDS: string[]; // 70 fields
|
||||||
|
export const PARSER_PROFILES: Record<string, ProfileType>; // 10 parsers
|
||||||
|
```
|
||||||
|
|
||||||
|
### Functions
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
export class FrontmatterGenerator {
|
||||||
|
static generate(options, deterministic?, enrichment?, mode?): string;
|
||||||
|
static generateMarkdown(options, deterministic, content, enrichment?, mode?): string;
|
||||||
|
}
|
||||||
|
|
||||||
|
export function getEnrichmentPrompt(content: string, docType?: string): string;
|
||||||
|
export function getSamplePromptForDocType(docType: string): string;
|
||||||
|
```
|
||||||
|
|
||||||
|
## Parser Support Matrix
|
||||||
|
|
||||||
|
### Supported Parsers and Their Balanced Fields
|
||||||
|
|
||||||
|
| Parser | Profile | Key Balanced Fields |
|
||||||
|
|--------|---------|---------------------|
|
||||||
|
| fss-parse-pdf | technical | word_count, page_count, has_tables, has_images, has_toc, has_forms, encrypted, author |
|
||||||
|
| fss-parse-word | technical | word_count, page_count, paragraph_count, has_tracked_changes, has_toc, author |
|
||||||
|
| fss-parse-excel | data | sheet_count, row_count, column_count, author |
|
||||||
|
| fss-parse-image | data | width, height, format, channels, has_alpha, ocr_confidence, file_size |
|
||||||
|
| fss-parse-audio | meeting | duration, bitrate, sample_rate, codec, has_transcript, speaker_count, language |
|
||||||
|
| fss-parse-video | meeting | duration, width, height, fps, aspect_ratio, video_codec, audio_codec |
|
||||||
|
| fss-parse-email | data | from, to, cc, sender, recipients, date, message_id, has_attachments, attachment_count, importance |
|
||||||
|
| fss-parse-presentation | technical | slide_count, total_slides, word_count, chart_count, has_speaker_notes, has_images |
|
||||||
|
| fss-parse-data | data | record_count, format_detected, file_size, column_count |
|
||||||
|
| fss-parse-diagram | schema | diagram_count, diagram_type, valid_diagrams, invalid_diagrams, node_count, edge_count |
|
||||||
|
|
||||||
|
## BALANCED_FIELDS Complete List (70 fields)
|
||||||
|
|
||||||
|
### Universal Document (10)
|
||||||
|
- word_count, page_count, character_count, author, subject, creator, created, modified, file_size, format
|
||||||
|
|
||||||
|
### Structure Fields (10)
|
||||||
|
- has_tables, has_images, table_count, image_count, section_count, has_toc, has_forms, has_tracked_changes, paragraph_count, heading_count
|
||||||
|
|
||||||
|
### Excel/Data (5)
|
||||||
|
- sheet_count, row_count, column_count, record_count, format_detected
|
||||||
|
|
||||||
|
### Image (7)
|
||||||
|
- width, height, channels, has_alpha, color_space, ocr_confidence, has_exif
|
||||||
|
|
||||||
|
### Audio (8)
|
||||||
|
- duration, duration_seconds, bitrate, sample_rate, codec, has_transcript, speaker_count, language
|
||||||
|
|
||||||
|
### Video (5)
|
||||||
|
- fps, aspect_ratio, resolution, video_codec, audio_codec
|
||||||
|
|
||||||
|
### Presentation (5)
|
||||||
|
- slide_count, total_slides, chart_count, has_speaker_notes, has_animations
|
||||||
|
|
||||||
|
### Email (11)
|
||||||
|
- from, to, cc, sender, recipients, date, message_id, has_attachments, attachment_count, importance, thread_id
|
||||||
|
|
||||||
|
### Diagram (6)
|
||||||
|
- diagram_count, diagram_type, valid_diagrams, invalid_diagrams, node_count, edge_count
|
||||||
|
|
||||||
|
### Analysis (3)
|
||||||
|
- encrypted, complexity_score, reading_time_minutes
|
||||||
|
|
||||||
|
## Default Values
|
||||||
|
|
||||||
|
| Default | Value | Description |
|
||||||
|
|---------|-------|-------------|
|
||||||
|
| profile | 'data' | Default document profile |
|
||||||
|
| audience | 'all' | Default audience level |
|
||||||
|
| extractionConfidence | 1.0 | Default confidence (0.0-1.0) |
|
||||||
|
| contentQuality | 1.5 | Default quality score (0.0-2.0) |
|
||||||
|
| complexity | 3 | Default complexity (1-5) |
|
||||||
|
|
||||||
|
## Output Format
|
||||||
|
|
||||||
|
### Frontmatter Structure
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
---
|
||||||
|
# Core fields (always present)
|
||||||
|
profile: 'technical'
|
||||||
|
created: '2024-01-15T10:30:00.000Z'
|
||||||
|
generator: 'fss-parse-pdf'
|
||||||
|
version: '1.2.0'
|
||||||
|
title: 'Document Title'
|
||||||
|
extraction_confidence: 1
|
||||||
|
content_quality: 1.5
|
||||||
|
source_file: '/path/to/file.pdf'
|
||||||
|
|
||||||
|
# Deterministic fields (based on mode)
|
||||||
|
word_count: 5000
|
||||||
|
page_count: 25
|
||||||
|
has_tables: true
|
||||||
|
# ... more based on parser type
|
||||||
|
|
||||||
|
# LLM enrichment fields (or placeholders)
|
||||||
|
summary: 'Description of document...'
|
||||||
|
tags:
|
||||||
|
- tag1
|
||||||
|
- tag2
|
||||||
|
category: 'technical'
|
||||||
|
audience: 'intermediate'
|
||||||
|
doc_purpose: 'reference'
|
||||||
|
complexity: 3
|
||||||
|
actionable: false
|
||||||
|
key_technologies:
|
||||||
|
- TypeScript
|
||||||
|
- Node.js
|
||||||
|
---
|
||||||
|
```
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
### Production
|
||||||
|
- `js-yaml` ^4.1.0 - YAML serialization
|
||||||
|
|
||||||
|
### Development
|
||||||
|
- `typescript` ^5.0.0 - TypeScript compiler
|
||||||
|
- `jest` ^29.7.0 - Test runner
|
||||||
|
- `ts-jest` ^29.1.0 - Jest TypeScript transformer
|
||||||
|
- `@types/jest` ^29.5.0 - Jest type definitions
|
||||||
|
- `@types/js-yaml` ^4.0.9 - js-yaml type definitions
|
||||||
|
- `@types/node` ^20.0.0 - Node.js type definitions
|
||||||
|
|
||||||
|
## Usage Patterns
|
||||||
|
|
||||||
|
### Basic Usage
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
import { FrontmatterGenerator } from '@bobai/frontmatter';
|
||||||
|
|
||||||
|
const markdown = FrontmatterGenerator.generateMarkdown(
|
||||||
|
{ generator: 'fss-parse-pdf', version: '1.0.0', title: 'Doc' },
|
||||||
|
{ word_count: 1000, page_count: 5 },
|
||||||
|
'# Content here'
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
### With LLM Enrichment
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
import { FrontmatterGenerator, getEnrichmentPrompt, LLMEnrichment } from '@bobai/frontmatter';
|
||||||
|
|
||||||
|
const prompt = getEnrichmentPrompt(content, 'pdf');
|
||||||
|
const enrichment: LLMEnrichment = await getLLMResponse(prompt);
|
||||||
|
|
||||||
|
const markdown = FrontmatterGenerator.generateMarkdown(
|
||||||
|
options, deterministic, content, enrichment, 'balanced'
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
### Using Parser Profiles
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
import { PARSER_PROFILES } from '@bobai/frontmatter';
|
||||||
|
|
||||||
|
const profile = PARSER_PROFILES['fss-parse-audio']; // 'meeting'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Integration Requirements
|
||||||
|
|
||||||
|
### For Parsers to Use This Package
|
||||||
|
|
||||||
|
1. **Install**: `npm install ../packages/bobai-frontmatter`
|
||||||
|
2. **Import**: `import { FrontmatterGenerator, ... } from '@bobai/frontmatter';`
|
||||||
|
3. **Build**: Ensure bobai-frontmatter is built before parser build
|
||||||
|
|
||||||
|
### Package.json Dependency
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"dependencies": {
|
||||||
|
"@bobai/frontmatter": "file:../packages/bobai-frontmatter"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Quality Metrics
|
||||||
|
|
||||||
|
| Metric | Value |
|
||||||
|
|--------|-------|
|
||||||
|
| Total Lines of Code | ~370 (src) |
|
||||||
|
| Test Coverage | 63 tests |
|
||||||
|
| TypeScript Strict Mode | Yes |
|
||||||
|
| Zero Runtime Errors | Yes |
|
||||||
|
| Build Time | < 1s |
|
||||||
|
| Test Time | ~1s |
|
||||||
|
|
||||||
|
## Validation Checklist
|
||||||
|
|
||||||
|
- [x] All types properly exported
|
||||||
|
- [x] All constants properly exported
|
||||||
|
- [x] FrontmatterGenerator methods work correctly
|
||||||
|
- [x] YAML output is valid
|
||||||
|
- [x] All output modes function correctly
|
||||||
|
- [x] Balanced fields cover all parser types
|
||||||
|
- [x] Parser profiles are correct
|
||||||
|
- [x] LLM prompts generate correct structure
|
||||||
|
- [x] Tests pass with no warnings
|
||||||
|
- [x] TypeScript compiles with no errors
|
||||||
|
- [x] README documentation complete
|
||||||
|
- [x] Package.json properly configured
|
||||||
|
|
||||||
|
## Known Limitations
|
||||||
|
|
||||||
|
1. **No LLM client**: Package provides prompts but not LLM integration
|
||||||
|
2. **No file I/O**: Generates strings only; parsers handle file operations
|
||||||
|
3. **No validation**: Trusts parser-provided data
|
||||||
|
|
||||||
|
## Future Enhancements (Not Implemented)
|
||||||
|
|
||||||
|
1. LLM client integration (src/llm/ directory)
|
||||||
|
2. Schema validation for frontmatter
|
||||||
|
3. Custom field definitions per parser
|
||||||
|
4. Streaming generation for large documents
|
||||||
|
|
||||||
|
## Conclusion
|
||||||
|
|
||||||
|
The `@bobai/frontmatter` package is **complete and ready for integration** with all FSS parsers. It provides:
|
||||||
|
|
||||||
|
- Consistent BOBAI v1.1 standard frontmatter generation
|
||||||
|
- Support for all 10 parser types
|
||||||
|
- Three output modes for different use cases
|
||||||
|
- LLM enrichment prompt templates
|
||||||
|
- Comprehensive test coverage
|
||||||
|
- Full TypeScript type safety
|
||||||
|
|
||||||
|
Parsers can immediately begin using this package by installing it as a local dependency and importing the required exports.
|
||||||
506
IMPLEMENTATION_BLUEPRINT.md
Normal file
506
IMPLEMENTATION_BLUEPRINT.md
Normal file
@ -0,0 +1,506 @@
|
|||||||
|
# @bobai/frontmatter - Implementation Blueprint
|
||||||
|
|
||||||
|
**Package Location**: `/MASTERFOLDER/Tools/parsers/packages/bobai-frontmatter`
|
||||||
|
**Standard Reference**: `/MASTERFOLDER/KnowledgeBase/Standards/BOBAI_MARKDOWN_STANDARD_V1.1.md`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Purpose
|
||||||
|
|
||||||
|
Create a shared npm package that all FSS parsers import for consistent BOBAI v1.1 frontmatter generation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Files to Create
|
||||||
|
|
||||||
|
```
|
||||||
|
bobai-frontmatter/
|
||||||
|
├── src/
|
||||||
|
│ ├── index.ts # Main exports
|
||||||
|
│ ├── generator.ts # FrontmatterGenerator class
|
||||||
|
│ ├── types.ts # TypeScript interfaces
|
||||||
|
│ ├── constants.ts # Enums, defaults
|
||||||
|
│ └── prompts.ts # LLM enrichment prompt templates
|
||||||
|
├── package.json
|
||||||
|
├── tsconfig.json
|
||||||
|
└── README.md
|
||||||
|
```
|
||||||
|
|
||||||
|
Note: The `src/llm/` directory is reserved for a future enhancement (Issue #3).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. package.json
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"name": "@bobai/frontmatter",
|
||||||
|
"version": "1.1.0",
|
||||||
|
"description": "BOBAI Markdown Standard v1.1 frontmatter generator",
|
||||||
|
"main": "dist/index.js",
|
||||||
|
"types": "dist/index.d.ts",
|
||||||
|
"files": ["dist"],
|
||||||
|
"scripts": {
|
||||||
|
"build": "tsc",
|
||||||
|
"clean": "rm -rf dist",
|
||||||
|
"prepublishOnly": "npm run clean && npm run build"
|
||||||
|
},
|
||||||
|
"keywords": ["bobai", "frontmatter", "markdown", "yaml", "metadata"],
|
||||||
|
"author": "BobAI",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"js-yaml": "^4.1.0"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"@types/js-yaml": "^4.0.9",
|
||||||
|
"@types/node": "^20.0.0",
|
||||||
|
"typescript": "^5.0.0"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18.0.0"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. tsconfig.json
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"compilerOptions": {
|
||||||
|
"target": "ES2020",
|
||||||
|
"module": "commonjs",
|
||||||
|
"lib": ["ES2020"],
|
||||||
|
"declaration": true,
|
||||||
|
"strict": true,
|
||||||
|
"noImplicitAny": true,
|
||||||
|
"strictNullChecks": true,
|
||||||
|
"noImplicitThis": true,
|
||||||
|
"alwaysStrict": true,
|
||||||
|
"noUnusedLocals": false,
|
||||||
|
"noUnusedParameters": false,
|
||||||
|
"noImplicitReturns": true,
|
||||||
|
"noFallthroughCasesInSwitch": false,
|
||||||
|
"inlineSourceMap": true,
|
||||||
|
"inlineSources": true,
|
||||||
|
"experimentalDecorators": true,
|
||||||
|
"strictPropertyInitialization": false,
|
||||||
|
"outDir": "./dist",
|
||||||
|
"rootDir": "./src"
|
||||||
|
},
|
||||||
|
"exclude": ["node_modules", "dist"]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. src/types.ts
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
export type OutputMode = 'none' | 'balanced' | 'complete';
|
||||||
|
|
||||||
|
export type AudienceLevel = 'all' | 'beginner' | 'intermediate' | 'expert';
|
||||||
|
|
||||||
|
export type DocPurpose =
|
||||||
|
| 'reference'
|
||||||
|
| 'tutorial'
|
||||||
|
| 'troubleshooting'
|
||||||
|
| 'conceptual'
|
||||||
|
| 'guide'
|
||||||
|
| 'specification';
|
||||||
|
|
||||||
|
export type ProfileType =
|
||||||
|
| 'scraped'
|
||||||
|
| 'research'
|
||||||
|
| 'technical'
|
||||||
|
| 'code'
|
||||||
|
| 'data'
|
||||||
|
| 'changelog'
|
||||||
|
| 'legal'
|
||||||
|
| 'test'
|
||||||
|
| 'schema'
|
||||||
|
| 'troubleshoot'
|
||||||
|
| 'meeting'
|
||||||
|
| 'faq'
|
||||||
|
| 'config';
|
||||||
|
|
||||||
|
export interface FrontmatterOptions {
|
||||||
|
generator: string;
|
||||||
|
version: string;
|
||||||
|
title: string;
|
||||||
|
sourcePath?: string | null;
|
||||||
|
profile?: ProfileType;
|
||||||
|
extractionConfidence?: number;
|
||||||
|
contentQuality?: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface DeterministicFields {
|
||||||
|
word_count?: number;
|
||||||
|
page_count?: number;
|
||||||
|
character_count?: number;
|
||||||
|
[key: string]: any;
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface LLMEnrichment {
|
||||||
|
summary?: string;
|
||||||
|
tags?: string[];
|
||||||
|
category?: string;
|
||||||
|
audience?: AudienceLevel;
|
||||||
|
doc_purpose?: DocPurpose;
|
||||||
|
complexity?: number;
|
||||||
|
actionable?: boolean;
|
||||||
|
key_technologies?: string[];
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. src/constants.ts
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
import { AudienceLevel, DocPurpose, ProfileType } from './types';
|
||||||
|
|
||||||
|
export const AUDIENCE_VALUES: AudienceLevel[] = [
|
||||||
|
'all', 'beginner', 'intermediate', 'expert'
|
||||||
|
];
|
||||||
|
|
||||||
|
export const DOC_PURPOSE_VALUES: DocPurpose[] = [
|
||||||
|
'reference', 'tutorial', 'troubleshooting', 'conceptual', 'guide', 'specification'
|
||||||
|
];
|
||||||
|
|
||||||
|
export const PROFILE_VALUES: ProfileType[] = [
|
||||||
|
'scraped', 'research', 'technical', 'code', 'data', 'changelog',
|
||||||
|
'legal', 'test', 'schema', 'troubleshoot', 'meeting', 'faq', 'config'
|
||||||
|
];
|
||||||
|
|
||||||
|
export const DEFAULTS = {
|
||||||
|
profile: 'data' as ProfileType,
|
||||||
|
audience: 'all' as AudienceLevel,
|
||||||
|
extractionConfidence: 1.0,
|
||||||
|
contentQuality: 1.5,
|
||||||
|
complexity: 3
|
||||||
|
};
|
||||||
|
|
||||||
|
// Deterministic fields included in 'balanced' mode ('complete' mode is not limited to this list)
|
||||||
|
export const BALANCED_FIELDS = [
|
||||||
|
'word_count',
|
||||||
|
'page_count',
|
||||||
|
'has_tables',
|
||||||
|
'has_images',
|
||||||
|
'section_count',
|
||||||
|
'slide_count',
|
||||||
|
'duration_seconds',
|
||||||
|
'width',
|
||||||
|
'height',
|
||||||
|
'has_attachments',
|
||||||
|
'attachment_count',
|
||||||
|
'from',
|
||||||
|
'to',
|
||||||
|
'date',
|
||||||
|
'message_id',
|
||||||
|
'author',
|
||||||
|
'has_toc',
|
||||||
|
'has_tracked_changes',
|
||||||
|
'has_transcript',
|
||||||
|
'speaker_count'
|
||||||
|
];
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. src/prompts.ts
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
export function getEnrichmentPrompt(content: string, docType: string = 'markdown'): string {
|
||||||
|
return `You are a document analyst preparing metadata for a RAG search system.
|
||||||
|
Extract structured metadata to help users find and understand this document.
|
||||||
|
|
||||||
|
Respond with this exact JSON structure:
|
||||||
|
|
||||||
|
{
|
||||||
|
"summary": "2-3 sentences: What is this document about? What problem does it solve?",
|
||||||
|
"tags": ["5-10 SPECIFIC terms from this document for search - not generic words"],
|
||||||
|
"category": "technical | research | code | data | changelog | troubleshooting | reference | tutorial",
|
||||||
|
"audience": "all | beginner | intermediate | expert",
|
||||||
|
"doc_purpose": "reference | tutorial | troubleshooting | conceptual | guide | specification",
|
||||||
|
"complexity": 1-5,
|
||||||
|
"actionable": true or false,
|
||||||
|
"key_technologies": ["specific tools, languages, frameworks mentioned"]
|
||||||
|
}
|
||||||
|
|
||||||
|
Guidelines:
|
||||||
|
- tags: Extract SPECIFIC terms that appear in the document, not generic descriptions
|
||||||
|
- category: Pick the single best match
|
||||||
|
- audience: Default to "all" unless clearly targeted to specific skill level
|
||||||
|
- complexity: 1=overview, 2=beginner guide, 3=intermediate, 4=advanced, 5=deep implementation
|
||||||
|
- actionable: true if reader should DO something, false if just informational
|
||||||
|
- key_technologies: Only include specific named technologies, not generic terms
|
||||||
|
|
||||||
|
Document type: ${docType}
|
||||||
|
|
||||||
|
---
|
||||||
|
${content}
|
||||||
|
---
|
||||||
|
|
||||||
|
Respond with valid JSON only. No explanation or markdown formatting.`;
|
||||||
|
}
|
||||||
|
|
||||||
|
export function getSamplePromptForDocType(docType: string): string {
|
||||||
|
const samples: Record<string, string> = {
|
||||||
|
pdf: 'PDF document',
|
||||||
|
word: 'Word document',
|
||||||
|
email: 'Email message',
|
||||||
|
image: 'Image with OCR text',
|
||||||
|
audio: 'Audio transcript',
|
||||||
|
video: 'Video transcript',
|
||||||
|
presentation: 'Presentation slides',
|
||||||
|
excel: 'Spreadsheet data',
|
||||||
|
markdown: 'Markdown document'
|
||||||
|
};
|
||||||
|
return samples[docType] || 'document';
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. src/generator.ts
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
import * as yaml from 'js-yaml';
|
||||||
|
import {
|
||||||
|
FrontmatterOptions,
|
||||||
|
DeterministicFields,
|
||||||
|
LLMEnrichment,
|
||||||
|
OutputMode
|
||||||
|
} from './types';
|
||||||
|
import { DEFAULTS, BALANCED_FIELDS } from './constants';
|
||||||
|
|
||||||
|
export class FrontmatterGenerator {
|
||||||
|
|
||||||
|
static generate(
|
||||||
|
options: FrontmatterOptions,
|
||||||
|
deterministic: DeterministicFields = {},
|
||||||
|
enrichment?: LLMEnrichment,
|
||||||
|
mode: OutputMode = 'balanced'
|
||||||
|
): string {
|
||||||
|
|
||||||
|
if (mode === 'none') {
|
||||||
|
return '';
|
||||||
|
}
|
||||||
|
|
||||||
|
const frontmatter: Record<string, any> = {
|
||||||
|
// Core required fields
|
||||||
|
profile: options.profile || DEFAULTS.profile,
|
||||||
|
created: new Date().toISOString(),
|
||||||
|
generator: options.generator,
|
||||||
|
version: options.version,
|
||||||
|
title: options.title || 'Untitled',
|
||||||
|
extraction_confidence: options.extractionConfidence ?? DEFAULTS.extractionConfidence,
|
||||||
|
content_quality: options.contentQuality ?? DEFAULTS.contentQuality,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Source file
|
||||||
|
if (options.sourcePath) {
|
||||||
|
frontmatter.source_file = options.sourcePath;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add deterministic fields based on mode
|
||||||
|
if (mode === 'complete') {
|
||||||
|
// Include all deterministic fields
|
||||||
|
const cleaned = this.cleanObject(deterministic);
|
||||||
|
Object.assign(frontmatter, cleaned);
|
||||||
|
} else {
|
||||||
|
// Balanced mode - include only key fields
|
||||||
|
for (const field of BALANCED_FIELDS) {
|
||||||
|
if (deterministic[field] !== undefined && deterministic[field] !== null) {
|
||||||
|
frontmatter[field] = deterministic[field];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// LLM enrichment fields (flat, not nested)
|
||||||
|
if (enrichment) {
|
||||||
|
if (enrichment.summary) frontmatter.summary = enrichment.summary;
|
||||||
|
if (enrichment.tags?.length) frontmatter.tags = enrichment.tags;
|
||||||
|
if (enrichment.category) frontmatter.category = enrichment.category;
|
||||||
|
if (enrichment.audience) frontmatter.audience = enrichment.audience;
|
||||||
|
if (enrichment.doc_purpose) frontmatter.doc_purpose = enrichment.doc_purpose;
|
||||||
|
if (enrichment.complexity) frontmatter.complexity = enrichment.complexity;
|
||||||
|
if (enrichment.actionable !== undefined) frontmatter.actionable = enrichment.actionable;
|
||||||
|
if (enrichment.key_technologies?.length) {
|
||||||
|
frontmatter.key_technologies = enrichment.key_technologies;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Placeholders for LLM enrichment
|
||||||
|
frontmatter.summary = '';
|
||||||
|
frontmatter.tags = [];
|
||||||
|
frontmatter.category = '';
|
||||||
|
}
|
||||||
|
|
||||||
|
const yamlStr = yaml.dump(this.removeNulls(frontmatter), {
|
||||||
|
indent: 2,
|
||||||
|
lineWidth: -1,
|
||||||
|
quotingType: "'",
|
||||||
|
sortKeys: false
|
||||||
|
});
|
||||||
|
|
||||||
|
return `---\n${yamlStr}---`;
|
||||||
|
}
|
||||||
|
|
||||||
|
static generateMarkdown(
|
||||||
|
options: FrontmatterOptions,
|
||||||
|
deterministic: DeterministicFields,
|
||||||
|
content: string,
|
||||||
|
enrichment?: LLMEnrichment,
|
||||||
|
mode: OutputMode = 'balanced'
|
||||||
|
): string {
|
||||||
|
const fm = this.generate(options, deterministic, enrichment, mode);
|
||||||
|
if (!fm) return content;
|
||||||
|
return `${fm}\n\n${content}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static cleanObject(obj: any): any {
|
||||||
|
const result: any = {};
|
||||||
|
for (const [k, v] of Object.entries(obj)) {
|
||||||
|
if (k.startsWith('_')) continue;
|
||||||
|
if (v === null || v === undefined) continue;
|
||||||
|
if (v instanceof Date) {
|
||||||
|
result[k] = v.toISOString();
|
||||||
|
} else if (v && typeof v === 'object' && !Array.isArray(v)) {
|
||||||
|
result[k] = this.cleanObject(v);
|
||||||
|
} else {
|
||||||
|
result[k] = v;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static removeNulls(obj: any): any {
|
||||||
|
if (Array.isArray(obj)) {
|
||||||
|
return obj.filter(x => x != null).map(x => this.removeNulls(x));
|
||||||
|
}
|
||||||
|
if (obj && typeof obj === 'object') {
|
||||||
|
const result: any = {};
|
||||||
|
for (const [k, v] of Object.entries(obj)) {
|
||||||
|
if (v != null) result[k] = this.removeNulls(v);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
return obj;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. src/index.ts
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
// Types
|
||||||
|
export {
|
||||||
|
OutputMode,
|
||||||
|
AudienceLevel,
|
||||||
|
DocPurpose,
|
||||||
|
ProfileType,
|
||||||
|
FrontmatterOptions,
|
||||||
|
DeterministicFields,
|
||||||
|
LLMEnrichment
|
||||||
|
} from './types';
|
||||||
|
|
||||||
|
// Constants
|
||||||
|
export {
|
||||||
|
AUDIENCE_VALUES,
|
||||||
|
DOC_PURPOSE_VALUES,
|
||||||
|
PROFILE_VALUES,
|
||||||
|
DEFAULTS,
|
||||||
|
BALANCED_FIELDS
|
||||||
|
} from './constants';
|
||||||
|
|
||||||
|
// Generator
|
||||||
|
export { FrontmatterGenerator } from './generator';
|
||||||
|
|
||||||
|
// Prompts
|
||||||
|
export { getEnrichmentPrompt, getSamplePromptForDocType } from './prompts';
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. README.md
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
# @bobai/frontmatter
|
||||||
|
|
||||||
|
BOBAI Markdown Standard v1.1 frontmatter generator for FSS parsers.
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm install @bobai/frontmatter
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
import {
|
||||||
|
FrontmatterGenerator,
|
||||||
|
getEnrichmentPrompt,
|
||||||
|
OutputMode,
|
||||||
|
LLMEnrichment
|
||||||
|
} from '@bobai/frontmatter';
|
||||||
|
|
||||||
|
// Generate frontmatter
|
||||||
|
const markdown = FrontmatterGenerator.generateMarkdown(
|
||||||
|
{
|
||||||
|
generator: 'fss-parse-pdf',
|
||||||
|
version: '1.2.0',
|
||||||
|
title: 'My Document',
|
||||||
|
sourcePath: '/path/to/file.pdf'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
word_count: 1234,
|
||||||
|
page_count: 8,
|
||||||
|
has_tables: true
|
||||||
|
},
|
||||||
|
content,
|
||||||
|
enrichment, // LLMEnrichment or undefined
|
||||||
|
'balanced' // OutputMode: 'none' | 'balanced' | 'complete'
|
||||||
|
);
|
||||||
|
|
||||||
|
// Get LLM enrichment prompt
|
||||||
|
const prompt = getEnrichmentPrompt(content, 'pdf');
|
||||||
|
// Send to your LLM and parse response as LLMEnrichment
|
||||||
|
```
|
||||||
|
|
||||||
|
## Output Modes
|
||||||
|
|
||||||
|
- `none` - No frontmatter, just content
|
||||||
|
- `balanced` (default) - Core fields + key deterministic fields + LLM enrichment
|
||||||
|
- `complete` - All fields including full metadata
|
||||||
|
|
||||||
|
## Reference
|
||||||
|
|
||||||
|
See BOBAI Markdown Standard v1.1 for field definitions.
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
After implementation:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /MASTERFOLDER/Tools/parsers/packages/bobai-frontmatter
|
||||||
|
npm install
|
||||||
|
npm run build
|
||||||
|
```
|
||||||
|
|
||||||
|
Test by importing in a parser or creating a simple test script.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Next Steps After Package Complete
|
||||||
|
|
||||||
|
1. Update each parser to import from this package
|
||||||
|
2. Remove duplicate frontmatter code from parsers
|
||||||
|
3. Consider publishing to npm registry (later)
|
||||||
354
README.md
Normal file
354
README.md
Normal file
@ -0,0 +1,354 @@
|
|||||||
|
# @bobai/frontmatter
|
||||||
|
|
||||||
|
BOBAI Markdown Standard v1.1 frontmatter generator for FSS parsers. Provides consistent, standardized frontmatter generation across all parser types.
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# From local path (recommended for FSS parsers)
|
||||||
|
npm install ../packages/bobai-frontmatter
|
||||||
|
|
||||||
|
# Or link globally
|
||||||
|
cd /MASTERFOLDER/Tools/parsers/packages/bobai-frontmatter
|
||||||
|
npm link
|
||||||
|
# Then in your parser:
|
||||||
|
npm link @bobai/frontmatter
|
||||||
|
```
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
import {
|
||||||
|
FrontmatterGenerator,
|
||||||
|
getEnrichmentPrompt,
|
||||||
|
PARSER_PROFILES,
|
||||||
|
LLMEnrichment
|
||||||
|
} from '@bobai/frontmatter';
|
||||||
|
|
||||||
|
// Generate markdown with frontmatter
|
||||||
|
const markdown = FrontmatterGenerator.generateMarkdown(
|
||||||
|
{
|
||||||
|
generator: 'fss-parse-pdf',
|
||||||
|
version: '1.2.0',
|
||||||
|
title: 'My Document',
|
||||||
|
sourcePath: '/path/to/file.pdf',
|
||||||
|
profile: PARSER_PROFILES['fss-parse-pdf'] // 'technical'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
word_count: 1234,
|
||||||
|
page_count: 8,
|
||||||
|
has_tables: true,
|
||||||
|
has_images: false
|
||||||
|
},
|
||||||
|
content, // Markdown content string
|
||||||
|
undefined, // LLMEnrichment or undefined
|
||||||
|
'balanced' // OutputMode
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
## API Reference
|
||||||
|
|
||||||
|
### FrontmatterGenerator
|
||||||
|
|
||||||
|
#### `generate(options, deterministic?, enrichment?, mode?)`
|
||||||
|
|
||||||
|
Generate frontmatter YAML block only.
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
const frontmatter = FrontmatterGenerator.generate(
|
||||||
|
options: FrontmatterOptions,
|
||||||
|
deterministic?: DeterministicFields,
|
||||||
|
enrichment?: LLMEnrichment,
|
||||||
|
mode?: OutputMode // 'none' | 'balanced' | 'complete'
|
||||||
|
): string;
|
||||||
|
```
|
||||||
|
|
||||||
|
#### `generateMarkdown(options, deterministic, content, enrichment?, mode?)`
|
||||||
|
|
||||||
|
Generate complete markdown with frontmatter prepended.
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
const markdown = FrontmatterGenerator.generateMarkdown(
|
||||||
|
options: FrontmatterOptions,
|
||||||
|
deterministic: DeterministicFields,
|
||||||
|
content: string,
|
||||||
|
enrichment?: LLMEnrichment,
|
||||||
|
mode?: OutputMode
|
||||||
|
): string;
|
||||||
|
```
|
||||||
|
|
||||||
|
### Types
|
||||||
|
|
||||||
|
#### FrontmatterOptions
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
interface FrontmatterOptions {
|
||||||
|
generator: string; // e.g., 'fss-parse-pdf'
|
||||||
|
version: string; // e.g., '1.2.0'
|
||||||
|
title: string; // Document title
|
||||||
|
sourcePath?: string | null; // Original file path
|
||||||
|
profile?: ProfileType; // Document profile
|
||||||
|
extractionConfidence?: number; // 0.0-1.0
|
||||||
|
contentQuality?: number; // 0.0-2.0
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### DeterministicFields
|
||||||
|
|
||||||
|
Parser-extracted metadata. Any fields can be included:
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
interface DeterministicFields {
|
||||||
|
word_count?: number;
|
||||||
|
page_count?: number;
|
||||||
|
character_count?: number;
|
||||||
|
[key: string]: any; // Parser-specific fields
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### LLMEnrichment
|
||||||
|
|
||||||
|
AI-generated metadata fields:
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
interface LLMEnrichment {
|
||||||
|
summary?: string;
|
||||||
|
tags?: string[];
|
||||||
|
category?: string;
|
||||||
|
audience?: 'all' | 'beginner' | 'intermediate' | 'expert';
|
||||||
|
doc_purpose?: 'reference' | 'tutorial' | 'troubleshooting' | 'conceptual' | 'guide' | 'specification';
|
||||||
|
complexity?: number; // 1-5
|
||||||
|
actionable?: boolean;
|
||||||
|
key_technologies?: string[];
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Output Modes
|
||||||
|
|
||||||
|
### `none`
|
||||||
|
`generate()` returns an empty string, so no frontmatter is written and the output is the content only.
|
||||||
|
|
||||||
|
### `balanced` (default)
|
||||||
|
Includes:
|
||||||
|
- Core required fields (profile, created, generator, version, title, etc.)
|
||||||
|
- Key deterministic fields from BALANCED_FIELDS list
|
||||||
|
- LLM enrichment fields (or placeholders)
|
||||||
|
|
||||||
|
Best for RAG indexing and search.
|
||||||
|
|
||||||
|
### `complete`
|
||||||
|
Includes all fields from the `deterministic` object, plus the core and enrichment fields.
|
||||||
|
Use for archival or when full metadata is needed.
|
||||||
|
|
||||||
|
## Parser Profiles
|
||||||
|
|
||||||
|
Default profiles for each parser type:
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
import { PARSER_PROFILES } from '@bobai/frontmatter';
|
||||||
|
|
||||||
|
PARSER_PROFILES['fss-parse-pdf'] // 'technical'
|
||||||
|
PARSER_PROFILES['fss-parse-word'] // 'technical'
|
||||||
|
PARSER_PROFILES['fss-parse-excel'] // 'data'
|
||||||
|
PARSER_PROFILES['fss-parse-image'] // 'data'
|
||||||
|
PARSER_PROFILES['fss-parse-audio'] // 'meeting'
|
||||||
|
PARSER_PROFILES['fss-parse-video'] // 'meeting'
|
||||||
|
PARSER_PROFILES['fss-parse-email'] // 'data'
|
||||||
|
PARSER_PROFILES['fss-parse-presentation'] // 'technical'
|
||||||
|
PARSER_PROFILES['fss-parse-data'] // 'data'
|
||||||
|
PARSER_PROFILES['fss-parse-diagram'] // 'schema'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Balanced Fields by Parser Type
|
||||||
|
|
||||||
|
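The `BALANCED_FIELDS` list contains 70 fields covering all parser types. Besides the groups below, it also includes three analysis fields: `encrypted`, `complexity_score`, `reading_time_minutes`.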
|
||||||
|
|
||||||
|
### Universal
|
||||||
|
`word_count`, `page_count`, `character_count`, `author`, `subject`, `creator`, `created`, `modified`, `file_size`, `format`
|
||||||
|
|
||||||
|
### PDF/Word Structure
|
||||||
|
`has_tables`, `has_images`, `table_count`, `image_count`, `section_count`, `has_toc`, `has_forms`, `has_tracked_changes`, `paragraph_count`, `heading_count`
|
||||||
|
|
||||||
|
### Excel/Data
|
||||||
|
`sheet_count`, `row_count`, `column_count`, `record_count`, `format_detected`
|
||||||
|
|
||||||
|
### Image
|
||||||
|
`width`, `height`, `channels`, `has_alpha`, `color_space`, `ocr_confidence`, `has_exif`
|
||||||
|
|
||||||
|
### Audio
|
||||||
|
`duration`, `duration_seconds`, `bitrate`, `sample_rate`, `codec`, `has_transcript`, `speaker_count`, `language`
|
||||||
|
|
||||||
|
### Video
|
||||||
|
`fps`, `aspect_ratio`, `resolution`, `video_codec`, `audio_codec`
|
||||||
|
|
||||||
|
### Presentation
|
||||||
|
`slide_count`, `total_slides`, `chart_count`, `has_speaker_notes`, `has_animations`
|
||||||
|
|
||||||
|
### Email
|
||||||
|
`from`, `to`, `cc`, `sender`, `recipients`, `date`, `message_id`, `has_attachments`, `attachment_count`, `importance`, `thread_id`
|
||||||
|
|
||||||
|
### Diagram
|
||||||
|
`diagram_count`, `diagram_type`, `valid_diagrams`, `invalid_diagrams`, `node_count`, `edge_count`
|
||||||
|
|
||||||
|
## LLM Enrichment
|
||||||
|
|
||||||
|
### Getting the Prompt
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
import { getEnrichmentPrompt, getSamplePromptForDocType } from '@bobai/frontmatter';
|
||||||
|
|
||||||
|
// Get prompt for LLM enrichment
|
||||||
|
const prompt = getEnrichmentPrompt(content, 'pdf');
|
||||||
|
|
||||||
|
// Send to your LLM...
|
||||||
|
const response = await llm.generate(prompt);
|
||||||
|
const enrichment: LLMEnrichment = JSON.parse(response);
|
||||||
|
|
||||||
|
// Use in frontmatter generation
|
||||||
|
const markdown = FrontmatterGenerator.generateMarkdown(
|
||||||
|
options,
|
||||||
|
deterministic,
|
||||||
|
content,
|
||||||
|
enrichment,
|
||||||
|
'balanced'
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
### Prompt Output Format
|
||||||
|
|
||||||
|
The LLM will return JSON matching the LLMEnrichment interface:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"summary": "2-3 sentence description",
|
||||||
|
"tags": ["specific", "search", "terms"],
|
||||||
|
"category": "technical",
|
||||||
|
"audience": "intermediate",
|
||||||
|
"doc_purpose": "reference",
|
||||||
|
"complexity": 3,
|
||||||
|
"actionable": false,
|
||||||
|
"key_technologies": ["TypeScript", "Node.js"]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Parser Integration Example
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
// In your parser (e.g., pdf-ts/src/pdf-parser.ts)
|
||||||
|
import {
|
||||||
|
FrontmatterGenerator,
|
||||||
|
PARSER_PROFILES,
|
||||||
|
FrontmatterOptions,
|
||||||
|
DeterministicFields
|
||||||
|
} from '@bobai/frontmatter';
|
||||||
|
import { version } from '../package.json';
|
||||||
|
|
||||||
|
export function generateOutput(
|
||||||
|
content: string,
|
||||||
|
metadata: ParsedMetadata,
|
||||||
|
sourcePath: string,
|
||||||
|
mode: 'none' | 'balanced' | 'complete' = 'balanced'
|
||||||
|
): string {
|
||||||
|
const options: FrontmatterOptions = {
|
||||||
|
generator: 'fss-parse-pdf',
|
||||||
|
version,
|
||||||
|
title: metadata.title || 'Untitled',
|
||||||
|
sourcePath,
|
||||||
|
profile: PARSER_PROFILES['fss-parse-pdf'],
|
||||||
|
extractionConfidence: metadata.confidence,
|
||||||
|
contentQuality: calculateQuality(metadata)
|
||||||
|
};
|
||||||
|
|
||||||
|
const deterministic: DeterministicFields = {
|
||||||
|
word_count: metadata.wordCount,
|
||||||
|
page_count: metadata.pageCount,
|
||||||
|
character_count: metadata.characterCount,
|
||||||
|
has_tables: metadata.hasTables,
|
||||||
|
has_images: metadata.hasImages,
|
||||||
|
table_count: metadata.tableCount,
|
||||||
|
image_count: metadata.imageCount,
|
||||||
|
author: metadata.author,
|
||||||
|
created: metadata.creationDate,
|
||||||
|
modified: metadata.modificationDate,
|
||||||
|
encrypted: metadata.isEncrypted
|
||||||
|
};
|
||||||
|
|
||||||
|
return FrontmatterGenerator.generateMarkdown(
|
||||||
|
options,
|
||||||
|
deterministic,
|
||||||
|
content,
|
||||||
|
undefined, // No LLM enrichment
|
||||||
|
mode
|
||||||
|
);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Constants & Defaults
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
import {
|
||||||
|
DEFAULTS,
|
||||||
|
AUDIENCE_VALUES,
|
||||||
|
DOC_PURPOSE_VALUES,
|
||||||
|
PROFILE_VALUES,
|
||||||
|
BALANCED_FIELDS
|
||||||
|
} from '@bobai/frontmatter';
|
||||||
|
|
||||||
|
// Default values
|
||||||
|
DEFAULTS.profile // 'data'
|
||||||
|
DEFAULTS.audience // 'all'
|
||||||
|
DEFAULTS.extractionConfidence // 1.0
|
||||||
|
DEFAULTS.contentQuality // 1.5
|
||||||
|
DEFAULTS.complexity // 3
|
||||||
|
|
||||||
|
// Valid values for validation
|
||||||
|
AUDIENCE_VALUES // ['all', 'beginner', 'intermediate', 'expert']
|
||||||
|
DOC_PURPOSE_VALUES // ['reference', 'tutorial', ...]
|
||||||
|
PROFILE_VALUES // ['scraped', 'research', 'technical', ...]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm test # Run all tests
|
||||||
|
npm run test:watch # Watch mode
|
||||||
|
npm run test:coverage # Coverage report
|
||||||
|
```
|
||||||
|
|
||||||
|
## Building
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm run build # Compile TypeScript to dist/
|
||||||
|
npm run clean # Remove dist/
|
||||||
|
```
|
||||||
|
|
||||||
|
## Output Example
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
---
|
||||||
|
profile: 'technical'
|
||||||
|
created: '2024-01-15T10:30:00.000Z'
|
||||||
|
generator: 'fss-parse-pdf'
|
||||||
|
version: '1.2.0'
|
||||||
|
title: 'API Documentation'
|
||||||
|
extraction_confidence: 1
|
||||||
|
content_quality: 1.5
|
||||||
|
source_file: '/docs/api.pdf'
|
||||||
|
word_count: 5000
|
||||||
|
page_count: 25
|
||||||
|
has_tables: true
|
||||||
|
has_images: true
|
||||||
|
author: 'Development Team'
|
||||||
|
summary: ''
|
||||||
|
tags: []
|
||||||
|
category: ''
|
||||||
|
---
|
||||||
|
|
||||||
|
# API Documentation
|
||||||
|
|
||||||
|
Content starts here...
|
||||||
|
```
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
MIT
|
||||||
13
dist/constants.d.ts
vendored
Normal file
13
dist/constants.d.ts
vendored
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
import { AudienceLevel, DocPurpose, ProfileType } from './types';
|
||||||
|
export declare const AUDIENCE_VALUES: AudienceLevel[];
|
||||||
|
export declare const DOC_PURPOSE_VALUES: DocPurpose[];
|
||||||
|
export declare const PROFILE_VALUES: ProfileType[];
|
||||||
|
export declare const DEFAULTS: {
|
||||||
|
profile: ProfileType;
|
||||||
|
audience: AudienceLevel;
|
||||||
|
extractionConfidence: number;
|
||||||
|
contentQuality: number;
|
||||||
|
complexity: number;
|
||||||
|
};
|
||||||
|
export declare const PARSER_PROFILES: Record<string, ProfileType>;
|
||||||
|
export declare const BALANCED_FIELDS: string[];
|
||||||
118
dist/constants.js
vendored
Normal file
118
dist/constants.js
vendored
Normal file
File diff suppressed because one or more lines are too long
7
dist/generator.d.ts
vendored
Normal file
7
dist/generator.d.ts
vendored
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
import { FrontmatterOptions, DeterministicFields, LLMEnrichment, OutputMode } from './types';
|
||||||
|
export declare class FrontmatterGenerator {
|
||||||
|
static generate(options: FrontmatterOptions, deterministic?: DeterministicFields, enrichment?: LLMEnrichment, mode?: OutputMode): string;
|
||||||
|
static generateMarkdown(options: FrontmatterOptions, deterministic: DeterministicFields, content: string, enrichment?: LLMEnrichment, mode?: OutputMode): string;
|
||||||
|
private static cleanObject;
|
||||||
|
private static removeNulls;
|
||||||
|
}
|
||||||
114
dist/generator.js
vendored
Normal file
114
dist/generator.js
vendored
Normal file
File diff suppressed because one or more lines are too long
4
dist/index.d.ts
vendored
Normal file
4
dist/index.d.ts
vendored
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
export { OutputMode, AudienceLevel, DocPurpose, ProfileType, FrontmatterOptions, DeterministicFields, LLMEnrichment } from './types';
|
||||||
|
export { AUDIENCE_VALUES, DOC_PURPOSE_VALUES, PROFILE_VALUES, DEFAULTS, BALANCED_FIELDS, PARSER_PROFILES } from './constants';
|
||||||
|
export { FrontmatterGenerator } from './generator';
|
||||||
|
export { getEnrichmentPrompt, getSamplePromptForDocType } from './prompts';
|
||||||
19
dist/index.js
vendored
Normal file
19
dist/index.js
vendored
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
"use strict";
|
||||||
|
Object.defineProperty(exports, "__esModule", { value: true });
|
||||||
|
exports.getSamplePromptForDocType = exports.getEnrichmentPrompt = exports.FrontmatterGenerator = exports.PARSER_PROFILES = exports.BALANCED_FIELDS = exports.DEFAULTS = exports.PROFILE_VALUES = exports.DOC_PURPOSE_VALUES = exports.AUDIENCE_VALUES = void 0;
|
||||||
|
// Constants
|
||||||
|
var constants_1 = require("./constants");
|
||||||
|
Object.defineProperty(exports, "AUDIENCE_VALUES", { enumerable: true, get: function () { return constants_1.AUDIENCE_VALUES; } });
|
||||||
|
Object.defineProperty(exports, "DOC_PURPOSE_VALUES", { enumerable: true, get: function () { return constants_1.DOC_PURPOSE_VALUES; } });
|
||||||
|
Object.defineProperty(exports, "PROFILE_VALUES", { enumerable: true, get: function () { return constants_1.PROFILE_VALUES; } });
|
||||||
|
Object.defineProperty(exports, "DEFAULTS", { enumerable: true, get: function () { return constants_1.DEFAULTS; } });
|
||||||
|
Object.defineProperty(exports, "BALANCED_FIELDS", { enumerable: true, get: function () { return constants_1.BALANCED_FIELDS; } });
|
||||||
|
Object.defineProperty(exports, "PARSER_PROFILES", { enumerable: true, get: function () { return constants_1.PARSER_PROFILES; } });
|
||||||
|
// Generator
|
||||||
|
var generator_1 = require("./generator");
|
||||||
|
Object.defineProperty(exports, "FrontmatterGenerator", { enumerable: true, get: function () { return generator_1.FrontmatterGenerator; } });
|
||||||
|
// Prompts
|
||||||
|
var prompts_1 = require("./prompts");
|
||||||
|
Object.defineProperty(exports, "getEnrichmentPrompt", { enumerable: true, get: function () { return prompts_1.getEnrichmentPrompt; } });
|
||||||
|
Object.defineProperty(exports, "getSamplePromptForDocType", { enumerable: true, get: function () { return prompts_1.getSamplePromptForDocType; } });
|
||||||
|
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiaW5kZXguanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi9zcmMvaW5kZXgudHMiXSwibmFtZXMiOltdLCJtYXBwaW5ncyI6Ijs7O0FBV0EsWUFBWTtBQUNaLHlDQU9xQjtBQU5uQiw0R0FBQSxlQUFlLE9BQUE7QUFDZiwrR0FBQSxrQkFBa0IsT0FBQTtBQUNsQiwyR0FBQSxjQUFjLE9BQUE7QUFDZCxxR0FBQSxRQUFRLE9BQUE7QUFDUiw0R0FBQSxlQUFlLE9BQUE7QUFDZiw0R0FBQSxlQUFlLE9BQUE7QUFHakIsWUFBWTtBQUNaLHlDQUFtRDtBQUExQyxpSEFBQSxvQkFBb0IsT0FBQTtBQUU3QixVQUFVO0FBQ1YscUNBQTJFO0FBQWxFLDhHQUFBLG1CQUFtQixPQUFBO0FBQUUsb0hBQUEseUJBQXlCLE9BQUEiLCJzb3VyY2VzQ29udGVudCI6WyIvLyBUeXBlc1xuZXhwb3J0IHtcbiAgT3V0cHV0TW9kZSxcbiAgQXVkaWVuY2VMZXZlbCxcbiAgRG9jUHVycG9zZSxcbiAgUHJvZmlsZVR5cGUsXG4gIEZyb250bWF0dGVyT3B0aW9ucyxcbiAgRGV0ZXJtaW5pc3RpY0ZpZWxkcyxcbiAgTExNRW5yaWNobWVudFxufSBmcm9tICcuL3R5cGVzJztcblxuLy8gQ29uc3RhbnRzXG5leHBvcnQge1xuICBBVURJRU5DRV9WQUxVRVMsXG4gIERPQ19QVVJQT1NFX1ZBTFVFUyxcbiAgUFJPRklMRV9WQUxVRVMsXG4gIERFRkFVTFRTLFxuICBCQUxBTkNFRF9GSUVMRFMsXG4gIFBBUlNFUl9QUk9GSUxFU1xufSBmcm9tICcuL2NvbnN0YW50cyc7XG5cbi8vIEdlbmVyYXRvclxuZXhwb3J0IHsgRnJvbnRtYXR0ZXJHZW5lcmF0b3IgfSBmcm9tICcuL2dlbmVyYXRvcic7XG5cbi8vIFByb21wdHNcbmV4cG9ydCB7IGdldEVucmljaG1lbnRQcm9tcHQsIGdldFNhbXBsZVByb21wdEZvckRvY1R5cGUgfSBmcm9tICcuL3Byb21wdHMnO1xuIl19
|
||||||
2
dist/prompts.d.ts
vendored
Normal file
2
dist/prompts.d.ts
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
export declare function getEnrichmentPrompt(content: string, docType?: string): string;
|
||||||
|
export declare function getSamplePromptForDocType(docType: string): string;
|
||||||
52
dist/prompts.js
vendored
Normal file
52
dist/prompts.js
vendored
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
"use strict";
|
||||||
|
Object.defineProperty(exports, "__esModule", { value: true });
|
||||||
|
exports.getEnrichmentPrompt = getEnrichmentPrompt;
|
||||||
|
exports.getSamplePromptForDocType = getSamplePromptForDocType;
|
||||||
|
/**
 * Build the prompt that asks an LLM for enrichment metadata about a document.
 *
 * The prompt states the analyst role, spells out the JSON structure the model
 * must answer with (summary, tags, category, audience, doc_purpose,
 * complexity, actionable, key_technologies), lists per-field guidelines, and
 * then embeds the document between two `---` fence lines.
 *
 * @param content Document text, inserted verbatim between the `---` fences.
 * @param docType Label written after "Document type:"; defaults to 'markdown'.
 * @returns The complete prompt text.
 */
function getEnrichmentPrompt(content, docType = 'markdown') {
    // Static part: role, required JSON skeleton and per-field guidance.
    const instructions = [
        'You are a document analyst preparing metadata for a RAG search system.',
        'Extract structured metadata to help users find and understand this document.',
        '',
        'Respond with this exact JSON structure:',
        '',
        '{',
        '  "summary": "2-3 sentences: What is this document about? What problem does it solve?",',
        '  "tags": ["5-10 SPECIFIC terms from this document for search - not generic words"],',
        '  "category": "technical | research | code | data | changelog | troubleshooting | reference | tutorial",',
        '  "audience": "all | beginner | intermediate | expert",',
        '  "doc_purpose": "reference | tutorial | troubleshooting | conceptual | guide | specification",',
        '  "complexity": 1-5,',
        '  "actionable": true or false,',
        '  "key_technologies": ["specific tools, languages, frameworks mentioned"]',
        '}',
        '',
        'Guidelines:',
        '- tags: Extract SPECIFIC terms that appear in the document, not generic descriptions',
        '- category: Pick the single best match',
        '- audience: Default to "all" unless clearly targeted to specific skill level',
        '- complexity: 1=overview, 2=beginner guide, 3=intermediate, 4=advanced, 5=deep implementation',
        '- actionable: true if reader should DO something, false if just informational',
        '- key_technologies: Only include specific named technologies, not generic terms'
    ].join('\n');
    // Dynamic part: kept as a template literal so both arguments are
    // stringified by interpolation, docType first and content second.
    const documentSection = `Document type: ${docType}\n\n---\n${content}\n---`;
    const closing = 'Respond with valid JSON only. No explanation or markdown formatting.';
    // Each of the three sections is separated by exactly one blank line.
    return [instructions, documentSection, closing].join('\n\n');
}
|
||||||
|
/**
 * Translate a short document-type key into a human-readable label.
 *
 * Known keys: pdf, word, email, image, audio, video, presentation, excel,
 * markdown. Any other key yields the generic label 'document'.
 *
 * @param docType Document-type key, e.g. 'pdf' or 'email'.
 * @returns The label for the key, or 'document' when the key is not known.
 */
function getSamplePromptForDocType(docType) {
    // A Map has no inherited keys. A plain-object lookup would return
    // Object.prototype members for names such as 'constructor', 'toString'
    // or '__proto__' instead of falling back to 'document'.
    const labels = new Map([
        ['pdf', 'PDF document'],
        ['word', 'Word document'],
        ['email', 'Email message'],
        ['image', 'Image with OCR text'],
        ['audio', 'Audio transcript'],
        ['video', 'Video transcript'],
        ['presentation', 'Presentation slides'],
        ['excel', 'Spreadsheet data'],
        ['markdown', 'Markdown document']
    ]);
    return labels.get(docType) || 'document';
}
|
||||||
|
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoicHJvbXB0cy5qcyIsInNvdXJjZVJvb3QiOiIiLCJzb3VyY2VzIjpbIi4uL3NyYy9wcm9tcHRzLnRzIl0sIm5hbWVzIjpbXSwibWFwcGluZ3MiOiI7O0FBQUEsa0RBZ0NDO0FBRUQsOERBYUM7QUEvQ0QsU0FBZ0IsbUJBQW1CLENBQUMsT0FBZSxFQUFFLFVBQWtCLFVBQVU7SUFDL0UsT0FBTzs7Ozs7Ozs7Ozs7Ozs7Ozs7Ozs7Ozs7O2lCQXdCUSxPQUFPOzs7RUFHdEIsT0FBTzs7O3FFQUc0RCxDQUFDO0FBQ3RFLENBQUM7QUFFRCxTQUFnQix5QkFBeUIsQ0FBQyxPQUFlO0lBQ3ZELE1BQU0sT0FBTyxHQUEyQjtRQUN0QyxHQUFHLEVBQUUsY0FBYztRQUNuQixJQUFJLEVBQUUsZUFBZTtRQUNyQixLQUFLLEVBQUUsZUFBZTtRQUN0QixLQUFLLEVBQUUscUJBQXFCO1FBQzVCLEtBQUssRUFBRSxrQkFBa0I7UUFDekIsS0FBSyxFQUFFLGtCQUFrQjtRQUN6QixZQUFZLEVBQUUscUJBQXFCO1FBQ25DLEtBQUssRUFBRSxrQkFBa0I7UUFDekIsUUFBUSxFQUFFLG1CQUFtQjtLQUM5QixDQUFDO0lBQ0YsT0FBTyxPQUFPLENBQUMsT0FBTyxDQUFDLElBQUksVUFBVSxDQUFDO0FBQ3hDLENBQUMiLCJzb3VyY2VzQ29udGVudCI6WyJleHBvcnQgZnVuY3Rpb24gZ2V0RW5yaWNobWVudFByb21wdChjb250ZW50OiBzdHJpbmcsIGRvY1R5cGU6IHN0cmluZyA9ICdtYXJrZG93bicpOiBzdHJpbmcge1xuICByZXR1cm4gYFlvdSBhcmUgYSBkb2N1bWVudCBhbmFseXN0IHByZXBhcmluZyBtZXRhZGF0YSBmb3IgYSBSQUcgc2VhcmNoIHN5c3RlbS5cbkV4dHJhY3Qgc3RydWN0dXJlZCBtZXRhZGF0YSB0byBoZWxwIHVzZXJzIGZpbmQgYW5kIHVuZGVyc3RhbmQgdGhpcyBkb2N1bWVudC5cblxuUmVzcG9uZCB3aXRoIHRoaXMgZXhhY3QgSlNPTiBzdHJ1Y3R1cmU6XG5cbntcbiAgXCJzdW1tYXJ5XCI6IFwiMi0zIHNlbnRlbmNlczogV2hhdCBpcyB0aGlzIGRvY3VtZW50IGFib3V0PyBXaGF0IHByb2JsZW0gZG9lcyBpdCBzb2x2ZT9cIixcbiAgXCJ0YWdzXCI6IFtcIjUtMTAgU1BFQ0lGSUMgdGVybXMgZnJvbSB0aGlzIGRvY3VtZW50IGZvciBzZWFyY2ggLSBub3QgZ2VuZXJpYyB3b3Jkc1wiXSxcbiAgXCJjYXRlZ29yeVwiOiBcInRlY2huaWNhbCB8IHJlc2VhcmNoIHwgY29kZSB8IGRhdGEgfCBjaGFuZ2Vsb2cgfCB0cm91Ymxlc2hvb3RpbmcgfCByZWZlcmVuY2UgfCB0dXRvcmlhbFwiLFxuICBcImF1ZGllbmNlXCI6IFwiYWxsIHwgYmVnaW5uZXIgfCBpbnRlcm1lZGlhdGUgfCBleHBlcnRcIixcbiAgXCJkb2NfcHVycG9zZVwiOiBcInJlZmVyZW5jZSB8IHR1dG9yaWFsIHwgdHJvdWJsZXNob290aW5nIHwgY29uY2VwdHVhbCB8IGd1aWRlIHwgc3BlY2lmaWNhdGlvblwiLFxuICBcImNvbXBsZXhpdHlcIjogMS01LFxuICBcImFjdGlvbmFibGVcIjogdHJ1ZSBvciBmYWxzZSxcbiAgXCJrZXlfdGVjaG5vbG9naWVzXCI6IFtcInNwZWNpZmljIHRvb2xzLCBsYW5ndWFnZX
MsIGZyYW1ld29ya3MgbWVudGlvbmVkXCJdXG59XG5cbkd1aWRlbGluZXM6XG4tIHRhZ3M6IEV4dHJhY3QgU1BFQ0lGSUMgdGVybXMgdGhhdCBhcHBlYXIgaW4gdGhlIGRvY3VtZW50LCBub3QgZ2VuZXJpYyBkZXNjcmlwdGlvbnNcbi0gY2F0ZWdvcnk6IFBpY2sgdGhlIHNpbmdsZSBiZXN0IG1hdGNoXG4tIGF1ZGllbmNlOiBEZWZhdWx0IHRvIFwiYWxsXCIgdW5sZXNzIGNsZWFybHkgdGFyZ2V0ZWQgdG8gc3BlY2lmaWMgc2tpbGwgbGV2ZWxcbi0gY29tcGxleGl0eTogMT1vdmVydmlldywgMj1iZWdpbm5lciBndWlkZSwgMz1pbnRlcm1lZGlhdGUsIDQ9YWR2YW5jZWQsIDU9ZGVlcCBpbXBsZW1lbnRhdGlvblxuLSBhY3Rpb25hYmxlOiB0cnVlIGlmIHJlYWRlciBzaG91bGQgRE8gc29tZXRoaW5nLCBmYWxzZSBpZiBqdXN0IGluZm9ybWF0aW9uYWxcbi0ga2V5X3RlY2hub2xvZ2llczogT25seSBpbmNsdWRlIHNwZWNpZmljIG5hbWVkIHRlY2hub2xvZ2llcywgbm90IGdlbmVyaWMgdGVybXNcblxuRG9jdW1lbnQgdHlwZTogJHtkb2NUeXBlfVxuXG4tLS1cbiR7Y29udGVudH1cbi0tLVxuXG5SZXNwb25kIHdpdGggdmFsaWQgSlNPTiBvbmx5LiBObyBleHBsYW5hdGlvbiBvciBtYXJrZG93biBmb3JtYXR0aW5nLmA7XG59XG5cbmV4cG9ydCBmdW5jdGlvbiBnZXRTYW1wbGVQcm9tcHRGb3JEb2NUeXBlKGRvY1R5cGU6IHN0cmluZyk6IHN0cmluZyB7XG4gIGNvbnN0IHNhbXBsZXM6IFJlY29yZDxzdHJpbmcsIHN0cmluZz4gPSB7XG4gICAgcGRmOiAnUERGIGRvY3VtZW50JyxcbiAgICB3b3JkOiAnV29yZCBkb2N1bWVudCcsXG4gICAgZW1haWw6ICdFbWFpbCBtZXNzYWdlJyxcbiAgICBpbWFnZTogJ0ltYWdlIHdpdGggT0NSIHRleHQnLFxuICAgIGF1ZGlvOiAnQXVkaW8gdHJhbnNjcmlwdCcsXG4gICAgdmlkZW86ICdWaWRlbyB0cmFuc2NyaXB0JyxcbiAgICBwcmVzZW50YXRpb246ICdQcmVzZW50YXRpb24gc2xpZGVzJyxcbiAgICBleGNlbDogJ1NwcmVhZHNoZWV0IGRhdGEnLFxuICAgIG1hcmtkb3duOiAnTWFya2Rvd24gZG9jdW1lbnQnXG4gIH07XG4gIHJldHVybiBzYW1wbGVzW2RvY1R5cGVdIHx8ICdkb2N1bWVudCc7XG59XG4iXX0=
|
||||||
29
dist/types.d.ts
vendored
Normal file
29
dist/types.d.ts
vendored
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
export type OutputMode = 'none' | 'balanced' | 'complete';
|
||||||
|
export type AudienceLevel = 'all' | 'beginner' | 'intermediate' | 'expert';
|
||||||
|
export type DocPurpose = 'reference' | 'tutorial' | 'troubleshooting' | 'conceptual' | 'guide' | 'specification';
|
||||||
|
export type ProfileType = 'scraped' | 'research' | 'technical' | 'code' | 'data' | 'changelog' | 'legal' | 'test' | 'schema' | 'troubleshoot' | 'meeting' | 'faq' | 'config';
|
||||||
|
export interface FrontmatterOptions {
|
||||||
|
generator: string;
|
||||||
|
version: string;
|
||||||
|
title: string;
|
||||||
|
sourcePath?: string | null;
|
||||||
|
profile?: ProfileType;
|
||||||
|
extractionConfidence?: number;
|
||||||
|
contentQuality?: number;
|
||||||
|
}
|
||||||
|
export interface DeterministicFields {
|
||||||
|
word_count?: number;
|
||||||
|
page_count?: number;
|
||||||
|
character_count?: number;
|
||||||
|
[key: string]: any;
|
||||||
|
}
|
||||||
|
export interface LLMEnrichment {
|
||||||
|
summary?: string;
|
||||||
|
tags?: string[];
|
||||||
|
category?: string;
|
||||||
|
audience?: AudienceLevel;
|
||||||
|
doc_purpose?: DocPurpose;
|
||||||
|
complexity?: number;
|
||||||
|
actionable?: boolean;
|
||||||
|
key_technologies?: string[];
|
||||||
|
}
|
||||||
3
dist/types.js
vendored
Normal file
3
dist/types.js
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
"use strict";
|
||||||
|
Object.defineProperty(exports, "__esModule", { value: true });
|
||||||
|
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoidHlwZXMuanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi9zcmMvdHlwZXMudHMiXSwibmFtZXMiOltdLCJtYXBwaW5ncyI6IiIsInNvdXJjZXNDb250ZW50IjpbImV4cG9ydCB0eXBlIE91dHB1dE1vZGUgPSAnbm9uZScgfCAnYmFsYW5jZWQnIHwgJ2NvbXBsZXRlJztcblxuZXhwb3J0IHR5cGUgQXVkaWVuY2VMZXZlbCA9ICdhbGwnIHwgJ2JlZ2lubmVyJyB8ICdpbnRlcm1lZGlhdGUnIHwgJ2V4cGVydCc7XG5cbmV4cG9ydCB0eXBlIERvY1B1cnBvc2UgPVxuICB8ICdyZWZlcmVuY2UnXG4gIHwgJ3R1dG9yaWFsJ1xuICB8ICd0cm91Ymxlc2hvb3RpbmcnXG4gIHwgJ2NvbmNlcHR1YWwnXG4gIHwgJ2d1aWRlJ1xuICB8ICdzcGVjaWZpY2F0aW9uJztcblxuZXhwb3J0IHR5cGUgUHJvZmlsZVR5cGUgPVxuICB8ICdzY3JhcGVkJ1xuICB8ICdyZXNlYXJjaCdcbiAgfCAndGVjaG5pY2FsJ1xuICB8ICdjb2RlJ1xuICB8ICdkYXRhJ1xuICB8ICdjaGFuZ2Vsb2cnXG4gIHwgJ2xlZ2FsJ1xuICB8ICd0ZXN0J1xuICB8ICdzY2hlbWEnXG4gIHwgJ3Ryb3VibGVzaG9vdCdcbiAgfCAnbWVldGluZydcbiAgfCAnZmFxJ1xuICB8ICdjb25maWcnO1xuXG5leHBvcnQgaW50ZXJmYWNlIEZyb250bWF0dGVyT3B0aW9ucyB7XG4gIGdlbmVyYXRvcjogc3RyaW5nO1xuICB2ZXJzaW9uOiBzdHJpbmc7XG4gIHRpdGxlOiBzdHJpbmc7XG4gIHNvdXJjZVBhdGg/OiBzdHJpbmcgfCBudWxsO1xuICBwcm9maWxlPzogUHJvZmlsZVR5cGU7XG4gIGV4dHJhY3Rpb25Db25maWRlbmNlPzogbnVtYmVyO1xuICBjb250ZW50UXVhbGl0eT86IG51bWJlcjtcbn1cblxuZXhwb3J0IGludGVyZmFjZSBEZXRlcm1pbmlzdGljRmllbGRzIHtcbiAgd29yZF9jb3VudD86IG51bWJlcjtcbiAgcGFnZV9jb3VudD86IG51bWJlcjtcbiAgY2hhcmFjdGVyX2NvdW50PzogbnVtYmVyO1xuICBba2V5OiBzdHJpbmddOiBhbnk7XG59XG5cbmV4cG9ydCBpbnRlcmZhY2UgTExNRW5yaWNobWVudCB7XG4gIHN1bW1hcnk/OiBzdHJpbmc7XG4gIHRhZ3M/OiBzdHJpbmdbXTtcbiAgY2F0ZWdvcnk/OiBzdHJpbmc7XG4gIGF1ZGllbmNlPzogQXVkaWVuY2VMZXZlbDtcbiAgZG9jX3B1cnBvc2U/OiBEb2NQdXJwb3NlO1xuICBjb21wbGV4aXR5PzogbnVtYmVyO1xuICBhY3Rpb25hYmxlPzogYm9vbGVhbjtcbiAga2V5X3RlY2hub2xvZ2llcz86IHN0cmluZ1tdO1xufVxuIl19
|
||||||
3879
package-lock.json
generated
Normal file
3879
package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
42
package.json
Normal file
42
package.json
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
{
|
||||||
|
"name": "@bobai/frontmatter",
|
||||||
|
"version": "1.1.0",
|
||||||
|
"description": "BOBAI Markdown Standard v1.1 frontmatter generator",
|
||||||
|
"main": "dist/index.js",
|
||||||
|
"types": "dist/index.d.ts",
|
||||||
|
"files": ["dist"],
|
||||||
|
"scripts": {
|
||||||
|
"build": "tsc",
|
||||||
|
"clean": "rm -rf dist",
|
||||||
|
"test": "jest",
|
||||||
|
"test:watch": "jest --watch",
|
||||||
|
"test:coverage": "jest --coverage",
|
||||||
|
"prepublishOnly": "npm run clean && npm run build"
|
||||||
|
},
|
||||||
|
"keywords": ["bobai", "frontmatter", "markdown", "yaml", "metadata"],
|
||||||
|
"author": "BobAI",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"js-yaml": "^4.1.0"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"@types/jest": "^29.5.0",
|
||||||
|
"@types/js-yaml": "^4.0.9",
|
||||||
|
"@types/node": "^20.0.0",
|
||||||
|
"jest": "^29.7.0",
|
||||||
|
"ts-jest": "^29.1.0",
|
||||||
|
"typescript": "^5.0.0"
|
||||||
|
},
|
||||||
|
"jest": {
|
||||||
|
"preset": "ts-jest",
|
||||||
|
"testEnvironment": "node",
|
||||||
|
"roots": ["<rootDir>/tests"],
|
||||||
|
"testMatch": ["**/*.test.ts"],
|
||||||
|
"collectCoverageFrom": ["src/**/*.ts"],
|
||||||
|
"coverageDirectory": "coverage",
|
||||||
|
"coverageReporters": ["text", "lcov"]
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": ">=18.0.0"
|
||||||
|
}
|
||||||
|
}
|
||||||
130
src/constants.ts
Normal file
130
src/constants.ts
Normal file
@ -0,0 +1,130 @@
|
|||||||
|
import { AudienceLevel, DocPurpose, ProfileType } from './types';
|
||||||
|
|
||||||
|
/** Every value of `AudienceLevel`, for validating an `audience` field at runtime. */
export const AUDIENCE_VALUES: AudienceLevel[] = [
  'all',
  'beginner',
  'intermediate',
  'expert',
];
|
||||||
|
|
||||||
|
/** Every value of `DocPurpose`, for validating a `doc_purpose` field at runtime. */
export const DOC_PURPOSE_VALUES: DocPurpose[] = [
  'reference',
  'tutorial',
  'troubleshooting',
  'conceptual',
  'guide',
  'specification',
];
|
||||||
|
|
||||||
|
/** Every value of `ProfileType`, for validating a `profile` field at runtime. */
export const PROFILE_VALUES: ProfileType[] = [
  'scraped',
  'research',
  'technical',
  'code',
  'data',
  'changelog',
  'legal',
  'test',
  'schema',
  'troubleshoot',
  'meeting',
  'faq',
  'config',
];
|
||||||
|
|
||||||
|
/**
 * Fallback values used when the caller does not supply one.
 *
 * `profile`, `extractionConfidence` and `contentQuality` are read by
 * `FrontmatterGenerator.generate` when the matching option is absent.
 * `audience` and `complexity` share their names with the LLM enrichment
 * fields; where they are consumed is not visible from this module.
 *
 * The object is given a declared type rather than per-property `as`
 * assertions, so the whole literal (including any extra or missing property)
 * is checked by the compiler. The resulting type is unchanged.
 */
export const DEFAULTS: {
  profile: ProfileType;
  audience: AudienceLevel;
  extractionConfidence: number;
  contentQuality: number;
  complexity: number;
} = {
  profile: 'data',
  audience: 'all',
  extractionConfidence: 1.0,
  contentQuality: 1.5,
  complexity: 3,
};
|
||||||
|
|
||||||
|
/**
 * Default document profile for each FSS parser, keyed by generator name.
 *
 * Only the ten names below have an entry; looking up any other name yields
 * `undefined` at runtime even though the declared value type is `ProfileType`.
 */
export const PARSER_PROFILES: Record<string, ProfileType> = {
  // Paginated text documents
  'fss-parse-pdf': 'technical',
  'fss-parse-word': 'technical',

  // Spreadsheets and images
  'fss-parse-excel': 'data',
  'fss-parse-image': 'data',

  // Recorded media
  'fss-parse-audio': 'meeting',
  'fss-parse-video': 'meeting',

  // Everything else
  'fss-parse-email': 'data',
  'fss-parse-presentation': 'technical',
  'fss-parse-data': 'data',
  'fss-parse-diagram': 'schema',
};
|
||||||
|
|
||||||
|
// Building blocks of BALANCED_FIELDS, one group per parser family.

/** Metadata that applies to any document. */
const UNIVERSAL_FIELDS = [
  'word_count', 'page_count', 'character_count',
  'author', 'subject', 'creator',
  'created', 'modified', 'file_size', 'format',
];

/** Structure fields (PDF, Word, Presentation). */
const STRUCTURE_FIELDS = [
  'has_tables', 'has_images', 'table_count', 'image_count',
  'section_count', 'has_toc', 'has_forms', 'has_tracked_changes',
  'paragraph_count', 'heading_count',
];

/** Excel/Data fields. */
const TABULAR_FIELDS = [
  'sheet_count', 'row_count', 'column_count', 'record_count', 'format_detected',
];

/** Image fields. */
const IMAGE_FIELDS = [
  'width', 'height', 'channels', 'has_alpha',
  'color_space', 'ocr_confidence', 'has_exif',
];

/** Audio fields. */
const AUDIO_FIELDS = [
  'duration', 'duration_seconds', 'bitrate', 'sample_rate',
  'codec', 'has_transcript', 'speaker_count', 'language',
];

/** Video fields. */
const VIDEO_FIELDS = [
  'fps', 'aspect_ratio', 'resolution', 'video_codec', 'audio_codec',
];

/** Presentation fields. */
const PRESENTATION_FIELDS = [
  'slide_count', 'total_slides', 'chart_count',
  'has_speaker_notes', 'has_animations',
];

/** Email fields. */
const EMAIL_FIELDS = [
  'from', 'to', 'cc', 'sender', 'recipients', 'date', 'message_id',
  'has_attachments', 'attachment_count', 'importance', 'thread_id',
];

/** Diagram fields. */
const DIAGRAM_FIELDS = [
  'diagram_count', 'diagram_type', 'valid_diagrams',
  'invalid_diagrams', 'node_count', 'edge_count',
];

/** Analysis fields. */
const ANALYSIS_FIELDS = [
  'encrypted', 'complexity_score', 'reading_time_minutes',
];

/**
 * Deterministic fields copied into the frontmatter in `balanced` mode
 * (`complete` mode takes every deterministic field instead).
 *
 * `FrontmatterGenerator.generate` walks this list in order, so the order of
 * the groups and of the names inside each group is part of the output.
 */
export const BALANCED_FIELDS = [
  ...UNIVERSAL_FIELDS,
  ...STRUCTURE_FIELDS,
  ...TABULAR_FIELDS,
  ...IMAGE_FIELDS,
  ...AUDIO_FIELDS,
  ...VIDEO_FIELDS,
  ...PRESENTATION_FIELDS,
  ...EMAIL_FIELDS,
  ...DIAGRAM_FIELDS,
  ...ANALYSIS_FIELDS,
];
|
||||||
123
src/generator.ts
Normal file
123
src/generator.ts
Normal file
@ -0,0 +1,123 @@
|
|||||||
|
import * as yaml from 'js-yaml';
|
||||||
|
import {
|
||||||
|
FrontmatterOptions,
|
||||||
|
DeterministicFields,
|
||||||
|
LLMEnrichment,
|
||||||
|
OutputMode
|
||||||
|
} from './types';
|
||||||
|
import { DEFAULTS, BALANCED_FIELDS } from './constants';
|
||||||
|
|
||||||
|
/**
 * Builds YAML frontmatter blocks (and frontmatter-prefixed markdown) from
 * caller-supplied options, deterministic parser fields and optional LLM
 * enrichment. All members are static; the class holds no state.
 */
export class FrontmatterGenerator {
  /**
   * Produces a `---` delimited YAML frontmatter block.
   *
   * @param options - Core fields. `profile`, `extractionConfidence` and
   *   `contentQuality` fall back to `DEFAULTS`; an empty `title` becomes
   *   `'Untitled'`; `sourcePath` is emitted as `source_file` only when truthy.
   * @param deterministic - Parser-computed fields. In `'complete'` mode every
   *   field is included except keys starting with `_` and null/undefined
   *   values; in any other non-`'none'` mode only keys listed in
   *   `BALANCED_FIELDS` are copied.
   * @param enrichment - Optional LLM output. When given, only the populated
   *   fields are emitted; when omitted, empty `summary`, `tags` and `category`
   *   placeholders are written instead.
   * @param mode - `'none'` short-circuits to an empty string.
   * @returns The frontmatter block, or `''` for `'none'` mode.
   * @throws RangeError if a field holds an invalid `Date`, because
   *   `Date.prototype.toISOString` rejects it.
   */
  static generate(
    options: FrontmatterOptions,
    deterministic: DeterministicFields = {},
    enrichment?: LLMEnrichment,
    mode: OutputMode = 'balanced'
  ): string {
    if (mode === 'none') {
      return '';
    }

    const frontmatter: Record<string, any> = {
      // Core required fields
      profile: options.profile || DEFAULTS.profile,
      created: new Date().toISOString(),
      generator: options.generator,
      version: options.version,
      title: options.title || 'Untitled',
      extraction_confidence: options.extractionConfidence ?? DEFAULTS.extractionConfidence,
      content_quality: options.contentQuality ?? DEFAULTS.contentQuality,
    };

    // Source file
    if (options.sourcePath) {
      frontmatter.source_file = options.sourcePath;
    }

    // Add deterministic fields based on mode
    if (mode === 'complete') {
      // Include all deterministic fields (minus `_`-prefixed and nullish ones)
      const cleaned = this.cleanObject(deterministic);
      Object.assign(frontmatter, cleaned);
    } else {
      // Balanced mode - include only key fields
      for (const field of BALANCED_FIELDS) {
        if (deterministic[field] !== undefined && deterministic[field] !== null) {
          frontmatter[field] = deterministic[field];
        }
      }
    }

    // LLM enrichment fields (flat, not nested)
    if (enrichment) {
      if (enrichment.summary) frontmatter.summary = enrichment.summary;
      if (enrichment.tags?.length) frontmatter.tags = enrichment.tags;
      if (enrichment.category) frontmatter.category = enrichment.category;
      if (enrichment.audience) frontmatter.audience = enrichment.audience;
      if (enrichment.doc_purpose) frontmatter.doc_purpose = enrichment.doc_purpose;
      if (enrichment.complexity) frontmatter.complexity = enrichment.complexity;
      // `false` is a meaningful value here, so only `undefined` is skipped.
      if (enrichment.actionable !== undefined) frontmatter.actionable = enrichment.actionable;
      if (enrichment.key_technologies?.length) {
        frontmatter.key_technologies = enrichment.key_technologies;
      }
    } else {
      // Placeholders for LLM enrichment
      frontmatter.summary = '';
      frontmatter.tags = [];
      frontmatter.category = '';
    }

    // removeNulls also normalises Date values so they serialise as ISO strings
    // in every mode, not just the ones that went through cleanObject.
    const yamlStr = yaml.dump(this.removeNulls(frontmatter), {
      indent: 2,
      lineWidth: -1,
      quotingType: "'",
      sortKeys: false
    });

    return `---\n${yamlStr}---`;
  }

  /**
   * Prepends generated frontmatter to `content`, separated by a blank line.
   *
   * @param options - Forwarded to {@link FrontmatterGenerator.generate}.
   * @param deterministic - Forwarded to `generate`.
   * @param content - Markdown body.
   * @param enrichment - Forwarded to `generate`.
   * @param mode - Forwarded to `generate`; for `'none'` the content is
   *   returned unchanged.
   * @returns `content` alone when no frontmatter was generated, otherwise
   *   `<frontmatter>\n\n<content>`.
   */
  static generateMarkdown(
    options: FrontmatterOptions,
    deterministic: DeterministicFields,
    content: string,
    enrichment?: LLMEnrichment,
    mode: OutputMode = 'balanced'
  ): string {
    const fm = this.generate(options, deterministic, enrichment, mode);
    if (!fm) return content;
    return `${fm}\n\n${content}`;
  }

  /**
   * Returns a copy of `obj` without keys that start with `_` and without
   * null/undefined values. `Date` values become ISO strings, nested
   * non-array objects are cleaned recursively, and everything else
   * (including arrays) is copied by reference.
   */
  private static cleanObject(obj: any): any {
    const result: any = {};
    for (const [k, v] of Object.entries(obj)) {
      if (k.startsWith('_')) continue;
      if (v === null || v === undefined) continue;
      if (v instanceof Date) {
        result[k] = v.toISOString();
      } else if (v && typeof v === 'object' && !Array.isArray(v)) {
        result[k] = this.cleanObject(v);
      } else {
        result[k] = v;
      }
    }
    return result;
  }

  /**
   * Recursively strips null/undefined entries from arrays and objects.
   *
   * `Date` instances are converted to ISO strings before the generic object
   * branch: a Date has no own enumerable properties, so rebuilding it from
   * `Object.entries` would silently turn it into `{}`.
   */
  private static removeNulls(obj: any): any {
    if (obj instanceof Date) {
      return obj.toISOString();
    }
    if (Array.isArray(obj)) {
      return obj.filter(x => x != null).map(x => this.removeNulls(x));
    }
    if (obj && typeof obj === 'object') {
      const result: any = {};
      for (const [k, v] of Object.entries(obj)) {
        if (v != null) result[k] = this.removeNulls(v);
      }
      return result;
    }
    return obj;
  }
}
|
||||||
26
src/index.ts
Normal file
26
src/index.ts
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
// Types
|
||||||
|
export {
|
||||||
|
OutputMode,
|
||||||
|
AudienceLevel,
|
||||||
|
DocPurpose,
|
||||||
|
ProfileType,
|
||||||
|
FrontmatterOptions,
|
||||||
|
DeterministicFields,
|
||||||
|
LLMEnrichment
|
||||||
|
} from './types';
|
||||||
|
|
||||||
|
// Constants
|
||||||
|
export {
|
||||||
|
AUDIENCE_VALUES,
|
||||||
|
DOC_PURPOSE_VALUES,
|
||||||
|
PROFILE_VALUES,
|
||||||
|
DEFAULTS,
|
||||||
|
BALANCED_FIELDS,
|
||||||
|
PARSER_PROFILES
|
||||||
|
} from './constants';
|
||||||
|
|
||||||
|
// Generator
|
||||||
|
export { FrontmatterGenerator } from './generator';
|
||||||
|
|
||||||
|
// Prompts
|
||||||
|
export { getEnrichmentPrompt, getSamplePromptForDocType } from './prompts';
|
||||||
48
src/prompts.ts
Normal file
48
src/prompts.ts
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
/**
 * Assembles the LLM prompt that asks for enrichment metadata as JSON.
 *
 * @param content - Document text, inserted verbatim between `---` lines.
 * @param docType - Label written after "Document type:"; defaults to `'markdown'`.
 * @returns The complete prompt text, lines joined with `\n`.
 */
export function getEnrichmentPrompt(content: string, docType: string = 'markdown'): string {
  // NOTE(review): leading whitespace inside the prompt text is reproduced as
  // seen (none) - confirm against the original file before applying.
  const preamble = [
    'You are a document analyst preparing metadata for a RAG search system.',
    'Extract structured metadata to help users find and understand this document.',
    '',
    'Respond with this exact JSON structure:',
    '',
  ];

  const jsonShape = [
    '{',
    '"summary": "2-3 sentences: What is this document about? What problem does it solve?",',
    '"tags": ["5-10 SPECIFIC terms from this document for search - not generic words"],',
    '"category": "technical | research | code | data | changelog | troubleshooting | reference | tutorial",',
    '"audience": "all | beginner | intermediate | expert",',
    '"doc_purpose": "reference | tutorial | troubleshooting | conceptual | guide | specification",',
    '"complexity": 1-5,',
    '"actionable": true or false,',
    '"key_technologies": ["specific tools, languages, frameworks mentioned"]',
    '}',
    '',
  ];

  const guidance = [
    'Guidelines:',
    '- tags: Extract SPECIFIC terms that appear in the document, not generic descriptions',
    '- category: Pick the single best match',
    '- audience: Default to "all" unless clearly targeted to specific skill level',
    '- complexity: 1=overview, 2=beginner guide, 3=intermediate, 4=advanced, 5=deep implementation',
    '- actionable: true if reader should DO something, false if just informational',
    '- key_technologies: Only include specific named technologies, not generic terms',
    '',
  ];

  const documentPart = [
    `Document type: ${docType}`,
    '',
    '---',
    `${content}`,
    '---',
    '',
    'Respond with valid JSON only. No explanation or markdown formatting.',
  ];

  return [...preamble, ...jsonShape, ...guidance, ...documentPart].join('\n');
}
|
||||||
|
|
||||||
|
/**
 * Maps a document-type key to a short human-readable label.
 *
 * @param docType - Key such as `'pdf'`, `'word'` or `'email'`.
 * @returns The label for a known key, otherwise the generic `'document'`.
 */
export function getSamplePromptForDocType(docType: string): string {
  // A Map is used instead of an object literal so that names inherited from
  // Object.prototype ('constructor', 'toString', '__proto__', ...) are not
  // mistaken for known document types.
  const samples = new Map<string, string>([
    ['pdf', 'PDF document'],
    ['word', 'Word document'],
    ['email', 'Email message'],
    ['image', 'Image with OCR text'],
    ['audio', 'Audio transcript'],
    ['video', 'Video transcript'],
    ['presentation', 'Presentation slides'],
    ['excel', 'Spreadsheet data'],
    ['markdown', 'Markdown document'],
  ]);
  return samples.get(docType) ?? 'document';
}
|
||||||
54
src/types.ts
Normal file
54
src/types.ts
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
/**
 * How much frontmatter to emit: `'none'` yields no frontmatter at all,
 * `'balanced'` keeps a curated subset of deterministic fields, and
 * `'complete'` keeps all of them.
 */
export type OutputMode =
  | 'none'
  | 'balanced'
  | 'complete';
|
||||||
|
|
||||||
|
/** Reader skill level a document is aimed at. */
export type AudienceLevel =
  | 'all'
  | 'beginner'
  | 'intermediate'
  | 'expert';
|
||||||
|
|
||||||
|
/** What a document is for, as reported in the `doc_purpose` enrichment field. */
export type DocPurpose =
  | 'reference' | 'tutorial' | 'troubleshooting'
  | 'conceptual' | 'guide' | 'specification';
|
||||||
|
|
||||||
|
/** Allowed values for the `profile` frontmatter field. */
export type ProfileType =
  | 'scraped' | 'research' | 'technical' | 'code'
  | 'data' | 'changelog' | 'legal' | 'test'
  | 'schema' | 'troubleshoot' | 'meeting' | 'faq'
  | 'config';
|
||||||
|
|
||||||
|
/** Caller-supplied core fields for a frontmatter block. */
export interface FrontmatterOptions {
  /** Name of the producing tool; written to the `generator` key. */
  generator: string;

  /** Written to the `version` key. */
  version: string;

  /** Document title; an empty title is emitted as `'Untitled'`. */
  title: string;

  /** Origin path; written to `source_file` only when non-empty. */
  sourcePath?: string | null;

  /** Profile for the document; the package default applies when omitted. */
  profile?: ProfileType;

  /** Written to `extraction_confidence`; the package default applies when omitted. */
  extractionConfidence?: number;

  /** Written to `content_quality`; the package default applies when omitted. */
  contentQuality?: number;
}
|
||||||
|
|
||||||
|
/**
 * Parser-computed metadata. A few common counters are typed explicitly;
 * any other key is accepted through the index signature.
 */
export interface DeterministicFields {
  /** Number of words in the document. */
  word_count?: number;

  /** Number of pages in the document. */
  page_count?: number;

  /** Number of characters in the document. */
  character_count?: number;

  /** Open-ended parser-specific fields. */
  [key: string]: any;
}
|
||||||
|
|
||||||
|
/** Optional metadata produced by an LLM pass; every field may be absent. */
export interface LLMEnrichment {
  /** Short description of the document. */
  summary?: string;

  /** Search terms for the document. */
  tags?: string[];

  /** Single best-fit category label. */
  category?: string;

  /** Intended reader skill level. */
  audience?: AudienceLevel;

  /** What the document is for. */
  doc_purpose?: DocPurpose;

  /** Numeric difficulty rating (the enrichment prompt asks for 1-5). */
  complexity?: number;

  /** Whether the reader is expected to act on the document. */
  actionable?: boolean;

  /** Named tools, languages or frameworks mentioned. */
  key_technologies?: string[];
}
|
||||||
1
tests/constants.test.d.ts
vendored
Normal file
1
tests/constants.test.d.ts
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
export {};
|
||||||
168
tests/constants.test.js
Normal file
168
tests/constants.test.js
Normal file
File diff suppressed because one or more lines are too long
192
tests/constants.test.ts
Normal file
192
tests/constants.test.ts
Normal file
@ -0,0 +1,192 @@
|
|||||||
|
import {
|
||||||
|
AUDIENCE_VALUES,
|
||||||
|
DOC_PURPOSE_VALUES,
|
||||||
|
PROFILE_VALUES,
|
||||||
|
DEFAULTS,
|
||||||
|
BALANCED_FIELDS,
|
||||||
|
PARSER_PROFILES
|
||||||
|
} from '../src';
|
||||||
|
|
||||||
|
// Suite covering the exported constant tables: allowed value lists,
// defaults, parser-to-profile mapping and the balanced-mode field list.
describe('Constants', () => {
  /** Asserts, in order, that every entry of `expected` is in `actual`. */
  const expectContainsAll = (actual: readonly unknown[], expected: readonly unknown[]): void => {
    for (const entry of expected) {
      expect(actual).toContain(entry);
    }
  };

  describe('AUDIENCE_VALUES', () => {
    it('should contain all valid audience levels', () => {
      expectContainsAll(AUDIENCE_VALUES, ['all', 'beginner', 'intermediate', 'expert']);
      expect(AUDIENCE_VALUES).toHaveLength(4);
    });
  });

  describe('DOC_PURPOSE_VALUES', () => {
    it('should contain all valid doc purposes', () => {
      expectContainsAll(DOC_PURPOSE_VALUES, [
        'reference',
        'tutorial',
        'troubleshooting',
        'conceptual',
        'guide',
        'specification'
      ]);
      expect(DOC_PURPOSE_VALUES).toHaveLength(6);
    });
  });

  describe('PROFILE_VALUES', () => {
    it('should contain all valid profile types', () => {
      const expectedProfiles = [
        'scraped', 'research', 'technical', 'code', 'data', 'changelog',
        'legal', 'test', 'schema', 'troubleshoot', 'meeting', 'faq', 'config'
      ];
      expectContainsAll(PROFILE_VALUES, expectedProfiles);
      expect(PROFILE_VALUES).toHaveLength(13);
    });
  });

  describe('DEFAULTS', () => {
    it('should have correct default values', () => {
      expect(DEFAULTS.profile).toBe('data');
      expect(DEFAULTS.audience).toBe('all');
      expect(DEFAULTS.extractionConfidence).toBe(1.0);
      expect(DEFAULTS.contentQuality).toBe(1.5);
      expect(DEFAULTS.complexity).toBe(3);
    });

    it('should have confidence in valid range', () => {
      const confidence = DEFAULTS.extractionConfidence;
      expect(confidence).toBeGreaterThanOrEqual(0);
      expect(confidence).toBeLessThanOrEqual(1);
    });

    it('should have quality in valid range', () => {
      const quality = DEFAULTS.contentQuality;
      expect(quality).toBeGreaterThanOrEqual(0);
      expect(quality).toBeLessThanOrEqual(2);
    });
  });

  describe('PARSER_PROFILES', () => {
    it('should have profiles for all parsers', () => {
      const expectedParsers = [
        'fss-parse-pdf',
        'fss-parse-word',
        'fss-parse-excel',
        'fss-parse-image',
        'fss-parse-audio',
        'fss-parse-video',
        'fss-parse-email',
        'fss-parse-presentation',
        'fss-parse-data',
        'fss-parse-diagram'
      ];
      for (const parser of expectedParsers) {
        expect(PARSER_PROFILES[parser]).toBeDefined();
      }
    });

    it('should use valid profile types', () => {
      for (const profile of Object.values(PARSER_PROFILES)) {
        expect(PROFILE_VALUES).toContain(profile);
      }
    });

    it('should have appropriate profiles for parser types', () => {
      const expectedMapping: Array<[string, string]> = [
        ['fss-parse-pdf', 'technical'],
        ['fss-parse-word', 'technical'],
        ['fss-parse-excel', 'data'],
        ['fss-parse-audio', 'meeting'],
        ['fss-parse-video', 'meeting'],
        ['fss-parse-email', 'data'],
        ['fss-parse-diagram', 'schema']
      ];
      for (const [parser, profile] of expectedMapping) {
        expect(PARSER_PROFILES[parser]).toBe(profile);
      }
    });
  });

  describe('BALANCED_FIELDS', () => {
    it('should include universal document fields', () => {
      expectContainsAll(BALANCED_FIELDS, [
        'word_count', 'page_count', 'character_count', 'author', 'format'
      ]);
    });

    it('should include PDF/Word structure fields', () => {
      expectContainsAll(BALANCED_FIELDS, [
        'has_tables', 'has_images', 'table_count', 'image_count', 'has_toc', 'has_forms'
      ]);
    });

    it('should include Excel/Data fields', () => {
      expectContainsAll(BALANCED_FIELDS, [
        'sheet_count', 'row_count', 'column_count', 'record_count', 'format_detected'
      ]);
    });

    it('should include image fields', () => {
      expectContainsAll(BALANCED_FIELDS, [
        'width', 'height', 'channels', 'has_alpha', 'ocr_confidence'
      ]);
    });

    it('should include audio fields', () => {
      expectContainsAll(BALANCED_FIELDS, [
        'duration', 'bitrate', 'sample_rate', 'codec', 'has_transcript', 'speaker_count'
      ]);
    });

    it('should include video fields', () => {
      expectContainsAll(BALANCED_FIELDS, [
        'fps', 'aspect_ratio', 'video_codec', 'audio_codec'
      ]);
    });

    it('should include email fields', () => {
      expectContainsAll(BALANCED_FIELDS, [
        'from', 'to', 'cc', 'sender', 'recipients', 'message_id',
        'has_attachments', 'attachment_count', 'importance'
      ]);
    });

    it('should include presentation fields', () => {
      expectContainsAll(BALANCED_FIELDS, [
        'slide_count', 'total_slides', 'chart_count', 'has_speaker_notes'
      ]);
    });

    it('should include diagram fields', () => {
      expectContainsAll(BALANCED_FIELDS, [
        'diagram_count', 'diagram_type', 'valid_diagrams', 'invalid_diagrams',
        'node_count', 'edge_count'
      ]);
    });

    it('should not contain duplicate fields', () => {
      const distinctCount = new Set(BALANCED_FIELDS).size;
      expect(distinctCount).toBe(BALANCED_FIELDS.length);
    });

    it('should have reasonable number of fields', () => {
      const total = BALANCED_FIELDS.length;
      // Should have enough fields to cover all parser types
      expect(total).toBeGreaterThan(50);
      // But not so many that balanced mode becomes complete mode
      expect(total).toBeLessThan(100);
    });
  });
});
|
||||||
1
tests/generator.test.d.ts
vendored
Normal file
1
tests/generator.test.d.ts
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
export {};
|
||||||
403
tests/generator.test.js
Normal file
403
tests/generator.test.js
Normal file
File diff suppressed because one or more lines are too long
525
tests/generator.test.ts
Normal file
525
tests/generator.test.ts
Normal file
@ -0,0 +1,525 @@
|
|||||||
|
import {
|
||||||
|
FrontmatterGenerator,
|
||||||
|
FrontmatterOptions,
|
||||||
|
DeterministicFields,
|
||||||
|
LLMEnrichment,
|
||||||
|
OutputMode,
|
||||||
|
DEFAULTS
|
||||||
|
} from '../src';
|
||||||
|
import * as yaml from 'js-yaml';
|
||||||
|
|
||||||
|
describe('FrontmatterGenerator', () => {
|
||||||
|
const baseOptions: FrontmatterOptions = {
|
||||||
|
generator: 'fss-parse-pdf',
|
||||||
|
version: '1.0.0',
|
||||||
|
title: 'Test Document'
|
||||||
|
};
|
||||||
|
|
||||||
|
describe('generate()', () => {
|
||||||
|
it('should return empty string for none mode', () => {
|
||||||
|
const result = FrontmatterGenerator.generate(baseOptions, {}, undefined, 'none');
|
||||||
|
expect(result).toBe('');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should generate valid YAML frontmatter', () => {
|
||||||
|
const result = FrontmatterGenerator.generate(baseOptions);
|
||||||
|
expect(result).toMatch(/^---\n/);
|
||||||
|
expect(result).toMatch(/\n---$/);
|
||||||
|
|
||||||
|
// Extract and parse YAML
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
expect(parsed).toBeDefined();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should include core required fields', () => {
|
||||||
|
const result = FrontmatterGenerator.generate(baseOptions);
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.profile).toBe(DEFAULTS.profile);
|
||||||
|
expect(parsed.generator).toBe('fss-parse-pdf');
|
||||||
|
expect(parsed.version).toBe('1.0.0');
|
||||||
|
expect(parsed.title).toBe('Test Document');
|
||||||
|
expect(parsed.extraction_confidence).toBe(DEFAULTS.extractionConfidence);
|
||||||
|
expect(parsed.content_quality).toBe(DEFAULTS.contentQuality);
|
||||||
|
expect(parsed.created).toBeDefined();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should include source_file when provided', () => {
|
||||||
|
const options: FrontmatterOptions = {
|
||||||
|
...baseOptions,
|
||||||
|
sourcePath: '/path/to/file.pdf'
|
||||||
|
};
|
||||||
|
const result = FrontmatterGenerator.generate(options);
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.source_file).toBe('/path/to/file.pdf');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should use custom profile when provided', () => {
|
||||||
|
const options: FrontmatterOptions = {
|
||||||
|
...baseOptions,
|
||||||
|
profile: 'technical'
|
||||||
|
};
|
||||||
|
const result = FrontmatterGenerator.generate(options);
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.profile).toBe('technical');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should add LLM enrichment placeholders when no enrichment provided', () => {
|
||||||
|
const result = FrontmatterGenerator.generate(baseOptions);
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.summary).toBe('');
|
||||||
|
expect(parsed.tags).toEqual([]);
|
||||||
|
expect(parsed.category).toBe('');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should include LLM enrichment fields when provided', () => {
|
||||||
|
const enrichment: LLMEnrichment = {
|
||||||
|
summary: 'Test summary',
|
||||||
|
tags: ['tag1', 'tag2'],
|
||||||
|
category: 'technical',
|
||||||
|
audience: 'expert',
|
||||||
|
doc_purpose: 'reference',
|
||||||
|
complexity: 4,
|
||||||
|
actionable: true,
|
||||||
|
key_technologies: ['TypeScript', 'Node.js']
|
||||||
|
};
|
||||||
|
const result = FrontmatterGenerator.generate(baseOptions, {}, enrichment);
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.summary).toBe('Test summary');
|
||||||
|
expect(parsed.tags).toEqual(['tag1', 'tag2']);
|
||||||
|
expect(parsed.category).toBe('technical');
|
||||||
|
expect(parsed.audience).toBe('expert');
|
||||||
|
expect(parsed.doc_purpose).toBe('reference');
|
||||||
|
expect(parsed.complexity).toBe(4);
|
||||||
|
expect(parsed.actionable).toBe(true);
|
||||||
|
expect(parsed.key_technologies).toEqual(['TypeScript', 'Node.js']);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('balanced mode', () => {
|
||||||
|
it('should include only balanced fields from deterministic', () => {
|
||||||
|
const deterministic: DeterministicFields = {
|
||||||
|
word_count: 1000,
|
||||||
|
page_count: 5,
|
||||||
|
character_count: 5000,
|
||||||
|
has_tables: true,
|
||||||
|
_internal_field: 'should be excluded',
|
||||||
|
rare_field: 'should be excluded in balanced'
|
||||||
|
};
|
||||||
|
const result = FrontmatterGenerator.generate(baseOptions, deterministic, undefined, 'balanced');
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.word_count).toBe(1000);
|
||||||
|
expect(parsed.page_count).toBe(5);
|
||||||
|
expect(parsed.character_count).toBe(5000);
|
||||||
|
expect(parsed.has_tables).toBe(true);
|
||||||
|
expect(parsed._internal_field).toBeUndefined();
|
||||||
|
expect(parsed.rare_field).toBeUndefined();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should exclude null and undefined balanced fields', () => {
|
||||||
|
const deterministic: DeterministicFields = {
|
||||||
|
word_count: 1000,
|
||||||
|
page_count: null as any,
|
||||||
|
character_count: undefined
|
||||||
|
};
|
||||||
|
const result = FrontmatterGenerator.generate(baseOptions, deterministic, undefined, 'balanced');
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.word_count).toBe(1000);
|
||||||
|
expect(parsed.page_count).toBeUndefined();
|
||||||
|
expect(parsed.character_count).toBeUndefined();
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('complete mode', () => {
|
||||||
|
it('should include all deterministic fields', () => {
|
||||||
|
const deterministic: DeterministicFields = {
|
||||||
|
word_count: 1000,
|
||||||
|
page_count: 5,
|
||||||
|
custom_field: 'included',
|
||||||
|
nested: { deep: 'value' }
|
||||||
|
};
|
||||||
|
const result = FrontmatterGenerator.generate(baseOptions, deterministic, undefined, 'complete');
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.word_count).toBe(1000);
|
||||||
|
expect(parsed.page_count).toBe(5);
|
||||||
|
expect(parsed.custom_field).toBe('included');
|
||||||
|
expect(parsed.nested).toEqual({ deep: 'value' });
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should exclude fields starting with underscore', () => {
|
||||||
|
const deterministic: DeterministicFields = {
|
||||||
|
word_count: 1000,
|
||||||
|
_private: 'excluded'
|
||||||
|
};
|
||||||
|
const result = FrontmatterGenerator.generate(baseOptions, deterministic, undefined, 'complete');
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.word_count).toBe(1000);
|
||||||
|
expect(parsed._private).toBeUndefined();
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should convert Date objects to ISO strings', () => {
|
||||||
|
const testDate = new Date('2024-01-15T10:30:00Z');
|
||||||
|
const deterministic: DeterministicFields = {
|
||||||
|
modified: testDate
|
||||||
|
};
|
||||||
|
const result = FrontmatterGenerator.generate(baseOptions, deterministic, undefined, 'complete');
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.modified).toBe('2024-01-15T10:30:00.000Z');
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('generateMarkdown()', () => {
|
||||||
|
it('should prepend frontmatter to content', () => {
|
||||||
|
const content = '# My Document\n\nContent here';
|
||||||
|
const result = FrontmatterGenerator.generateMarkdown(
|
||||||
|
baseOptions,
|
||||||
|
{ word_count: 100 },
|
||||||
|
content
|
||||||
|
);
|
||||||
|
|
||||||
|
expect(result).toMatch(/^---\n/);
|
||||||
|
expect(result).toContain('# My Document');
|
||||||
|
expect(result).toContain('Content here');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should return only content for none mode', () => {
|
||||||
|
const content = '# My Document\n\nContent here';
|
||||||
|
const result = FrontmatterGenerator.generateMarkdown(
|
||||||
|
baseOptions,
|
||||||
|
{ word_count: 100 },
|
||||||
|
content,
|
||||||
|
undefined,
|
||||||
|
'none'
|
||||||
|
);
|
||||||
|
|
||||||
|
expect(result).toBe(content);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should separate frontmatter and content with double newline', () => {
|
||||||
|
const content = '# My Document';
|
||||||
|
const result = FrontmatterGenerator.generateMarkdown(
|
||||||
|
baseOptions,
|
||||||
|
{},
|
||||||
|
content
|
||||||
|
);
|
||||||
|
|
||||||
|
expect(result).toMatch(/---\n\n#/);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('parser-specific scenarios', () => {
|
||||||
|
it('should handle PDF metadata', () => {
|
||||||
|
const pdfDeterministic: DeterministicFields = {
|
||||||
|
word_count: 5000,
|
||||||
|
page_count: 20,
|
||||||
|
character_count: 25000,
|
||||||
|
has_tables: true,
|
||||||
|
has_images: true,
|
||||||
|
table_count: 5,
|
||||||
|
image_count: 10,
|
||||||
|
has_toc: true,
|
||||||
|
has_forms: false,
|
||||||
|
encrypted: false,
|
||||||
|
author: 'John Doe'
|
||||||
|
};
|
||||||
|
const result = FrontmatterGenerator.generate(
|
||||||
|
{ ...baseOptions, generator: 'fss-parse-pdf', profile: 'technical' },
|
||||||
|
pdfDeterministic,
|
||||||
|
undefined,
|
||||||
|
'balanced'
|
||||||
|
);
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.word_count).toBe(5000);
|
||||||
|
expect(parsed.page_count).toBe(20);
|
||||||
|
expect(parsed.has_tables).toBe(true);
|
||||||
|
expect(parsed.table_count).toBe(5);
|
||||||
|
expect(parsed.author).toBe('John Doe');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should handle email metadata', () => {
|
||||||
|
const emailDeterministic: DeterministicFields = {
|
||||||
|
from: 'sender@example.com',
|
||||||
|
to: 'recipient@example.com',
|
||||||
|
date: '2024-01-15T10:30:00Z',
|
||||||
|
message_id: '<abc123@example.com>',
|
||||||
|
has_attachments: true,
|
||||||
|
attachment_count: 3,
|
||||||
|
word_count: 500,
|
||||||
|
importance: 'high'
|
||||||
|
};
|
||||||
|
const result = FrontmatterGenerator.generate(
|
||||||
|
{ ...baseOptions, generator: 'fss-parse-email' },
|
||||||
|
emailDeterministic,
|
||||||
|
undefined,
|
||||||
|
'balanced'
|
||||||
|
);
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.from).toBe('sender@example.com');
|
||||||
|
expect(parsed.to).toBe('recipient@example.com');
|
||||||
|
expect(parsed.has_attachments).toBe(true);
|
||||||
|
expect(parsed.attachment_count).toBe(3);
|
||||||
|
expect(parsed.importance).toBe('high');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should handle audio metadata', () => {
|
||||||
|
const audioDeterministic: DeterministicFields = {
|
||||||
|
duration: 3600,
|
||||||
|
duration_seconds: 3600,
|
||||||
|
bitrate: 320,
|
||||||
|
sample_rate: 44100,
|
||||||
|
codec: 'mp3',
|
||||||
|
has_transcript: true,
|
||||||
|
speaker_count: 3,
|
||||||
|
language: 'en'
|
||||||
|
};
|
||||||
|
const result = FrontmatterGenerator.generate(
|
||||||
|
{ ...baseOptions, generator: 'fss-parse-audio', profile: 'meeting' },
|
||||||
|
audioDeterministic,
|
||||||
|
undefined,
|
||||||
|
'balanced'
|
||||||
|
);
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.duration).toBe(3600);
|
||||||
|
expect(parsed.bitrate).toBe(320);
|
||||||
|
expect(parsed.has_transcript).toBe(true);
|
||||||
|
expect(parsed.speaker_count).toBe(3);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should handle image metadata', () => {
|
||||||
|
const imageDeterministic: DeterministicFields = {
|
||||||
|
width: 1920,
|
||||||
|
height: 1080,
|
||||||
|
format: 'png',
|
||||||
|
file_size: 2048000,
|
||||||
|
channels: 4,
|
||||||
|
has_alpha: true,
|
||||||
|
ocr_confidence: 0.95
|
||||||
|
};
|
||||||
|
const result = FrontmatterGenerator.generate(
|
||||||
|
{ ...baseOptions, generator: 'fss-parse-image' },
|
||||||
|
imageDeterministic,
|
||||||
|
undefined,
|
||||||
|
'balanced'
|
||||||
|
);
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.width).toBe(1920);
|
||||||
|
expect(parsed.height).toBe(1080);
|
||||||
|
expect(parsed.format).toBe('png');
|
||||||
|
expect(parsed.ocr_confidence).toBe(0.95);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should handle video metadata', () => {
|
||||||
|
const videoDeterministic: DeterministicFields = {
|
||||||
|
duration: 7200,
|
||||||
|
width: 1920,
|
||||||
|
height: 1080,
|
||||||
|
fps: 30,
|
||||||
|
aspect_ratio: '16:9',
|
||||||
|
bitrate: 8000,
|
||||||
|
video_codec: 'h264',
|
||||||
|
audio_codec: 'aac'
|
||||||
|
};
|
||||||
|
const result = FrontmatterGenerator.generate(
|
||||||
|
{ ...baseOptions, generator: 'fss-parse-video', profile: 'meeting' },
|
||||||
|
videoDeterministic,
|
||||||
|
undefined,
|
||||||
|
'balanced'
|
||||||
|
);
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.duration).toBe(7200);
|
||||||
|
expect(parsed.fps).toBe(30);
|
||||||
|
expect(parsed.aspect_ratio).toBe('16:9');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should handle presentation metadata', () => {
|
||||||
|
const presentationDeterministic: DeterministicFields = {
|
||||||
|
slide_count: 25,
|
||||||
|
total_slides: 25,
|
||||||
|
word_count: 3000,
|
||||||
|
has_images: true,
|
||||||
|
image_count: 15,
|
||||||
|
chart_count: 5,
|
||||||
|
has_speaker_notes: true,
|
||||||
|
author: 'Jane Smith'
|
||||||
|
};
|
||||||
|
const result = FrontmatterGenerator.generate(
|
||||||
|
{ ...baseOptions, generator: 'fss-parse-presentation', profile: 'technical' },
|
||||||
|
presentationDeterministic,
|
||||||
|
undefined,
|
||||||
|
'balanced'
|
||||||
|
);
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.slide_count).toBe(25);
|
||||||
|
expect(parsed.chart_count).toBe(5);
|
||||||
|
expect(parsed.has_speaker_notes).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should handle excel metadata', () => {
|
||||||
|
const excelDeterministic: DeterministicFields = {
|
||||||
|
sheet_count: 3,
|
||||||
|
row_count: 1000,
|
||||||
|
column_count: 20,
|
||||||
|
author: 'Data Analyst'
|
||||||
|
};
|
||||||
|
const result = FrontmatterGenerator.generate(
|
||||||
|
{ ...baseOptions, generator: 'fss-parse-excel' },
|
||||||
|
excelDeterministic,
|
||||||
|
undefined,
|
||||||
|
'balanced'
|
||||||
|
);
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.sheet_count).toBe(3);
|
||||||
|
expect(parsed.row_count).toBe(1000);
|
||||||
|
expect(parsed.column_count).toBe(20);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should handle diagram metadata', () => {
|
||||||
|
const diagramDeterministic: DeterministicFields = {
|
||||||
|
diagram_count: 5,
|
||||||
|
diagram_type: 'flowchart',
|
||||||
|
valid_diagrams: 4,
|
||||||
|
invalid_diagrams: 1,
|
||||||
|
node_count: 20,
|
||||||
|
edge_count: 25
|
||||||
|
};
|
||||||
|
const result = FrontmatterGenerator.generate(
|
||||||
|
{ ...baseOptions, generator: 'fss-parse-diagram', profile: 'schema' },
|
||||||
|
diagramDeterministic,
|
||||||
|
undefined,
|
||||||
|
'balanced'
|
||||||
|
);
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.diagram_count).toBe(5);
|
||||||
|
expect(parsed.valid_diagrams).toBe(4);
|
||||||
|
expect(parsed.node_count).toBe(20);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should handle data parser metadata', () => {
|
||||||
|
const dataDeterministic: DeterministicFields = {
|
||||||
|
record_count: 10000,
|
||||||
|
format_detected: 'json',
|
||||||
|
file_size: 5000000,
|
||||||
|
column_count: 15
|
||||||
|
};
|
||||||
|
const result = FrontmatterGenerator.generate(
|
||||||
|
{ ...baseOptions, generator: 'fss-parse-data' },
|
||||||
|
dataDeterministic,
|
||||||
|
undefined,
|
||||||
|
'balanced'
|
||||||
|
);
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.record_count).toBe(10000);
|
||||||
|
expect(parsed.format_detected).toBe('json');
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('edge cases', () => {
|
||||||
|
it('should handle empty deterministic object', () => {
|
||||||
|
const result = FrontmatterGenerator.generate(baseOptions, {});
|
||||||
|
expect(result).toMatch(/^---\n/);
|
||||||
|
expect(result).toMatch(/\n---$/);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should handle missing title with default', () => {
|
||||||
|
const options: FrontmatterOptions = {
|
||||||
|
generator: 'test',
|
||||||
|
version: '1.0.0',
|
||||||
|
title: ''
|
||||||
|
};
|
||||||
|
const result = FrontmatterGenerator.generate(options);
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.title).toBe('Untitled');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should handle arrays in deterministic fields', () => {
|
||||||
|
const deterministic: DeterministicFields = {
|
||||||
|
recipients: ['a@test.com', 'b@test.com']
|
||||||
|
};
|
||||||
|
const result = FrontmatterGenerator.generate(baseOptions, deterministic, undefined, 'complete');
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.recipients).toEqual(['a@test.com', 'b@test.com']);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should handle custom extraction confidence and quality', () => {
|
||||||
|
const options: FrontmatterOptions = {
|
||||||
|
...baseOptions,
|
||||||
|
extractionConfidence: 0.85,
|
||||||
|
contentQuality: 1.2
|
||||||
|
};
|
||||||
|
const result = FrontmatterGenerator.generate(options);
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.extraction_confidence).toBe(0.85);
|
||||||
|
expect(parsed.content_quality).toBe(1.2);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should handle zero values correctly', () => {
|
||||||
|
const deterministic: DeterministicFields = {
|
||||||
|
word_count: 0,
|
||||||
|
page_count: 0
|
||||||
|
};
|
||||||
|
const result = FrontmatterGenerator.generate(baseOptions, deterministic);
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.word_count).toBe(0);
|
||||||
|
expect(parsed.page_count).toBe(0);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should handle boolean false values', () => {
|
||||||
|
const deterministic: DeterministicFields = {
|
||||||
|
has_tables: false,
|
||||||
|
encrypted: false
|
||||||
|
};
|
||||||
|
const result = FrontmatterGenerator.generate(baseOptions, deterministic);
|
||||||
|
const yamlContent = result.replace(/^---\n/, '').replace(/\n---$/, '');
|
||||||
|
const parsed = yaml.load(yamlContent) as Record<string, any>;
|
||||||
|
|
||||||
|
expect(parsed.has_tables).toBe(false);
|
||||||
|
expect(parsed.encrypted).toBe(false);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
1
tests/prompts.test.d.ts
vendored
Normal file
1
tests/prompts.test.d.ts
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
export {};
|
||||||
84
tests/prompts.test.js
Normal file
84
tests/prompts.test.js
Normal file
File diff suppressed because one or more lines are too long
97
tests/prompts.test.ts
Normal file
97
tests/prompts.test.ts
Normal file
@ -0,0 +1,97 @@
|
|||||||
|
import {
|
||||||
|
getEnrichmentPrompt,
|
||||||
|
getSamplePromptForDocType
|
||||||
|
} from '../src';
|
||||||
|
|
||||||
|
describe('Prompts', () => {
|
||||||
|
describe('getEnrichmentPrompt()', () => {
|
||||||
|
it('should return a string prompt', () => {
|
||||||
|
const prompt = getEnrichmentPrompt('Test content');
|
||||||
|
expect(typeof prompt).toBe('string');
|
||||||
|
expect(prompt.length).toBeGreaterThan(0);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should include the content in the prompt', () => {
|
||||||
|
const content = 'This is my test document content';
|
||||||
|
const prompt = getEnrichmentPrompt(content);
|
||||||
|
expect(prompt).toContain(content);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should include document type in prompt', () => {
|
||||||
|
const prompt = getEnrichmentPrompt('Content', 'pdf');
|
||||||
|
expect(prompt).toContain('Document type: pdf');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should default to markdown doc type', () => {
|
||||||
|
const prompt = getEnrichmentPrompt('Content');
|
||||||
|
expect(prompt).toContain('Document type: markdown');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should include JSON structure requirements', () => {
|
||||||
|
const prompt = getEnrichmentPrompt('Content');
|
||||||
|
expect(prompt).toContain('"summary"');
|
||||||
|
expect(prompt).toContain('"tags"');
|
||||||
|
expect(prompt).toContain('"category"');
|
||||||
|
expect(prompt).toContain('"audience"');
|
||||||
|
expect(prompt).toContain('"doc_purpose"');
|
||||||
|
expect(prompt).toContain('"complexity"');
|
||||||
|
expect(prompt).toContain('"actionable"');
|
||||||
|
expect(prompt).toContain('"key_technologies"');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should include valid audience values', () => {
|
||||||
|
const prompt = getEnrichmentPrompt('Content');
|
||||||
|
expect(prompt).toContain('all | beginner | intermediate | expert');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should include valid doc_purpose values', () => {
|
||||||
|
const prompt = getEnrichmentPrompt('Content');
|
||||||
|
expect(prompt).toContain('reference | tutorial | troubleshooting | conceptual | guide | specification');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should include complexity scale', () => {
|
||||||
|
const prompt = getEnrichmentPrompt('Content');
|
||||||
|
expect(prompt).toContain('1-5');
|
||||||
|
expect(prompt).toContain('1=overview');
|
||||||
|
expect(prompt).toContain('5=deep implementation');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should include guidelines for tags', () => {
|
||||||
|
const prompt = getEnrichmentPrompt('Content');
|
||||||
|
expect(prompt).toContain('tags');
|
||||||
|
expect(prompt).toContain('SPECIFIC terms');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should request JSON-only response', () => {
|
||||||
|
const prompt = getEnrichmentPrompt('Content');
|
||||||
|
expect(prompt).toContain('valid JSON only');
|
||||||
|
expect(prompt).toContain('No explanation');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should wrap content with delimiters', () => {
|
||||||
|
const content = 'Test content';
|
||||||
|
const prompt = getEnrichmentPrompt(content);
|
||||||
|
expect(prompt).toContain('---\n' + content + '\n---');
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('getSamplePromptForDocType()', () => {
|
||||||
|
it('should return correct descriptions for known types', () => {
|
||||||
|
expect(getSamplePromptForDocType('pdf')).toBe('PDF document');
|
||||||
|
expect(getSamplePromptForDocType('word')).toBe('Word document');
|
||||||
|
expect(getSamplePromptForDocType('email')).toBe('Email message');
|
||||||
|
expect(getSamplePromptForDocType('image')).toBe('Image with OCR text');
|
||||||
|
expect(getSamplePromptForDocType('audio')).toBe('Audio transcript');
|
||||||
|
expect(getSamplePromptForDocType('video')).toBe('Video transcript');
|
||||||
|
expect(getSamplePromptForDocType('presentation')).toBe('Presentation slides');
|
||||||
|
expect(getSamplePromptForDocType('excel')).toBe('Spreadsheet data');
|
||||||
|
expect(getSamplePromptForDocType('markdown')).toBe('Markdown document');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should return "document" for unknown types', () => {
|
||||||
|
expect(getSamplePromptForDocType('unknown')).toBe('document');
|
||||||
|
expect(getSamplePromptForDocType('xyz')).toBe('document');
|
||||||
|
expect(getSamplePromptForDocType('')).toBe('document');
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
24
tsconfig.json
Normal file
24
tsconfig.json
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
{
|
||||||
|
"compilerOptions": {
|
||||||
|
"target": "ES2020",
|
||||||
|
"module": "commonjs",
|
||||||
|
"lib": ["ES2020"],
|
||||||
|
"declaration": true,
|
||||||
|
"strict": true,
|
||||||
|
"noImplicitAny": true,
|
||||||
|
"strictNullChecks": true,
|
||||||
|
"noImplicitThis": true,
|
||||||
|
"alwaysStrict": true,
|
||||||
|
"noUnusedLocals": false,
|
||||||
|
"noUnusedParameters": false,
|
||||||
|
"noImplicitReturns": true,
|
||||||
|
"noFallthroughCasesInSwitch": false,
|
||||||
|
"inlineSourceMap": true,
|
||||||
|
"inlineSources": true,
|
||||||
|
"experimentalDecorators": true,
|
||||||
|
"strictPropertyInitialization": false,
|
||||||
|
"outDir": "./dist",
|
||||||
|
"rootDir": "./src"
|
||||||
|
},
|
||||||
|
"exclude": ["node_modules", "dist", "tests"]
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user