parser-frontmatter-ts/IMPLEMENTATION_BLUEPRINT.md

507 lines
12 KiB
Markdown

# @bobai/frontmatter - Implementation Blueprint
**Package Location**: `/MASTERFOLDER/Tools/parsers/packages/bobai-frontmatter`
**Standard Reference**: `/MASTERFOLDER/KnowledgeBase/Standards/BOBAI_MARKDOWN_STANDARD_V1.1.md`
---
## Purpose
Create a shared npm package that all FSS parsers import for consistent BOBAI v1.1 frontmatter generation.
---
## Files to Create
```
bobai-frontmatter/
├── src/
│ ├── index.ts # Main exports
│ ├── generator.ts # FrontmatterGenerator class
│ ├── types.ts # TypeScript interfaces
│ ├── constants.ts # Enums, defaults
│ └── prompts.ts # LLM enrichment prompt templates
├── package.json
├── tsconfig.json
└── README.md
```
Note: a `src/llm/` directory is intentionally not part of the tree above; it is reserved for a future enhancement (Issue #3).
---
## 1. package.json
```json
{
"name": "@bobai/frontmatter",
"version": "1.1.0",
"description": "BOBAI Markdown Standard v1.1 frontmatter generator",
"main": "dist/index.js",
"types": "dist/index.d.ts",
"files": ["dist"],
"scripts": {
"build": "tsc",
"clean": "rm -rf dist",
"prepublishOnly": "npm run clean && npm run build"
},
"keywords": ["bobai", "frontmatter", "markdown", "yaml", "metadata"],
"author": "BobAI",
"license": "MIT",
"dependencies": {
"js-yaml": "^4.1.0"
},
"devDependencies": {
"@types/js-yaml": "^4.0.9",
"@types/node": "^20.0.0",
"typescript": "^5.0.0"
},
"engines": {
"node": ">=18.0.0"
}
}
```
---
## 2. tsconfig.json
```json
{
"compilerOptions": {
"target": "ES2020",
"module": "commonjs",
"lib": ["ES2020"],
"declaration": true,
"strict": true,
"noImplicitAny": true,
"strictNullChecks": true,
"noImplicitThis": true,
"alwaysStrict": true,
"noUnusedLocals": false,
"noUnusedParameters": false,
"noImplicitReturns": true,
"noFallthroughCasesInSwitch": false,
"inlineSourceMap": true,
"inlineSources": true,
"experimentalDecorators": true,
"strictPropertyInitialization": false,
"outDir": "./dist",
"rootDir": "./src"
},
"exclude": ["node_modules", "dist"]
}
```
---
## 3. src/types.ts
```typescript
/**
 * Controls how much frontmatter `FrontmatterGenerator.generate` emits:
 * `'none'` yields an empty string, `'balanced'` copies only the deterministic
 * fields named in `BALANCED_FIELDS`, and `'complete'` copies every
 * deterministic field.
 */
export type OutputMode = 'none' | 'balanced' | 'complete';
/** Reader skill level a document targets; written to the `audience` field. */
export type AudienceLevel = 'all' | 'beginner' | 'intermediate' | 'expert';
/** Document purpose classification; written to the `doc_purpose` field. */
export type DocPurpose =
| 'reference'
| 'tutorial'
| 'troubleshooting'
| 'conceptual'
| 'guide'
| 'specification';
/** Document profile; written to the `profile` frontmatter field. */
export type ProfileType =
| 'scraped'
| 'research'
| 'technical'
| 'code'
| 'data'
| 'changelog'
| 'legal'
| 'test'
| 'schema'
| 'troubleshoot'
| 'meeting'
| 'faq'
| 'config';
/** Core inputs supplied by the calling parser for every generated document. */
export interface FrontmatterOptions {
/** Written to the `generator` field. */
generator: string;
/** Written to the `version` field. */
version: string;
/** Written to the `title` field; the generator falls back to 'Untitled' when empty. */
title: string;
/** Written to `source_file` only when truthy. */
sourcePath?: string | null;
/** Falls back to `DEFAULTS.profile` when omitted. */
profile?: ProfileType;
/** Written to `extraction_confidence`; falls back to `DEFAULTS.extractionConfidence`. */
extractionConfidence?: number;
/** Written to `content_quality`; falls back to `DEFAULTS.contentQuality`. */
contentQuality?: number;
}
/**
 * Metadata computed by the parser itself. Besides the named counters, any
 * additional key is accepted through the index signature.
 */
export interface DeterministicFields {
word_count?: number;
page_count?: number;
character_count?: number;
// Open-ended: parsers may attach arbitrary extra fields.
[key: string]: any;
}
/**
 * Fields matching the JSON structure requested by `getEnrichmentPrompt`.
 * All are optional; the generator emits only the ones that are populated.
 */
export interface LLMEnrichment {
summary?: string;
tags?: string[];
category?: string;
audience?: AudienceLevel;
doc_purpose?: DocPurpose;
/** The enrichment prompt asks for a value from 1 to 5. */
complexity?: number;
actionable?: boolean;
key_technologies?: string[];
}
```
---
## 4. src/constants.ts
```typescript
import { AudienceLevel, DocPurpose, ProfileType } from './types';
/** Runtime list of every `AudienceLevel` literal. */
export const AUDIENCE_VALUES: AudienceLevel[] = [
'all', 'beginner', 'intermediate', 'expert'
];
/** Runtime list of every `DocPurpose` literal. */
export const DOC_PURPOSE_VALUES: DocPurpose[] = [
'reference', 'tutorial', 'troubleshooting', 'conceptual', 'guide', 'specification'
];
/** Runtime list of every `ProfileType` literal. */
export const PROFILE_VALUES: ProfileType[] = [
'scraped', 'research', 'technical', 'code', 'data', 'changelog',
'legal', 'test', 'schema', 'troubleshoot', 'meeting', 'faq', 'config'
];
/** Fallback values applied by the generator when the caller omits an option. */
export const DEFAULTS = {
profile: 'data' as ProfileType,
audience: 'all' as AudienceLevel,
extractionConfidence: 1.0,
// NOTE(review): 1.5 is higher than the 1.0 used for extractionConfidence —
// confirm the content_quality scale against the BOBAI standard.
contentQuality: 1.5,
complexity: 3
};
// Deterministic field names copied into the frontmatter in 'balanced' mode.
// 'complete' mode does not consult this list; it emits every deterministic field.
export const BALANCED_FIELDS = [
'word_count',
'page_count',
'has_tables',
'has_images',
'section_count',
'slide_count',
'duration_seconds',
'width',
'height',
'has_attachments',
'attachment_count',
'from',
'to',
'date',
'message_id',
'author',
'has_toc',
'has_tracked_changes',
'has_transcript',
'speaker_count'
];
```
---
## 5. src/prompts.ts
```typescript
/**
 * Assembles the LLM prompt that asks for enrichment metadata as JSON.
 *
 * The prompt consists of fixed instructions, the expected JSON structure,
 * per-field guidelines, and finally the document type and the document body
 * fenced between `---` lines.
 *
 * @param content - Document text embedded verbatim into the prompt.
 * @param docType - Label inserted after "Document type:"; defaults to 'markdown'.
 * @returns The complete prompt as a single newline-separated string.
 */
export function getEnrichmentPrompt(content: string, docType: string = 'markdown'): string {
  const intro = [
    'You are a document analyst preparing metadata for a RAG search system.',
    'Extract structured metadata to help users find and understand this document.',
    'Respond with this exact JSON structure:',
  ];

  const jsonShape = [
    '{',
    '"summary": "2-3 sentences: What is this document about? What problem does it solve?",',
    '"tags": ["5-10 SPECIFIC terms from this document for search - not generic words"],',
    '"category": "technical | research | code | data | changelog | troubleshooting | reference | tutorial",',
    '"audience": "all | beginner | intermediate | expert",',
    '"doc_purpose": "reference | tutorial | troubleshooting | conceptual | guide | specification",',
    '"complexity": 1-5,',
    '"actionable": true or false,',
    '"key_technologies": ["specific tools, languages, frameworks mentioned"]',
    '}',
  ];

  const guidelines = [
    'Guidelines:',
    '- tags: Extract SPECIFIC terms that appear in the document, not generic descriptions',
    '- category: Pick the single best match',
    '- audience: Default to "all" unless clearly targeted to specific skill level',
    '- complexity: 1=overview, 2=beginner guide, 3=intermediate, 4=advanced, 5=deep implementation',
    '- actionable: true if reader should DO something, false if just informational',
    '- key_technologies: Only include specific named technologies, not generic terms',
  ];

  // Interpolate through template literals so values are stringified exactly
  // as they would be inside one large template string.
  const documentPart = [
    `Document type: ${docType}`,
    '---',
    `${content}`,
    '---',
    'Respond with valid JSON only. No explanation or markdown formatting.',
  ];

  return [...intro, ...jsonShape, ...guidelines, ...documentPart].join('\n');
}
/**
 * Maps a document-type key to a short human-readable label.
 *
 * @param docType - Key such as 'pdf', 'word', 'email' or 'markdown'.
 * @returns The matching label, or 'document' when the key is not recognised.
 */
export function getSamplePromptForDocType(docType: string): string {
  const samples: Record<string, string> = {
    pdf: 'PDF document',
    word: 'Word document',
    email: 'Email message',
    image: 'Image with OCR text',
    audio: 'Audio transcript',
    video: 'Video transcript',
    presentation: 'Presentation slides',
    excel: 'Spreadsheet data',
    markdown: 'Markdown document'
  };
  // Own-property check: a bare `samples[docType]` lookup also resolves members
  // inherited from Object.prototype (e.g. 'toString', 'constructor',
  // '__proto__') and would hand back a function/object instead of a label.
  if (Object.prototype.hasOwnProperty.call(samples, docType)) {
    return samples[docType];
  }
  return 'document';
}
```
---
## 6. src/generator.ts
```typescript
import * as yaml from 'js-yaml';
import {
FrontmatterOptions,
DeterministicFields,
LLMEnrichment,
OutputMode
} from './types';
import { DEFAULTS, BALANCED_FIELDS } from './constants';
/**
 * Builds YAML frontmatter blocks from parser-supplied options, deterministic
 * metadata, and optional LLM enrichment.
 */
export class FrontmatterGenerator {
  /**
   * Renders a frontmatter block delimited by `---` lines.
   *
   * @param options - Core fields (generator, version, title, ...).
   * @param deterministic - Parser-computed metadata. In 'complete' mode every
   *   field is emitted (keys starting with `_` and null/undefined values are
   *   dropped); otherwise only the keys listed in `BALANCED_FIELDS` are copied.
   * @param enrichment - LLM-provided fields. When omitted, empty `summary`,
   *   `tags` and `category` placeholders are written instead.
   * @param mode - Output mode; 'none' returns an empty string.
   * @returns The frontmatter text, or '' when `mode` is 'none'.
   */
  static generate(
    options: FrontmatterOptions,
    deterministic: DeterministicFields = {},
    enrichment?: LLMEnrichment,
    mode: OutputMode = 'balanced'
  ): string {
    if (mode === 'none') {
      return '';
    }

    const frontmatter: Record<string, unknown> = {
      // Core required fields
      profile: options.profile || DEFAULTS.profile,
      created: new Date().toISOString(),
      generator: options.generator,
      version: options.version,
      title: options.title || 'Untitled',
      extraction_confidence: options.extractionConfidence ?? DEFAULTS.extractionConfidence,
      content_quality: options.contentQuality ?? DEFAULTS.contentQuality,
    };

    // Source file
    if (options.sourcePath) {
      frontmatter.source_file = options.sourcePath;
    }

    // Add deterministic fields based on mode
    if (mode === 'complete') {
      // Include all deterministic fields. Object.assign means a deterministic
      // key that matches a core field name (e.g. `title`) replaces that value.
      Object.assign(frontmatter, this.cleanObject(deterministic));
    } else {
      // Balanced mode - include only key fields
      for (const field of BALANCED_FIELDS) {
        if (deterministic[field] !== undefined && deterministic[field] !== null) {
          frontmatter[field] = deterministic[field];
        }
      }
    }

    // LLM enrichment fields (flat, not nested)
    if (enrichment) {
      if (enrichment.summary) frontmatter.summary = enrichment.summary;
      if (enrichment.tags?.length) frontmatter.tags = enrichment.tags;
      if (enrichment.category) frontmatter.category = enrichment.category;
      if (enrichment.audience) frontmatter.audience = enrichment.audience;
      if (enrichment.doc_purpose) frontmatter.doc_purpose = enrichment.doc_purpose;
      if (enrichment.complexity) frontmatter.complexity = enrichment.complexity;
      if (enrichment.actionable !== undefined) frontmatter.actionable = enrichment.actionable;
      if (enrichment.key_technologies?.length) {
        frontmatter.key_technologies = enrichment.key_technologies;
      }
    } else {
      // Placeholders for LLM enrichment
      frontmatter.summary = '';
      frontmatter.tags = [];
      frontmatter.category = '';
    }

    const yamlStr = yaml.dump(this.removeNulls(frontmatter), {
      indent: 2,
      lineWidth: -1,
      quotingType: "'",
      sortKeys: false
    });
    return `---\n${yamlStr}---`;
  }

  /**
   * Prepends generated frontmatter to `content`, separated by a blank line.
   *
   * @returns `content` unchanged when no frontmatter is produced (mode 'none').
   */
  static generateMarkdown(
    options: FrontmatterOptions,
    deterministic: DeterministicFields,
    content: string,
    enrichment?: LLMEnrichment,
    mode: OutputMode = 'balanced'
  ): string {
    const fm = this.generate(options, deterministic, enrichment, mode);
    if (!fm) return content;
    return `${fm}\n\n${content}`;
  }

  /**
   * Returns a copy of `obj` without keys that start with `_` and without
   * null/undefined values. Dates become ISO strings and nested plain objects
   * are cleaned recursively; arrays are copied by reference as-is.
   */
  private static cleanObject(obj: Record<string, unknown>): Record<string, unknown> {
    const result: Record<string, unknown> = {};
    for (const [k, v] of Object.entries(obj)) {
      if (k.startsWith('_')) continue;
      if (v === null || v === undefined) continue;
      if (v instanceof Date) {
        result[k] = v.toISOString();
      } else if (typeof v === 'object' && !Array.isArray(v)) {
        result[k] = this.cleanObject(v as Record<string, unknown>);
      } else {
        result[k] = v;
      }
    }
    return result;
  }

  /**
   * Recursively strips null/undefined entries from arrays and objects.
   *
   * Dates are converted to ISO strings first: a Date has no enumerable own
   * properties, so treating it as a generic object would rebuild it as `{}`
   * and the timestamp would be lost (this affected e.g. a Date in the
   * balanced-mode `date` field, which is copied without prior cleaning).
   */
  private static removeNulls(obj: unknown): unknown {
    if (obj instanceof Date) {
      return obj.toISOString();
    }
    if (Array.isArray(obj)) {
      return obj.filter(x => x != null).map(x => this.removeNulls(x));
    }
    if (obj && typeof obj === 'object') {
      const result: Record<string, unknown> = {};
      for (const [k, v] of Object.entries(obj)) {
        if (v != null) result[k] = this.removeNulls(v);
      }
      return result;
    }
    return obj;
  }
}
```
---
## 7. src/index.ts
```typescript
// Types — re-exported with `export type` because these names have no runtime
// value; a value-style re-export of a type fails under `isolatedModules` and
// single-file transpilers, which cannot tell that the names should be erased.
export type {
  OutputMode,
  AudienceLevel,
  DocPurpose,
  ProfileType,
  FrontmatterOptions,
  DeterministicFields,
  LLMEnrichment
} from './types';
// Constants
export {
  AUDIENCE_VALUES,
  DOC_PURPOSE_VALUES,
  PROFILE_VALUES,
  DEFAULTS,
  BALANCED_FIELDS
} from './constants';
// Generator
export { FrontmatterGenerator } from './generator';
// Prompts
export { getEnrichmentPrompt, getSamplePromptForDocType } from './prompts';
```
---
## 8. README.md
````markdown
# @bobai/frontmatter
BOBAI Markdown Standard v1.1 frontmatter generator for FSS parsers.
## Installation
```bash
npm install @bobai/frontmatter
```
## Usage
```typescript
import {
FrontmatterGenerator,
getEnrichmentPrompt,
OutputMode,
LLMEnrichment
} from '@bobai/frontmatter';
// Generate frontmatter
const markdown = FrontmatterGenerator.generateMarkdown(
{
generator: 'fss-parse-pdf',
version: '1.2.0',
title: 'My Document',
sourcePath: '/path/to/file.pdf'
},
{
word_count: 1234,
page_count: 8,
has_tables: true
},
content,
enrichment, // LLMEnrichment or undefined
'balanced' // OutputMode: 'none' | 'balanced' | 'complete'
);
// Get LLM enrichment prompt
const prompt = getEnrichmentPrompt(content, 'pdf');
// Send to your LLM and parse response as LLMEnrichment
```
## Output Modes
- `none` - No frontmatter, just content
- `balanced` (default) - Core fields + key deterministic fields + LLM enrichment
- `complete` - All fields including full metadata
## Reference
See BOBAI Markdown Standard v1.1 for field definitions.
````
---
## Testing
After implementation:
```bash
cd /MASTERFOLDER/Tools/parsers/packages/bobai-frontmatter
npm install
npm run build
```
Test by importing in a parser or creating a simple test script.
---
## Next Steps After Package Complete
1. Update each parser to import from this package
2. Remove duplicate frontmatter code from parsers
3. Consider publishing to npm registry (later)