# @bobai/frontmatter - Implementation Blueprint

**Package Location**: `/MASTERFOLDER/Tools/parsers/packages/bobai-frontmatter`

**Standard Reference**: `/MASTERFOLDER/KnowledgeBase/Standards/BOBAI_MARKDOWN_STANDARD_V1.1.md`

---

## Purpose

Create a shared npm package that all FSS parsers import for consistent BOBAI v1.1 frontmatter generation.

---

## Files to Create

```
bobai-frontmatter/
├── src/
│   ├── index.ts          # Main exports
│   ├── generator.ts      # FrontmatterGenerator class
│   ├── types.ts          # TypeScript interfaces
│   ├── constants.ts      # Enums, defaults
│   └── prompts.ts        # LLM enrichment prompt templates
├── package.json
├── tsconfig.json
└── README.md
```

Note: `src/llm/` directory is for future enhancement (Issue #3)

---

## 1. package.json

```json
{
  "name": "@bobai/frontmatter",
  "version": "1.1.0",
  "description": "BOBAI Markdown Standard v1.1 frontmatter generator",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
  "files": ["dist"],
  "scripts": {
    "build": "tsc",
    "clean": "rm -rf dist",
    "prepublishOnly": "npm run clean && npm run build"
  },
  "keywords": ["bobai", "frontmatter", "markdown", "yaml", "metadata"],
  "author": "BobAI",
  "license": "MIT",
  "dependencies": {
    "js-yaml": "^4.1.0"
  },
  "devDependencies": {
    "@types/js-yaml": "^4.0.9",
    "@types/node": "^20.0.0",
    "typescript": "^5.0.0"
  },
  "engines": {
    "node": ">=18.0.0"
  }
}
```

---

## 2. tsconfig.json

```json
{
  "compilerOptions": {
    "target": "ES2020",
    "module": "commonjs",
    "lib": ["ES2020"],
    "declaration": true,
    "strict": true,
    "noImplicitAny": true,
    "strictNullChecks": true,
    "noImplicitThis": true,
    "alwaysStrict": true,
    "noUnusedLocals": false,
    "noUnusedParameters": false,
    "noImplicitReturns": true,
    "noFallthroughCasesInSwitch": false,
    "inlineSourceMap": true,
    "inlineSources": true,
    "experimentalDecorators": true,
    "strictPropertyInitialization": false,
    "outDir": "./dist",
    "rootDir": "./src"
  },
  "exclude": ["node_modules", "dist"]
}
```

---

## 3. src/types.ts

```typescript
export type OutputMode = 'none' | 'balanced' | 'complete';

export type AudienceLevel = 'all' | 'beginner' | 'intermediate' | 'expert';

export type DocPurpose =
  | 'reference'
  | 'tutorial'
  | 'troubleshooting'
  | 'conceptual'
  | 'guide'
  | 'specification';

export type ProfileType =
  | 'scraped'
  | 'research'
  | 'technical'
  | 'code'
  | 'data'
  | 'changelog'
  | 'legal'
  | 'test'
  | 'schema'
  | 'troubleshoot'
  | 'meeting'
  | 'faq'
  | 'config';

export interface FrontmatterOptions {
  generator: string;
  version: string;
  title: string;
  sourcePath?: string | null;
  profile?: ProfileType;
  extractionConfidence?: number;
  contentQuality?: number;
}

export interface DeterministicFields {
  word_count?: number;
  page_count?: number;
  character_count?: number;
  [key: string]: any;
}

export interface LLMEnrichment {
  summary?: string;
  tags?: string[];
  category?: string;
  audience?: AudienceLevel;
  doc_purpose?: DocPurpose;
  complexity?: number;
  actionable?: boolean;
  key_technologies?: string[];
}
```

---

## 4. src/constants.ts

```typescript
import { AudienceLevel, DocPurpose, ProfileType } from './types';

export const AUDIENCE_VALUES: AudienceLevel[] = [
  'all', 'beginner', 'intermediate', 'expert'
];

export const DOC_PURPOSE_VALUES: DocPurpose[] = [
  'reference', 'tutorial', 'troubleshooting', 'conceptual', 'guide', 'specification'
];

export const PROFILE_VALUES: ProfileType[] = [
  'scraped', 'research', 'technical', 'code', 'data', 'changelog', 'legal',
  'test', 'schema', 'troubleshoot', 'meeting', 'faq', 'config'
];

export const DEFAULTS = {
  profile: 'data' as ProfileType,
  audience: 'all' as AudienceLevel,
  extractionConfidence: 1.0,
  contentQuality: 1.5,
  complexity: 3
};

// Fields to include in balanced mode (not complete)
export const BALANCED_FIELDS = [
  'word_count',
  'page_count',
  'has_tables',
  'has_images',
  'section_count',
  'slide_count',
  'duration_seconds',
  'width',
  'height',
  'has_attachments',
  'attachment_count',
  'from',
  'to',
  'date',
  'message_id',
  'author',
  'has_toc',
  'has_tracked_changes',
  'has_transcript',
  'speaker_count'
];
```

---

## 5. src/prompts.ts

```typescript
export function getEnrichmentPrompt(content: string, docType: string = 'markdown'): string {
  return `You are a document analyst preparing metadata for a RAG search system.

Extract structured metadata to help users find and understand this document.

Respond with this exact JSON structure:
{
  "summary": "2-3 sentences: What is this document about? What problem does it solve?",
  "tags": ["5-10 SPECIFIC terms from this document for search - not generic words"],
  "category": "technical | research | code | data | changelog | troubleshooting | reference | tutorial",
  "audience": "all | beginner | intermediate | expert",
  "doc_purpose": "reference | tutorial | troubleshooting | conceptual | guide | specification",
  "complexity": 1-5,
  "actionable": true or false,
  "key_technologies": ["specific tools, languages, frameworks mentioned"]
}

Guidelines:
- tags: Extract SPECIFIC terms that appear in the document, not generic descriptions
- category: Pick the single best match
- audience: Default to "all" unless clearly targeted to specific skill level
- complexity: 1=overview, 2=beginner guide, 3=intermediate, 4=advanced, 5=deep implementation
- actionable: true if reader should DO something, false if just informational
- key_technologies: Only include specific named technologies, not generic terms

Document type: ${docType}

---
${content}
---

Respond with valid JSON only. No explanation or markdown formatting.`;
}

export function getSamplePromptForDocType(docType: string): string {
  const samples: Record<string, string> = {
    pdf: 'PDF document',
    word: 'Word document',
    email: 'Email message',
    image: 'Image with OCR text',
    audio: 'Audio transcript',
    video: 'Video transcript',
    presentation: 'Presentation slides',
    excel: 'Spreadsheet data',
    markdown: 'Markdown document'
  };
  return samples[docType] || 'document';
}
```

---

## 6. src/generator.ts

```typescript
import * as yaml from 'js-yaml';
import {
  FrontmatterOptions,
  DeterministicFields,
  LLMEnrichment,
  OutputMode
} from './types';
import { DEFAULTS, BALANCED_FIELDS } from './constants';

export class FrontmatterGenerator {
  static generate(
    options: FrontmatterOptions,
    deterministic: DeterministicFields = {},
    enrichment?: LLMEnrichment,
    mode: OutputMode = 'balanced'
  ): string {
    if (mode === 'none') {
      return '';
    }

    const frontmatter: Record<string, any> = {
      // Core required fields
      profile: options.profile || DEFAULTS.profile,
      created: new Date().toISOString(),
      generator: options.generator,
      version: options.version,
      title: options.title || 'Untitled',
      extraction_confidence: options.extractionConfidence ?? DEFAULTS.extractionConfidence,
      content_quality: options.contentQuality ?? DEFAULTS.contentQuality,
    };

    // Source file
    if (options.sourcePath) {
      frontmatter.source_file = options.sourcePath;
    }

    // Add deterministic fields based on mode
    if (mode === 'complete') {
      // Include all deterministic fields
      const cleaned = this.cleanObject(deterministic);
      Object.assign(frontmatter, cleaned);
    } else {
      // Balanced mode - include only key fields
      for (const field of BALANCED_FIELDS) {
        if (deterministic[field] !== undefined && deterministic[field] !== null) {
          frontmatter[field] = deterministic[field];
        }
      }
    }

    // LLM enrichment fields (flat, not nested)
    if (enrichment) {
      if (enrichment.summary) frontmatter.summary = enrichment.summary;
      if (enrichment.tags?.length) frontmatter.tags = enrichment.tags;
      if (enrichment.category) frontmatter.category = enrichment.category;
      if (enrichment.audience) frontmatter.audience = enrichment.audience;
      if (enrichment.doc_purpose) frontmatter.doc_purpose = enrichment.doc_purpose;
      if (enrichment.complexity) frontmatter.complexity = enrichment.complexity;
      if (enrichment.actionable !== undefined) frontmatter.actionable = enrichment.actionable;
      if (enrichment.key_technologies?.length) {
        frontmatter.key_technologies = enrichment.key_technologies;
      }
    } else {
      // Placeholders for LLM enrichment
      frontmatter.summary = '';
      frontmatter.tags = [];
      frontmatter.category = '';
    }

    const yamlStr = yaml.dump(this.removeNulls(frontmatter), {
      indent: 2,
      lineWidth: -1,
      quotingType: "'",
      sortKeys: false
    });

    return `---\n${yamlStr}---`;
  }

  static generateMarkdown(
    options: FrontmatterOptions,
    deterministic: DeterministicFields,
    content: string,
    enrichment?: LLMEnrichment,
    mode: OutputMode = 'balanced'
  ): string {
    const fm = this.generate(options, deterministic, enrichment, mode);
    if (!fm) return content;
    return `${fm}\n\n${content}`;
  }

  private static cleanObject(obj: any): any {
    const result: any = {};
    for (const [k, v] of Object.entries(obj)) {
      if (k.startsWith('_')) continue;
      if (v === null || v === undefined) continue;
      if (v instanceof Date) {
        result[k] = v.toISOString();
      } else if (v && typeof v === 'object' && !Array.isArray(v)) {
        result[k] = this.cleanObject(v);
      } else {
        result[k] = v;
      }
    }
    return result;
  }

  private static removeNulls(obj: any): any {
    if (Array.isArray(obj)) {
      return obj.filter(x => x != null).map(x => this.removeNulls(x));
    }
    if (obj && typeof obj === 'object') {
      const result: any = {};
      for (const [k, v] of Object.entries(obj)) {
        if (v != null) result[k] = this.removeNulls(v);
      }
      return result;
    }
    return obj;
  }
}
```

---

## 7. src/index.ts

```typescript
// Types
export {
  OutputMode,
  AudienceLevel,
  DocPurpose,
  ProfileType,
  FrontmatterOptions,
  DeterministicFields,
  LLMEnrichment
} from './types';

// Constants
export {
  AUDIENCE_VALUES,
  DOC_PURPOSE_VALUES,
  PROFILE_VALUES,
  DEFAULTS,
  BALANCED_FIELDS
} from './constants';

// Generator
export { FrontmatterGenerator } from './generator';

// Prompts
export { getEnrichmentPrompt, getSamplePromptForDocType } from './prompts';
```

---

## 8. README.md

````markdown
# @bobai/frontmatter

BOBAI Markdown Standard v1.1 frontmatter generator for FSS parsers.

## Installation

```bash
npm install @bobai/frontmatter
```

## Usage

```typescript
import {
  FrontmatterGenerator,
  getEnrichmentPrompt,
  OutputMode,
  LLMEnrichment
} from '@bobai/frontmatter';

// Generate frontmatter
const markdown = FrontmatterGenerator.generateMarkdown(
  {
    generator: 'fss-parse-pdf',
    version: '1.2.0',
    title: 'My Document',
    sourcePath: '/path/to/file.pdf'
  },
  {
    word_count: 1234,
    page_count: 8,
    has_tables: true
  },
  content,
  enrichment,  // LLMEnrichment or undefined
  'balanced'   // OutputMode: 'none' | 'balanced' | 'complete'
);

// Get LLM enrichment prompt
const prompt = getEnrichmentPrompt(content, 'pdf');
// Send to your LLM and parse response as LLMEnrichment
```

## Output Modes

- `none` - No frontmatter, just content
- `balanced` (default) - Core fields + key deterministic + LLM enrichment
- `complete` - All fields including full metadata

## Reference

See BOBAI Markdown Standard v1.1 for field definitions.
````

---

## Testing

After implementation:

```bash
cd /MASTERFOLDER/Tools/parsers/packages/bobai-frontmatter
npm install
npm run build
```

Test by importing in a parser or creating a simple test script.

---

## Next Steps After Package Complete

1. Update each parser to import from this package
2. Remove duplicate frontmatter code from parsers
3. Consider publishing to npm registry (later)