# @bobai/frontmatter - Implementation Blueprint
|
|
|
|
**Package Location**: `/MASTERFOLDER/Tools/parsers/packages/bobai-frontmatter`
|
|
**Standard Reference**: `/MASTERFOLDER/KnowledgeBase/Standards/BOBAI_MARKDOWN_STANDARD_V1.1.md`
|
|
|
|
---
|
|
|
|
## Purpose
|
|
|
|
Create a shared npm package that all FSS parsers import for consistent BOBAI v1.1 frontmatter generation.
|
|
|
|
---
|
|
|
|
## Files to Create
|
|
|
|
```
|
|
bobai-frontmatter/
|
|
├── src/
|
|
│ ├── index.ts # Main exports
|
|
│ ├── generator.ts # FrontmatterGenerator class
|
|
│ ├── types.ts # TypeScript interfaces
|
|
│ ├── constants.ts # Enums, defaults
|
|
│ └── prompts.ts # LLM enrichment prompt templates
|
|
├── package.json
|
|
├── tsconfig.json
|
|
└── README.md
|
|
```
|
|
|
|
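Note: a `src/llm/` directory is reserved for a future enhancement (Issue #3). It is intentionally absent from the tree above and should not be created as part of this work.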
|
|
|
|
---
|
|
|
|
## 1. package.json
|
|
|
|
```json
|
|
{
|
|
"name": "@bobai/frontmatter",
|
|
"version": "1.1.0",
|
|
"description": "BOBAI Markdown Standard v1.1 frontmatter generator",
|
|
"main": "dist/index.js",
|
|
"types": "dist/index.d.ts",
|
|
"files": ["dist"],
|
|
"scripts": {
|
|
"build": "tsc",
|
|
"clean": "rm -rf dist",
|
|
"prepublishOnly": "npm run clean && npm run build"
|
|
},
|
|
"keywords": ["bobai", "frontmatter", "markdown", "yaml", "metadata"],
|
|
"author": "BobAI",
|
|
"license": "MIT",
|
|
"dependencies": {
|
|
"js-yaml": "^4.1.0"
|
|
},
|
|
"devDependencies": {
|
|
"@types/js-yaml": "^4.0.9",
|
|
"@types/node": "^20.0.0",
|
|
"typescript": "^5.0.0"
|
|
},
|
|
"engines": {
|
|
"node": ">=18.0.0"
|
|
}
|
|
}
|
|
```
|
|
|
|
---
|
|
|
|
## 2. tsconfig.json
|
|
|
|
```json
|
|
{
|
|
"compilerOptions": {
|
|
"target": "ES2020",
|
|
"module": "commonjs",
|
|
"lib": ["ES2020"],
|
|
"declaration": true,
|
|
"strict": true,
|
|
"noImplicitAny": true,
|
|
"strictNullChecks": true,
|
|
"noImplicitThis": true,
|
|
"alwaysStrict": true,
|
|
"noUnusedLocals": false,
|
|
"noUnusedParameters": false,
|
|
"noImplicitReturns": true,
|
|
"noFallthroughCasesInSwitch": false,
|
|
"inlineSourceMap": true,
|
|
"inlineSources": true,
|
|
"experimentalDecorators": true,
|
|
"strictPropertyInitialization": false,
|
|
"outDir": "./dist",
|
|
"rootDir": "./src"
|
|
},
|
|
"exclude": ["node_modules", "dist"]
|
|
}
|
|
```
|
|
|
|
---
|
|
|
|
## 3. src/types.ts
|
|
|
|
```typescript
|
|
/**
 * How much frontmatter the generator emits.
 * - `none`: no frontmatter at all (the generator returns an empty string).
 * - `balanced`: core fields plus only the deterministic fields named in `BALANCED_FIELDS`.
 * - `complete`: core fields plus every deterministic field supplied by the caller.
 */
export type OutputMode = 'none' | 'balanced' | 'complete';
|
|
|
|
/**
 * Skill level a document is aimed at. The enrichment prompt asks the LLM to
 * answer `all` unless the document clearly targets a specific level.
 */
export type AudienceLevel = 'all' | 'beginner' | 'intermediate' | 'expert';
|
|
|
|
/**
 * What a document is for. These are the same values the enrichment prompt
 * offers the LLM for its `doc_purpose` answer.
 */
export type DocPurpose =
  'reference' | 'tutorial' | 'troubleshooting' | 'conceptual' | 'guide' | 'specification';
|
|
|
|
/**
 * Document profile written to the `profile` frontmatter key.
 * The generator falls back to `DEFAULTS.profile` when the caller supplies none.
 */
export type ProfileType =
  'scraped' | 'research' | 'technical' | 'code' | 'data' |
  'changelog' | 'legal' | 'test' | 'schema' | 'troubleshoot' |
  'meeting' | 'faq' | 'config';
|
|
|
|
/**
 * Caller-supplied core metadata for one generated document. These values feed
 * the required frontmatter keys that are emitted in every mode except `none`.
 */
export interface FrontmatterOptions {
  /** Written verbatim to the `generator` key (e.g. `'fss-parse-pdf'`). */
  generator: string;
  /** Written verbatim to the `version` key — presumably the producing tool's version; confirm against the standard. */
  version: string;
  /** Document title; the generator substitutes `'Untitled'` when this is empty. */
  title: string;
  /** Emitted as `source_file` only when truthy; `null`, `undefined` and `''` are omitted. */
  sourcePath?: string | null;
  /** Falls back to `DEFAULTS.profile` when omitted. */
  profile?: ProfileType;
  /** Written to `extraction_confidence`; falls back to `DEFAULTS.extractionConfidence` when null/undefined. */
  extractionConfidence?: number;
  /** Written to `content_quality`; falls back to `DEFAULTS.contentQuality` when null/undefined. */
  contentQuality?: number;
}
|
|
|
|
/**
 * Metadata a parser computes directly from the source file (no LLM involved).
 *
 * Any key is accepted. In `balanced` mode only keys listed in `BALANCED_FIELDS`
 * are emitted; in `complete` mode every key is emitted except those that start
 * with `_` or hold `null`/`undefined`.
 */
export interface DeterministicFields {
  word_count?: number;
  page_count?: number;
  character_count?: number;
  // Open-ended on purpose: each parser contributes its own format-specific keys.
  [key: string]: any;
}
|
|
|
|
/**
 * Metadata produced by an LLM from the enrichment prompt. Every field is
 * optional; the generator copies a field into the frontmatter only when it is
 * present and non-empty, and writes the fields flat (not nested).
 */
export interface LLMEnrichment {
  /** Short description of the document (the prompt asks for 2-3 sentences). */
  summary?: string;
  /** Search terms (the prompt asks for 5-10 specific terms). */
  tags?: string[];
  /** Free-form category string; the prompt suggests a fixed list but the type does not enforce it. */
  category?: string;
  audience?: AudienceLevel;
  doc_purpose?: DocPurpose;
  /** The prompt asks for 1-5 (1 = overview, 5 = deep implementation). */
  complexity?: number;
  /** True when the reader is expected to act on the document; emitted whenever it is not `undefined`. */
  actionable?: boolean;
  /** Named tools, languages or frameworks mentioned in the document. */
  key_technologies?: string[];
}
|
|
```
|
|
|
|
---
|
|
|
|
## 4. src/constants.ts
|
|
|
|
```typescript
|
|
import { AudienceLevel, DocPurpose, ProfileType } from './types';
|
|
|
|
/** Every allowed `AudienceLevel`, as a runtime list (e.g. for validating LLM output). */
export const AUDIENCE_VALUES: AudienceLevel[] = [
  'all',
  'beginner',
  'intermediate',
  'expert',
];
|
|
|
|
/** Every allowed `DocPurpose`, as a runtime list (e.g. for validating LLM output). */
export const DOC_PURPOSE_VALUES: DocPurpose[] = [
  'reference',
  'tutorial',
  'troubleshooting',
  'conceptual',
  'guide',
  'specification',
];
|
|
|
|
/** Every allowed `ProfileType`, as a runtime list. */
export const PROFILE_VALUES: ProfileType[] = [
  'scraped',
  'research',
  'technical',
  'code',
  'data',
  'changelog',
  'legal',
  'test',
  'schema',
  'troubleshoot',
  'meeting',
  'faq',
  'config',
];
|
|
|
|
/**
 * Fallback values the generator applies when the caller omits the
 * corresponding option.
 */
export const DEFAULTS: {
  profile: ProfileType;
  audience: AudienceLevel;
  extractionConfidence: number;
  contentQuality: number;
  complexity: number;
} = {
  profile: 'data',
  audience: 'all',
  extractionConfidence: 1.0,
  // NOTE(review): 1.5 sits above the 1.0 used for extractionConfidence; if
  // content_quality is a 0-1 score in the standard this is out of range — confirm.
  contentQuality: 1.5,
  complexity: 3,
};
|
|
|
|
/**
 * Deterministic field names that are emitted in `balanced` mode.
 * `complete` mode ignores this list and emits every deterministic field.
 */
export const BALANCED_FIELDS = [
  'word_count', 'page_count', 'has_tables', 'has_images',
  'section_count', 'slide_count', 'duration_seconds',
  'width', 'height',
  'has_attachments', 'attachment_count',
  'from', 'to', 'date', 'message_id',
  'author', 'has_toc', 'has_tracked_changes',
  'has_transcript', 'speaker_count',
];
|
|
```
|
|
|
|
---
|
|
|
|
## 5. src/prompts.ts
|
|
|
|
```typescript
|
|
/**
 * Builds the prompt that asks an LLM for document metadata as a JSON object.
 *
 * @param content Document text; embedded verbatim between two `---` lines.
 * @param docType Label inserted after "Document type:"; defaults to `'markdown'`.
 * @returns The full prompt text.
 */
export function getEnrichmentPrompt(content: string, docType: string = 'markdown'): string {
  const promptLines = [
    'You are a document analyst preparing metadata for a RAG search system.',
    'Extract structured metadata to help users find and understand this document.',
    '',
    'Respond with this exact JSON structure:',
    '',
    '{',
    '"summary": "2-3 sentences: What is this document about? What problem does it solve?",',
    '"tags": ["5-10 SPECIFIC terms from this document for search - not generic words"],',
    '"category": "technical | research | code | data | changelog | troubleshooting | reference | tutorial",',
    '"audience": "all | beginner | intermediate | expert",',
    '"doc_purpose": "reference | tutorial | troubleshooting | conceptual | guide | specification",',
    '"complexity": 1-5,',
    '"actionable": true or false,',
    '"key_technologies": ["specific tools, languages, frameworks mentioned"]',
    '}',
    '',
    'Guidelines:',
    '- tags: Extract SPECIFIC terms that appear in the document, not generic descriptions',
    '- category: Pick the single best match',
    '- audience: Default to "all" unless clearly targeted to specific skill level',
    '- complexity: 1=overview, 2=beginner guide, 3=intermediate, 4=advanced, 5=deep implementation',
    '- actionable: true if reader should DO something, false if just informational',
    '- key_technologies: Only include specific named technologies, not generic terms',
    '',
    `Document type: ${docType}`,
    '',
    '---',
    `${content}`,
    '---',
    '',
    'Respond with valid JSON only. No explanation or markdown formatting.',
  ];
  return promptLines.join('\n');
}
|
|
|
|
/**
 * Maps a parser document-type key to a short human-readable label.
 *
 * @param docType Key such as `'pdf'`, `'word'` or `'email'`.
 * @returns The matching label, or `'document'` for any unrecognised key.
 */
export function getSamplePromptForDocType(docType: string): string {
  // A Map is used instead of a plain object so that keys inherited from
  // Object.prototype ('toString', 'constructor', ...) cannot match and leak a
  // non-string value out of a function that promises a string.
  const samples = new Map<string, string>([
    ['pdf', 'PDF document'],
    ['word', 'Word document'],
    ['email', 'Email message'],
    ['image', 'Image with OCR text'],
    ['audio', 'Audio transcript'],
    ['video', 'Video transcript'],
    ['presentation', 'Presentation slides'],
    ['excel', 'Spreadsheet data'],
    ['markdown', 'Markdown document'],
  ]);
  return samples.get(docType) ?? 'document';
}
|
|
```
|
|
|
|
---
|
|
|
|
## 6. src/generator.ts
|
|
|
|
```typescript
|
|
import * as yaml from 'js-yaml';
|
|
import {
|
|
FrontmatterOptions,
|
|
DeterministicFields,
|
|
LLMEnrichment,
|
|
OutputMode
|
|
} from './types';
|
|
import { DEFAULTS, BALANCED_FIELDS } from './constants';
|
|
|
|
/**
 * Builds YAML frontmatter blocks from caller options, parser-computed
 * (deterministic) fields and optional LLM enrichment.
 */
export class FrontmatterGenerator {
  /**
   * Produces a `---`-delimited YAML frontmatter block.
   *
   * @param options Core metadata; always emitted unless `mode` is `'none'`.
   * @param deterministic Parser-computed fields. `'balanced'` emits only keys
   *   listed in `BALANCED_FIELDS`; `'complete'` emits every key that does not
   *   start with `_` and is not null/undefined.
   * @param enrichment LLM-produced fields. When omitted, empty `summary`,
   *   `tags` and `category` placeholders are written instead.
   * @param mode Output mode; defaults to `'balanced'`.
   * @returns The frontmatter block (no trailing newline after the closing
   *   `---`), or `''` when `mode` is `'none'`.
   */
  static generate(
    options: FrontmatterOptions,
    deterministic: DeterministicFields = {},
    enrichment?: LLMEnrichment,
    mode: OutputMode = 'balanced'
  ): string {
    if (mode === 'none') {
      return '';
    }

    // Core required fields. Insertion order is the emitted key order
    // because yaml.dump is called with sortKeys: false.
    const frontmatter: Record<string, unknown> = {
      profile: options.profile || DEFAULTS.profile,
      created: new Date().toISOString(),
      generator: options.generator,
      version: options.version,
      title: options.title || 'Untitled',
      extraction_confidence: options.extractionConfidence ?? DEFAULTS.extractionConfidence,
      content_quality: options.contentQuality ?? DEFAULTS.contentQuality,
    };

    // Source file
    if (options.sourcePath) {
      frontmatter.source_file = options.sourcePath;
    }

    // Deterministic fields, filtered by mode
    if (mode === 'complete') {
      // NOTE(review): a deterministic key that collides with a core key
      // (e.g. `title`, `version`, `created`) overwrites it here — confirm
      // that this is intended.
      Object.assign(frontmatter, this.cleanObject(deterministic));
    } else {
      for (const field of BALANCED_FIELDS) {
        const value: unknown = deterministic[field];
        if (value !== undefined && value !== null) {
          frontmatter[field] = value;
        }
      }
    }

    // LLM enrichment fields (flat, not nested)
    if (enrichment) {
      if (enrichment.summary) frontmatter.summary = enrichment.summary;
      if (enrichment.tags?.length) frontmatter.tags = enrichment.tags;
      if (enrichment.category) frontmatter.category = enrichment.category;
      if (enrichment.audience) frontmatter.audience = enrichment.audience;
      if (enrichment.doc_purpose) frontmatter.doc_purpose = enrichment.doc_purpose;
      if (enrichment.complexity) frontmatter.complexity = enrichment.complexity;
      // `false` is meaningful for this field, so test for undefined rather than truthiness.
      if (enrichment.actionable !== undefined) frontmatter.actionable = enrichment.actionable;
      if (enrichment.key_technologies?.length) {
        frontmatter.key_technologies = enrichment.key_technologies;
      }
    } else {
      // Placeholders for LLM enrichment
      frontmatter.summary = '';
      frontmatter.tags = [];
      frontmatter.category = '';
    }

    const yamlStr = yaml.dump(this.removeNulls(frontmatter), {
      indent: 2,
      lineWidth: -1, // -1 disables line folding
      quotingType: "'",
      sortKeys: false
    });

    // yaml.dump output ends with a newline, so the closing fence follows directly.
    return `---\n${yamlStr}---`;
  }

  /**
   * Prepends generated frontmatter to `content`.
   *
   * @returns `content` unchanged when no frontmatter is produced (mode
   *   `'none'`); otherwise the frontmatter, a blank line, then `content`.
   */
  static generateMarkdown(
    options: FrontmatterOptions,
    deterministic: DeterministicFields,
    content: string,
    enrichment?: LLMEnrichment,
    mode: OutputMode = 'balanced'
  ): string {
    const fm = this.generate(options, deterministic, enrichment, mode);
    if (!fm) return content;
    return `${fm}\n\n${content}`;
  }

  /**
   * Returns a copy of `obj` without `_`-prefixed keys and without
   * null/undefined values. Dates become ISO-8601 strings and nested plain
   * objects are cleaned recursively; arrays are copied by reference as-is.
   */
  private static cleanObject(obj: Record<string, unknown>): Record<string, unknown> {
    const result: Record<string, unknown> = {};
    for (const [k, v] of Object.entries(obj)) {
      if (k.startsWith('_')) continue;
      if (v === null || v === undefined) continue;
      if (v instanceof Date) {
        result[k] = v.toISOString();
      } else if (typeof v === 'object' && !Array.isArray(v)) {
        result[k] = this.cleanObject(v as Record<string, unknown>);
      } else {
        result[k] = v;
      }
    }
    return result;
  }

  /**
   * Recursively drops null/undefined entries from arrays and objects.
   *
   * Dates are converted to ISO-8601 strings first. Without that step a Date
   * would be treated as a plain object, and because it has no own enumerable
   * properties it would be emitted as `{}` — which is what happened to Date
   * values in balanced mode and inside arrays.
   */
  private static removeNulls(obj: unknown): unknown {
    if (obj instanceof Date) {
      return obj.toISOString();
    }
    if (Array.isArray(obj)) {
      return obj.filter(x => x != null).map(x => this.removeNulls(x));
    }
    if (obj && typeof obj === 'object') {
      const result: Record<string, unknown> = {};
      for (const [k, v] of Object.entries(obj)) {
        if (v != null) result[k] = this.removeNulls(v);
      }
      return result;
    }
    return obj;
  }
}
|
|
```
|
|
|
|
---
|
|
|
|
## 7. src/index.ts
|
|
|
|
```typescript
|
|
// Types
|
|
export {
|
|
OutputMode,
|
|
AudienceLevel,
|
|
DocPurpose,
|
|
ProfileType,
|
|
FrontmatterOptions,
|
|
DeterministicFields,
|
|
LLMEnrichment
|
|
} from './types';
|
|
|
|
// Constants
|
|
export {
|
|
AUDIENCE_VALUES,
|
|
DOC_PURPOSE_VALUES,
|
|
PROFILE_VALUES,
|
|
DEFAULTS,
|
|
BALANCED_FIELDS
|
|
} from './constants';
|
|
|
|
// Generator
|
|
export { FrontmatterGenerator } from './generator';
|
|
|
|
// Prompts
|
|
export { getEnrichmentPrompt, getSamplePromptForDocType } from './prompts';
|
|
```
|
|
|
|
---
|
|
|
|
## 8. README.md
|
|
|
|
````markdown
# @bobai/frontmatter

BOBAI Markdown Standard v1.1 frontmatter generator for FSS parsers.

## Installation

```bash
npm install @bobai/frontmatter
```

## Usage

```typescript
import {
  FrontmatterGenerator,
  getEnrichmentPrompt,
  OutputMode,
  LLMEnrichment
} from '@bobai/frontmatter';

// Generate frontmatter
const markdown = FrontmatterGenerator.generateMarkdown(
  {
    generator: 'fss-parse-pdf',
    version: '1.2.0',
    title: 'My Document',
    sourcePath: '/path/to/file.pdf'
  },
  {
    word_count: 1234,
    page_count: 8,
    has_tables: true
  },
  content,
  enrichment, // LLMEnrichment or undefined
  'balanced' // OutputMode: 'none' | 'balanced' | 'complete'
);

// Get LLM enrichment prompt
const prompt = getEnrichmentPrompt(content, 'pdf');
// Send to your LLM and parse response as LLMEnrichment
```

## Output Modes

- `none` - No frontmatter, just content
- `balanced` (default) - Core fields + key deterministic fields + LLM enrichment
- `complete` - All fields including full metadata

## Reference

See BOBAI Markdown Standard v1.1 for field definitions.
````
|
|
|
|
---
|
|
|
|
## Testing
|
|
|
|
After implementation:
|
|
|
|
```bash
|
|
cd /MASTERFOLDER/Tools/parsers/packages/bobai-frontmatter
|
|
npm install
|
|
npm run build
|
|
```
|
|
|
|
Test by importing in a parser or creating a simple test script.
|
|
|
|
---
|
|
|
|
## Next Steps After Package Complete
|
|
|
|
1. Update each parser to import from this package
|
|
2. Remove duplicate frontmatter code from parsers
|
|
3. Consider publishing to npm registry (later)
|