
# @bobai/frontmatter - Implementation Blueprint

Package Location: /MASTERFOLDER/Tools/parsers/packages/bobai-frontmatter
Standard Reference: /MASTERFOLDER/KnowledgeBase/Standards/BOBAI_MARKDOWN_STANDARD_V1.1.md


## Purpose

Create a shared npm package that all FSS parsers import for consistent BOBAI v1.1 frontmatter generation.


## Files to Create

bobai-frontmatter/
├── src/
│   ├── index.ts              # Main exports
│   ├── generator.ts          # FrontmatterGenerator class
│   ├── types.ts              # TypeScript interfaces
│   ├── constants.ts          # Enums, defaults
│   └── prompts.ts            # LLM enrichment prompt templates
├── package.json
├── tsconfig.json
└── README.md

Note: the src/llm/ directory is reserved for a future enhancement (Issue #3).


## 1. package.json

{
  "name": "@bobai/frontmatter",
  "version": "1.1.0",
  "description": "BOBAI Markdown Standard v1.1 frontmatter generator",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
  "files": ["dist"],
  "scripts": {
    "build": "tsc",
    "clean": "rm -rf dist",
    "prepublishOnly": "npm run clean && npm run build"
  },
  "keywords": ["bobai", "frontmatter", "markdown", "yaml", "metadata"],
  "author": "BobAI",
  "license": "MIT",
  "dependencies": {
    "js-yaml": "^4.1.0"
  },
  "devDependencies": {
    "@types/js-yaml": "^4.0.9",
    "@types/node": "^20.0.0",
    "typescript": "^5.0.0"
  },
  "engines": {
    "node": ">=18.0.0"
  }
}

## 2. tsconfig.json

{
  "compilerOptions": {
    "target": "ES2020",
    "module": "commonjs",
    "lib": ["ES2020"],
    "declaration": true,
    "strict": true,
    "noImplicitAny": true,
    "strictNullChecks": true,
    "noImplicitThis": true,
    "alwaysStrict": true,
    "noUnusedLocals": false,
    "noUnusedParameters": false,
    "noImplicitReturns": true,
    "noFallthroughCasesInSwitch": false,
    "inlineSourceMap": true,
    "inlineSources": true,
    "experimentalDecorators": true,
    "strictPropertyInitialization": false,
    "outDir": "./dist",
    "rootDir": "./src"
  },
  "exclude": ["node_modules", "dist"]
}

## 3. src/types.ts

export type OutputMode = 'none' | 'balanced' | 'complete';

export type AudienceLevel = 'all' | 'beginner' | 'intermediate' | 'expert';

export type DocPurpose =
  | 'reference'
  | 'tutorial'
  | 'troubleshooting'
  | 'conceptual'
  | 'guide'
  | 'specification';

export type ProfileType =
  | 'scraped'
  | 'research'
  | 'technical'
  | 'code'
  | 'data'
  | 'changelog'
  | 'legal'
  | 'test'
  | 'schema'
  | 'troubleshoot'
  | 'meeting'
  | 'faq'
  | 'config';

export interface FrontmatterOptions {
  generator: string;
  version: string;
  title: string;
  sourcePath?: string | null;
  profile?: ProfileType;
  extractionConfidence?: number;
  contentQuality?: number;
}

export interface DeterministicFields {
  word_count?: number;
  page_count?: number;
  character_count?: number;
  [key: string]: any;
}

export interface LLMEnrichment {
  summary?: string;
  tags?: string[];
  category?: string;
  audience?: AudienceLevel;
  doc_purpose?: DocPurpose;
  complexity?: number;
  actionable?: boolean;
  key_technologies?: string[];
}
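
As a point of reference, an enrichment object matching the prompt's JSON contract (section 5) might look like the following; the values are illustrative only and would normally come from the LLM response:

```typescript
import { LLMEnrichment } from './types';

// Illustrative only - in practice this is parsed from the LLM's JSON reply.
const example: LLMEnrichment = {
  summary: 'Explains how to configure the FSS PDF parser and tune its extraction options.',
  tags: ['fss-parse-pdf', 'extraction_confidence', 'frontmatter', 'YAML'],
  category: 'technical',
  audience: 'intermediate',
  doc_purpose: 'guide',
  complexity: 3,
  actionable: true,
  key_technologies: ['TypeScript', 'js-yaml']
};
```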

## 4. src/constants.ts

import { AudienceLevel, DocPurpose, ProfileType } from './types';

export const AUDIENCE_VALUES: AudienceLevel[] = [
  'all', 'beginner', 'intermediate', 'expert'
];

export const DOC_PURPOSE_VALUES: DocPurpose[] = [
  'reference', 'tutorial', 'troubleshooting', 'conceptual', 'guide', 'specification'
];

export const PROFILE_VALUES: ProfileType[] = [
  'scraped', 'research', 'technical', 'code', 'data', 'changelog',
  'legal', 'test', 'schema', 'troubleshoot', 'meeting', 'faq', 'config'
];

export const DEFAULTS = {
  profile: 'data' as ProfileType,
  audience: 'all' as AudienceLevel,
  extractionConfidence: 1.0,
  contentQuality: 1.5,
  complexity: 3
};

// Key deterministic fields included in balanced mode (complete mode includes all deterministic fields)
export const BALANCED_FIELDS = [
  'word_count',
  'page_count',
  'has_tables',
  'has_images',
  'section_count',
  'slide_count',
  'duration_seconds',
  'width',
  'height',
  'has_attachments',
  'attachment_count',
  'from',
  'to',
  'date',
  'message_id',
  'author',
  'has_toc',
  'has_tracked_changes',
  'has_transcript',
  'speaker_count'
];

## 5. src/prompts.ts

export function getEnrichmentPrompt(content: string, docType: string = 'markdown'): string {
  return `You are a document analyst preparing metadata for a RAG search system.
Extract structured metadata to help users find and understand this document.

Respond with this exact JSON structure:

{
  "summary": "2-3 sentences: What is this document about? What problem does it solve?",
  "tags": ["5-10 SPECIFIC terms from this document for search - not generic words"],
  "category": "technical | research | code | data | changelog | troubleshooting | reference | tutorial",
  "audience": "all | beginner | intermediate | expert",
  "doc_purpose": "reference | tutorial | troubleshooting | conceptual | guide | specification",
  "complexity": 1-5,
  "actionable": true or false,
  "key_technologies": ["specific tools, languages, frameworks mentioned"]
}

Guidelines:
- tags: Extract SPECIFIC terms that appear in the document, not generic descriptions
- category: Pick the single best match
- audience: Default to "all" unless clearly targeted to specific skill level
- complexity: 1=overview, 2=beginner guide, 3=intermediate, 4=advanced, 5=deep implementation
- actionable: true if reader should DO something, false if just informational
- key_technologies: Only include specific named technologies, not generic terms

Document type: ${docType}

---
${content}
---

Respond with valid JSON only. No explanation or markdown formatting.`;
}

export function getSamplePromptForDocType(docType: string): string {
  const samples: Record<string, string> = {
    pdf: 'PDF document',
    word: 'Word document',
    email: 'Email message',
    image: 'Image with OCR text',
    audio: 'Audio transcript',
    video: 'Video transcript',
    presentation: 'Presentation slides',
    excel: 'Spreadsheet data',
    markdown: 'Markdown document'
  };
  return samples[docType] || 'document';
}
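
Calling the LLM itself is out of scope for this package. A rough sketch of how a parser might use the prompt and map the reply onto LLMEnrichment is shown below; callLlm is a placeholder for whatever client the parser already has, not an export of this package:

```typescript
import { getEnrichmentPrompt, LLMEnrichment } from '@bobai/frontmatter';

export async function enrich(
  content: string,
  docType: string,
  callLlm: (prompt: string) => Promise<string> // injected LLM client (placeholder)
): Promise<LLMEnrichment | undefined> {
  const prompt = getEnrichmentPrompt(content, docType);
  try {
    const raw = await callLlm(prompt);
    // The prompt requests bare JSON; strip accidental code fences before parsing.
    const json = raw.trim().replace(/^```(?:json)?\s*/, '').replace(/\s*```$/, '');
    return JSON.parse(json) as LLMEnrichment;
  } catch {
    // Invalid JSON: let the caller fall back to deterministic-only frontmatter.
    return undefined;
  }
}
```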

## 6. src/generator.ts

import * as yaml from 'js-yaml';
import {
  FrontmatterOptions,
  DeterministicFields,
  LLMEnrichment,
  OutputMode
} from './types';
import { DEFAULTS, BALANCED_FIELDS } from './constants';

export class FrontmatterGenerator {

  static generate(
    options: FrontmatterOptions,
    deterministic: DeterministicFields = {},
    enrichment?: LLMEnrichment,
    mode: OutputMode = 'balanced'
  ): string {

    if (mode === 'none') {
      return '';
    }

    const frontmatter: Record<string, any> = {
      // Core required fields
      profile: options.profile || DEFAULTS.profile,
      created: new Date().toISOString(),
      generator: options.generator,
      version: options.version,
      title: options.title || 'Untitled',
      extraction_confidence: options.extractionConfidence ?? DEFAULTS.extractionConfidence,
      content_quality: options.contentQuality ?? DEFAULTS.contentQuality,
    };

    // Source file
    if (options.sourcePath) {
      frontmatter.source_file = options.sourcePath;
    }

    // Add deterministic fields based on mode
    if (mode === 'complete') {
      // Include all deterministic fields
      const cleaned = this.cleanObject(deterministic);
      Object.assign(frontmatter, cleaned);
    } else {
      // Balanced mode - include only key fields
      for (const field of BALANCED_FIELDS) {
        if (deterministic[field] !== undefined && deterministic[field] !== null) {
          frontmatter[field] = deterministic[field];
        }
      }
    }

    // LLM enrichment fields (flat, not nested)
    if (enrichment) {
      if (enrichment.summary) frontmatter.summary = enrichment.summary;
      if (enrichment.tags?.length) frontmatter.tags = enrichment.tags;
      if (enrichment.category) frontmatter.category = enrichment.category;
      if (enrichment.audience) frontmatter.audience = enrichment.audience;
      if (enrichment.doc_purpose) frontmatter.doc_purpose = enrichment.doc_purpose;
      if (enrichment.complexity) frontmatter.complexity = enrichment.complexity;
      if (enrichment.actionable !== undefined) frontmatter.actionable = enrichment.actionable;
      if (enrichment.key_technologies?.length) {
        frontmatter.key_technologies = enrichment.key_technologies;
      }
    } else {
      // Placeholders for LLM enrichment
      frontmatter.summary = '';
      frontmatter.tags = [];
      frontmatter.category = '';
    }

    const yamlStr = yaml.dump(this.removeNulls(frontmatter), {
      indent: 2,
      lineWidth: -1,
      quotingType: "'",
      sortKeys: false
    });

    return `---\n${yamlStr}---`;
  }

  static generateMarkdown(
    options: FrontmatterOptions,
    deterministic: DeterministicFields,
    content: string,
    enrichment?: LLMEnrichment,
    mode: OutputMode = 'balanced'
  ): string {
    const fm = this.generate(options, deterministic, enrichment, mode);
    if (!fm) return content;
    return `${fm}\n\n${content}`;
  }

  private static cleanObject(obj: any): any {
    const result: any = {};
    for (const [k, v] of Object.entries(obj)) {
      if (k.startsWith('_')) continue;
      if (v === null || v === undefined) continue;
      if (v instanceof Date) {
        result[k] = v.toISOString();
      } else if (v && typeof v === 'object' && !Array.isArray(v)) {
        result[k] = this.cleanObject(v);
      } else {
        result[k] = v;
      }
    }
    return result;
  }

  private static removeNulls(obj: any): any {
    if (Array.isArray(obj)) {
      return obj.filter(x => x != null).map(x => this.removeNulls(x));
    }
    if (obj && typeof obj === 'object') {
      const result: any = {};
      for (const [k, v] of Object.entries(obj)) {
        if (v != null) result[k] = this.removeNulls(v);
      }
      return result;
    }
    return obj;
  }
}
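
For orientation, a balanced-mode call with no enrichment, e.g. `FrontmatterGenerator.generate({ generator: 'fss-parse-pdf', version: '1.2.0', title: 'My Document', sourcePath: '/path/to/file.pdf' }, { word_count: 1234, page_count: 8 })`, should produce output along these lines (the created timestamp varies and exact scalar quoting is up to js-yaml):

```yaml
---
profile: data
created: '2025-01-01T12:00:00.000Z'
generator: fss-parse-pdf
version: 1.2.0
title: My Document
extraction_confidence: 1
content_quality: 1.5
source_file: /path/to/file.pdf
word_count: 1234
page_count: 8
summary: ''
tags: []
category: ''
---
```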

## 7. src/index.ts

// Types
export {
  OutputMode,
  AudienceLevel,
  DocPurpose,
  ProfileType,
  FrontmatterOptions,
  DeterministicFields,
  LLMEnrichment
} from './types';

// Constants
export {
  AUDIENCE_VALUES,
  DOC_PURPOSE_VALUES,
  PROFILE_VALUES,
  DEFAULTS,
  BALANCED_FIELDS
} from './constants';

// Generator
export { FrontmatterGenerator } from './generator';

// Prompts
export { getEnrichmentPrompt, getSamplePromptForDocType } from './prompts';

## 8. README.md

# @bobai/frontmatter

BOBAI Markdown Standard v1.1 frontmatter generator for FSS parsers.

## Installation

```bash
npm install @bobai/frontmatter
```

## Usage

import {
  FrontmatterGenerator,
  getEnrichmentPrompt,
  OutputMode,
  LLMEnrichment
} from '@bobai/frontmatter';

// Generate frontmatter
const markdown = FrontmatterGenerator.generateMarkdown(
  {
    generator: 'fss-parse-pdf',
    version: '1.2.0',
    title: 'My Document',
    sourcePath: '/path/to/file.pdf'
  },
  {
    word_count: 1234,
    page_count: 8,
    has_tables: true
  },
  content,
  enrichment,  // LLMEnrichment or undefined
  'balanced'   // OutputMode: 'none' | 'balanced' | 'complete'
);

// Get LLM enrichment prompt
const prompt = getEnrichmentPrompt(content, 'pdf');
// Send to your LLM and parse response as LLMEnrichment

## Output Modes

- none - No frontmatter, just content
- balanced (default) - Core fields + key deterministic fields + LLM enrichment
- complete - All fields, including full metadata

## Reference

See BOBAI Markdown Standard v1.1 for field definitions.


---

## Testing

After implementation:

```bash
cd /MASTERFOLDER/Tools/parsers/packages/bobai-frontmatter
npm install
npm run build
```

Test by importing the package in a parser or by writing a simple test script such as the sketch below.
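
A throwaway example (file name and values are illustrative; run it with ts-node or compile it alongside the package):

```typescript
// scratch-test.ts - quick manual check of the generated frontmatter
import { FrontmatterGenerator, LLMEnrichment } from '@bobai/frontmatter';

const enrichment: LLMEnrichment = {
  summary: 'Tiny document used to eyeball the frontmatter output.',
  tags: ['frontmatter', 'smoke-test'],
  category: 'technical',
  complexity: 1,
  actionable: false
};

const output = FrontmatterGenerator.generateMarkdown(
  { generator: 'scratch-test', version: '0.0.1', title: 'Smoke Test' },
  { word_count: 42, has_tables: false },
  '# Smoke Test\n\nHello world.',
  enrichment,
  'balanced'
);

console.log(output);
```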


## Next Steps After Package Complete

  1. Update each parser to import from this package
  2. Remove duplicate frontmatter code from parsers
  3. Consider publishing to npm registry (later)