parser-frontmatter-ts/IMPLEMENTATION_BLUEPRINT.md

# @bobai/frontmatter - Implementation Blueprint

**Package Location**: `/MASTERFOLDER/Tools/parsers/packages/bobai-frontmatter`
**Standard Reference**: `/MASTERFOLDER/KnowledgeBase/Standards/BOBAI_MARKDOWN_STANDARD_V1.1.md`

---

## Purpose

Create a shared npm package that all FSS parsers import for consistent BOBAI v1.1 frontmatter generation.

---

## Files to Create

```
bobai-frontmatter/
├── src/
│   ├── index.ts              # Main exports
│   ├── generator.ts          # FrontmatterGenerator class
│   ├── types.ts              # TypeScript interfaces
│   ├── constants.ts          # Enums, defaults
│   └── prompts.ts            # LLM enrichment prompt templates
├── package.json
├── tsconfig.json
└── README.md
```

Note: a `src/llm/` directory is planned as a future enhancement (Issue #3) and is intentionally omitted from the tree above.

---

## 1. package.json

```json
{
  "name": "@bobai/frontmatter",
  "version": "1.1.0",
  "description": "BOBAI Markdown Standard v1.1 frontmatter generator",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
  "files": ["dist"],
  "scripts": {
    "build": "tsc",
    "clean": "rm -rf dist",
    "prepublishOnly": "npm run clean && npm run build"
  },
  "keywords": ["bobai", "frontmatter", "markdown", "yaml", "metadata"],
  "author": "BobAI",
  "license": "MIT",
  "dependencies": {
    "js-yaml": "^4.1.0"
  },
  "devDependencies": {
    "@types/js-yaml": "^4.0.9",
    "@types/node": "^20.0.0",
    "typescript": "^5.0.0"
  },
  "engines": {
    "node": ">=18.0.0"
  }
}
```

---

## 2. tsconfig.json

```json
{
  "compilerOptions": {
    "target": "ES2020",
    "module": "commonjs",
    "lib": ["ES2020"],
    "declaration": true,
    "strict": true,
    "noImplicitAny": true,
    "strictNullChecks": true,
    "noImplicitThis": true,
    "alwaysStrict": true,
    "noUnusedLocals": false,
    "noUnusedParameters": false,
    "noImplicitReturns": true,
    "noFallthroughCasesInSwitch": false,
    "inlineSourceMap": true,
    "inlineSources": true,
    "experimentalDecorators": true,
    "strictPropertyInitialization": false,
    "outDir": "./dist",
    "rootDir": "./src"
  },
  "exclude": ["node_modules", "dist"]
}
```

---

## 3. src/types.ts

```typescript
/**
 * How much frontmatter `FrontmatterGenerator.generate` emits.
 * - `none`: no frontmatter at all (the generator returns an empty string).
 * - `balanced`: core fields plus only the deterministic fields listed in `BALANCED_FIELDS`.
 * - `complete`: core fields plus every deterministic field supplied by the caller.
 */
export type OutputMode = 'none' | 'balanced' | 'complete';

/** Target reader skill level; the enrichment prompt tells the LLM to default to `all`. */
export type AudienceLevel = 'all' | 'beginner' | 'intermediate' | 'expert';

/** Kind of document, as offered to the LLM in the enrichment prompt's `doc_purpose` field. */
export type DocPurpose =
  | 'reference'
  | 'tutorial'
  | 'troubleshooting'
  | 'conceptual'
  | 'guide'
  | 'specification';

/** Allowed values for the `profile` frontmatter field. */
export type ProfileType =
  | 'scraped'
  | 'research'
  | 'technical'
  | 'code'
  | 'data'
  | 'changelog'
  | 'legal'
  | 'test'
  | 'schema'
  | 'troubleshoot'
  | 'meeting'
  | 'faq'
  | 'config';

/** Caller-supplied values for the core frontmatter fields. */
export interface FrontmatterOptions {
  /** Written to the `generator` field (the README example uses the parser name, e.g. `fss-parse-pdf`). */
  generator: string;
  /** Written to the `version` field. */
  version: string;
  /** Document title; the generator substitutes `'Untitled'` when this is empty. */
  title: string;
  /** Emitted as `source_file`; the field is omitted when this is null, undefined or empty. */
  sourcePath?: string | null;
  /** Emitted as `profile`; falls back to `DEFAULTS.profile` when omitted. */
  profile?: ProfileType;
  /** Emitted as `extraction_confidence`; falls back to `DEFAULTS.extractionConfidence` when null/undefined. */
  extractionConfidence?: number;
  /** Emitted as `content_quality`; falls back to `DEFAULTS.contentQuality` when null/undefined. */
  contentQuality?: number;
}

/**
 * Metadata computed by the parser itself (no LLM involved).
 * Keys are written to the frontmatter under the same name.
 */
export interface DeterministicFields {
  word_count?: number;
  page_count?: number;
  character_count?: number;
  // Open-ended: parsers may attach any extra keys. Only keys listed in
  // BALANCED_FIELDS survive in 'balanced' mode; all keys are emitted in 'complete' mode.
  [key: string]: any;
}

/**
 * Metadata produced by an LLM from the prompt built by `getEnrichmentPrompt`.
 * Every field is optional; the generator copies only the fields that are present and non-empty.
 */
export interface LLMEnrichment {
  summary?: string;
  tags?: string[];
  category?: string;
  audience?: AudienceLevel;
  doc_purpose?: DocPurpose;
  /** The enrichment prompt asks for a value from 1 (overview) to 5 (deep implementation). */
  complexity?: number;
  /** The enrichment prompt defines this as true when the reader is expected to do something. */
  actionable?: boolean;
  key_technologies?: string[];
}
```

---

## 4. src/constants.ts

```typescript
import { AudienceLevel, DocPurpose, ProfileType } from './types';

/** Every `AudienceLevel` value, as a runtime array (e.g. for validating LLM output). */
export const AUDIENCE_VALUES: AudienceLevel[] = [
  'all', 'beginner', 'intermediate', 'expert'
];

/** Every `DocPurpose` value, as a runtime array. */
export const DOC_PURPOSE_VALUES: DocPurpose[] = [
  'reference', 'tutorial', 'troubleshooting', 'conceptual', 'guide', 'specification'
];

/** Every `ProfileType` value, as a runtime array. */
export const PROFILE_VALUES: ProfileType[] = [
  'scraped', 'research', 'technical', 'code', 'data', 'changelog',
  'legal', 'test', 'schema', 'troubleshoot', 'meeting', 'faq', 'config'
];

/**
 * Fallback values for fields the caller leaves out.
 * The generator in generator.ts reads `profile`, `extractionConfidence` and
 * `contentQuality`; `audience` and `complexity` are not referenced there.
 */
export const DEFAULTS = {
  profile: 'data' as ProfileType,
  audience: 'all' as AudienceLevel,
  extractionConfidence: 1.0,
  // NOTE(review): 1.5 is higher than the 1.0 used for extractionConfidence —
  // confirm the content_quality scale against the BOBAI v1.1 standard.
  contentQuality: 1.5,
  complexity: 3
};

// Deterministic field names that are copied into the frontmatter in 'balanced'
// mode. 'complete' mode ignores this list and emits every deterministic field.
export const BALANCED_FIELDS = [
  'word_count',
  'page_count',
  'has_tables',
  'has_images',
  'section_count',
  'slide_count',
  'duration_seconds',
  'width',
  'height',
  'has_attachments',
  'attachment_count',
  'from',
  'to',
  'date',
  'message_id',
  'author',
  'has_toc',
  'has_tracked_changes',
  'has_transcript',
  'speaker_count'
];
```

---

## 5. src/prompts.ts

```typescript
/**
 * Builds the LLM prompt that asks for enrichment metadata as a JSON object.
 *
 * The prompt consists of fixed instructions (the JSON shape and per-field
 * guidelines), followed by the document type and the document text placed
 * between two `---` delimiter lines.
 *
 * @param content - Document text to embed verbatim in the prompt.
 * @param docType - Label inserted after "Document type:"; defaults to `'markdown'`.
 * @returns The complete prompt text.
 */
export function getEnrichmentPrompt(content: string, docType: string = 'markdown'): string {
  // Fixed part of the prompt: role, required JSON shape, and field guidelines.
  const instructionLines = [
    'You are a document analyst preparing metadata for a RAG search system.',
    'Extract structured metadata to help users find and understand this document.',
    '',
    'Respond with this exact JSON structure:',
    '',
    '{',
    '  "summary": "2-3 sentences: What is this document about? What problem does it solve?",',
    '  "tags": ["5-10 SPECIFIC terms from this document for search - not generic words"],',
    '  "category": "technical | research | code | data | changelog | troubleshooting | reference | tutorial",',
    '  "audience": "all | beginner | intermediate | expert",',
    '  "doc_purpose": "reference | tutorial | troubleshooting | conceptual | guide | specification",',
    '  "complexity": 1-5,',
    '  "actionable": true or false,',
    '  "key_technologies": ["specific tools, languages, frameworks mentioned"]',
    '}',
    '',
    'Guidelines:',
    '- tags: Extract SPECIFIC terms that appear in the document, not generic descriptions',
    '- category: Pick the single best match',
    '- audience: Default to "all" unless clearly targeted to specific skill level',
    '- complexity: 1=overview, 2=beginner guide, 3=intermediate, 4=advanced, 5=deep implementation',
    '- actionable: true if reader should DO something, false if just informational',
    '- key_technologies: Only include specific named technologies, not generic terms',
  ];

  // Per-call part: the document type and the delimited document body.
  const documentLines = [
    '',
    `Document type: ${docType}`,
    '',
    '---',
    content,
    '---',
    '',
    'Respond with valid JSON only. No explanation or markdown formatting.',
  ];

  return instructionLines.concat(documentLines).join('\n');
}

/**
 * Maps a parser document-type key to a short human-readable description.
 *
 * @param docType - Document-type key such as `'pdf'`, `'email'` or `'markdown'`.
 * @returns The matching description, or `'document'` for any unrecognised key.
 */
export function getSamplePromptForDocType(docType: string): string {
  // A Map is used instead of a plain object so that keys inherited from
  // Object.prototype ('constructor', 'toString', '__proto__', ...) are not
  // mistaken for known document types and returned as non-string values.
  const samples = new Map<string, string>([
    ['pdf', 'PDF document'],
    ['word', 'Word document'],
    ['email', 'Email message'],
    ['image', 'Image with OCR text'],
    ['audio', 'Audio transcript'],
    ['video', 'Video transcript'],
    ['presentation', 'Presentation slides'],
    ['excel', 'Spreadsheet data'],
    ['markdown', 'Markdown document']
  ]);
  return samples.get(docType) ?? 'document';
}
```

---

## 6. src/generator.ts

```typescript
import * as yaml from 'js-yaml';
import {
  FrontmatterOptions,
  DeterministicFields,
  LLMEnrichment,
  OutputMode
} from './types';
import { DEFAULTS, BALANCED_FIELDS } from './constants';

/**
 * Builds YAML frontmatter from caller-supplied core options, parser-computed
 * (deterministic) fields and optional LLM enrichment.
 *
 * All members are static; the class is used as a namespace.
 */
export class FrontmatterGenerator {

  /**
   * Generates a frontmatter block delimited by `---` lines.
   *
   * @param options - Core field values (generator, version, title, ...).
   * @param deterministic - Parser-computed fields. In `'balanced'` mode only
   *   keys listed in `BALANCED_FIELDS` are emitted; in `'complete'` mode every
   *   key is emitted except those starting with `_` or holding null/undefined.
   *   In `'complete'` mode a key that matches a core field name (e.g. `title`)
   *   overrides the core value.
   * @param enrichment - LLM-produced fields. When omitted, empty `summary`,
   *   `tags` and `category` placeholders are written instead.
   * @param mode - Output mode; defaults to `'balanced'`.
   * @returns The frontmatter text (no trailing newline after the closing
   *   `---`), or an empty string when `mode` is `'none'`.
   */
  static generate(
    options: FrontmatterOptions,
    deterministic: DeterministicFields = {},
    enrichment?: LLMEnrichment,
    mode: OutputMode = 'balanced'
  ): string {

    if (mode === 'none') {
      return '';
    }

    const frontmatter: Record<string, any> = {
      // Core required fields
      profile: options.profile || DEFAULTS.profile,
      // Timestamp of frontmatter generation, not of the source document.
      created: new Date().toISOString(),
      generator: options.generator,
      version: options.version,
      title: options.title || 'Untitled',
      // `??` (not `||`) so that an explicit 0 is kept rather than replaced by the default.
      extraction_confidence: options.extractionConfidence ?? DEFAULTS.extractionConfidence,
      content_quality: options.contentQuality ?? DEFAULTS.contentQuality,
    };

    // Source file
    if (options.sourcePath) {
      frontmatter.source_file = options.sourcePath;
    }

    // Add deterministic fields based on mode
    if (mode === 'complete') {
      // Include all deterministic fields
      const cleaned = this.cleanObject(deterministic);
      Object.assign(frontmatter, cleaned);
    } else {
      // Balanced mode - include only key fields
      for (const field of BALANCED_FIELDS) {
        if (deterministic[field] !== undefined && deterministic[field] !== null) {
          frontmatter[field] = deterministic[field];
        }
      }
    }

    // LLM enrichment fields (flat, not nested)
    if (enrichment) {
      if (enrichment.summary) frontmatter.summary = enrichment.summary;
      if (enrichment.tags?.length) frontmatter.tags = enrichment.tags;
      if (enrichment.category) frontmatter.category = enrichment.category;
      if (enrichment.audience) frontmatter.audience = enrichment.audience;
      if (enrichment.doc_purpose) frontmatter.doc_purpose = enrichment.doc_purpose;
      if (enrichment.complexity) frontmatter.complexity = enrichment.complexity;
      // Checked against undefined rather than truthiness so that `false` is emitted.
      if (enrichment.actionable !== undefined) frontmatter.actionable = enrichment.actionable;
      if (enrichment.key_technologies?.length) {
        frontmatter.key_technologies = enrichment.key_technologies;
      }
    } else {
      // Placeholders for LLM enrichment
      frontmatter.summary = '';
      frontmatter.tags = [];
      frontmatter.category = '';
    }

    const yamlStr = yaml.dump(this.removeNulls(frontmatter), {
      indent: 2,
      lineWidth: -1, // disable line folding so long values stay on one line
      quotingType: "'",
      sortKeys: false // keep insertion order: core fields first
    });

    // js-yaml's dump output ends with a newline, so the closing fence lands on its own line.
    return `---\n${yamlStr}---`;
  }

  /**
   * Generates frontmatter and prepends it to `content`, separated by a blank line.
   *
   * @param options - Core field values; see {@link FrontmatterGenerator.generate}.
   * @param deterministic - Parser-computed fields.
   * @param content - Markdown body.
   * @param enrichment - Optional LLM-produced fields.
   * @param mode - Output mode; defaults to `'balanced'`.
   * @returns `content` unchanged when no frontmatter is produced (mode `'none'`),
   *   otherwise the frontmatter, a blank line, then `content`.
   */
  static generateMarkdown(
    options: FrontmatterOptions,
    deterministic: DeterministicFields,
    content: string,
    enrichment?: LLMEnrichment,
    mode: OutputMode = 'balanced'
  ): string {
    const fm = this.generate(options, deterministic, enrichment, mode);
    if (!fm) return content;
    return `${fm}\n\n${content}`;
  }

  /**
   * Returns a copy of `obj` prepared for serialization: keys starting with `_`
   * and null/undefined values are dropped, `Date` values become ISO-8601
   * strings, and nested plain objects are cleaned recursively. Arrays are
   * copied by reference without descending into them.
   */
  private static cleanObject(obj: any): any {
    const result: any = {};
    for (const [k, v] of Object.entries(obj)) {
      if (k.startsWith('_')) continue;
      if (v === null || v === undefined) continue;
      if (v instanceof Date) {
        result[k] = v.toISOString();
      } else if (v && typeof v === 'object' && !Array.isArray(v)) {
        result[k] = this.cleanObject(v);
      } else {
        result[k] = v;
      }
    }
    return result;
  }

  /**
   * Recursively strips null/undefined entries from objects and arrays.
   * `Date` values are converted to ISO-8601 strings, matching `cleanObject`.
   * Primitives are returned unchanged.
   */
  private static removeNulls(obj: any): any {
    // A Date has no own enumerable properties, so the generic object branch
    // below would rebuild it as {} and the value would be lost. This matters
    // in balanced mode (which bypasses cleanObject) and for dates inside arrays.
    if (obj instanceof Date) {
      return obj.toISOString();
    }
    if (Array.isArray(obj)) {
      return obj.filter(x => x != null).map(x => this.removeNulls(x));
    }
    if (obj && typeof obj === 'object') {
      const result: any = {};
      for (const [k, v] of Object.entries(obj)) {
        if (v != null) result[k] = this.removeNulls(v);
      }
      return result;
    }
    return obj;
  }
}
```

---

## 7. src/index.ts

```typescript
// Types
export {
  OutputMode,
  AudienceLevel,
  DocPurpose,
  ProfileType,
  FrontmatterOptions,
  DeterministicFields,
  LLMEnrichment
} from './types';

// Constants
export {
  AUDIENCE_VALUES,
  DOC_PURPOSE_VALUES,
  PROFILE_VALUES,
  DEFAULTS,
  BALANCED_FIELDS
} from './constants';

// Generator
export { FrontmatterGenerator } from './generator';

// Prompts
export { getEnrichmentPrompt, getSamplePromptForDocType } from './prompts';
```

---

## 8. README.md

````markdown
# @bobai/frontmatter

BOBAI Markdown Standard v1.1 frontmatter generator for FSS parsers.

## Installation

```bash
npm install @bobai/frontmatter
```

## Usage

```typescript
import {
  FrontmatterGenerator,
  getEnrichmentPrompt,
  OutputMode,
  LLMEnrichment
} from '@bobai/frontmatter';

// Generate frontmatter
const markdown = FrontmatterGenerator.generateMarkdown(
  {
    generator: 'fss-parse-pdf',
    version: '1.2.0',
    title: 'My Document',
    sourcePath: '/path/to/file.pdf'
  },
  {
    word_count: 1234,
    page_count: 8,
    has_tables: true
  },
  content,
  enrichment,  // LLMEnrichment or undefined
  'balanced'   // OutputMode: 'none' | 'balanced' | 'complete'
);

// Get LLM enrichment prompt
const prompt = getEnrichmentPrompt(content, 'pdf');
// Send to your LLM and parse response as LLMEnrichment
```

## Output Modes

- `none` - No frontmatter, just content
- `balanced` (default) - Core fields + key deterministic + LLM enrichment
- `complete` - All fields including full metadata

## Reference

See BOBAI Markdown Standard v1.1 for field definitions.
````

---

## Testing

After implementation:

```bash
cd /MASTERFOLDER/Tools/parsers/packages/bobai-frontmatter
npm install
npm run build
```

Test by importing in a parser or creating a simple test script.

---

## Next Steps After Package Complete

1. Update each parser to import from this package
2. Remove duplicate frontmatter code from parsers
3. Consider publishing to npm registry (later)