# Parseflow · Capability
# Process Document
#
# Capability Spec

# Resource envelope: the API group/version and the kind of resource declared.
apiVersion: 'naftiko.io/v1'
kind: 'Capability'
# Identity of this capability plus the labels attached to it.
metadata:
  name: 'process-document'
  # Pre-release version string; quoted so it is always read as a string.
  version: '1.0.0-alpha2'
  # Classification labels (provider / family / domain).
  labels:
    provider: 'parseflow'
    family: 'process'
    domain: 'document-ai'
# Body of the capability: human-readable summary first, then the reference to
# the shared API definition file.
spec:
  title: 'Process Document'
  # Literal block scalar (`|`): the line breaks below are part of the value.
  description: |
    Synchronously parse a PDF, DOCX, TXT, or raw text payload into structured
    chunks, extraction fields, and optional Markdown using Parseflow's
    deterministic engine (or BYOK-assisted mode when supplied).
  # Relative reference to a separate API definition file.
  sharedAPI:
    $ref: './shared/parseflow-api.yaml'
  # Caller-facing parameters of the capability. `file` and `text` are
  # alternatives: each description requires it only when the other is absent.
  inputs:
    - name: file
      type: file
      description: Document binary (PDF/DOCX/TXT). Required if `text` is not supplied.
    - name: text
      type: string
      description: Raw text input. Required if `file` is not supplied.
    # Processing mode; falls back to `deterministic` when omitted.
    - name: mode
      type: string
      enum: [deterministic, byok_assisted, basic]
      default: deterministic
    # Chunking controls.
    # NOTE(review): the unit (characters vs. tokens) is not stated in this
    # file; confirm against the Parseflow API before documenting it.
    - name: chunk_size
      type: integer
      default: 2000
    - name: overlap
      type: integer
      default: 200
    # Requested result format; falls back to `json` when omitted.
    - name: output_format
      type: string
      enum: [json, markdown, zip]
      default: json
    - name: include_markdown
      type: boolean
      default: false
    - name: preset
      type: string
      description: Named extraction preset (e.g. invoice, receipt, contract).
    - name: schema_json
      type: string
      description: Caller-supplied JSON Schema for structured extraction.
    # NOTE(review): presumably toggles validation against `schema_json`;
    # confirm the exact semantics with the API reference.
    - name: enforce_schema
      type: boolean
      default: false
    # Optional caller-supplied idempotency key. The `Idempotency-Key` header
    # template under consume.http reads this name and only falls back to a
    # generated UUID when it is unset; declaring it here lets a caller reuse
    # one key across retries instead of getting a fresh UUID on every call.
    # NOTE(review): assumes declared inputs are visible to header templates.
    - name: idempotency_key
      type: string
      description: >-
        Optional value for the Idempotency-Key request header. Reuse the same
        value when retrying a request; a random UUID is generated when omitted.
  # Upstream HTTP request issued for this capability: a multipart/form-data
  # POST to /v2/process.
  consume:
    http:
      method: POST
      path: '/v2/process'
      contentType: 'multipart/form-data'
      headers:
        # Uses the `idempotency_key` template variable when it is set;
        # otherwise the `default(...)` filter substitutes uuid4().
        # NOTE(review): presumably Jinja-style templating — confirm.
        Idempotency-Key: '{{ idempotency_key | default(uuid4()) }}'
  # Surfaces through which this capability is published.
  expose:
    # REST surface: POST on the capability path.
    rest:
      method: POST
      path: '/capabilities/process-document'
    # MCP surface: tool name plus its summary text.
    mcp:
      tool: 'parseflow_process_document'
      description: >-
        Parse a document or text into chunks, fields, and Markdown.
  # Response contract: a JSON body whose shape is given by the referenced
  # JSON Schema file.
  # NOTE(review): the `output_format` input also allows `markdown` and `zip`,
  # but only application/json is declared here — confirm this is intended.
  outputs:
    schemaRef: '../json-schema/parseflow-process-response-schema.json'
    contentType: 'application/json'