Reducto · Schema

Settings

Reducto Settings schema

Document Parsing · PDF · OCR · Data Extraction · AI · Machine Learning · Document Intelligence · Structured Data

Properties

Name Type Description
ocr_system string Standard is our best multilingual OCR system. Legacy only supports Germanic languages and is available for backwards compatibility.
extraction_mode string The mode to use for text extraction from PDFs. OCR mode uses optical character recognition only. Hybrid mode combines OCR with embedded PDF text for best accuracy (default).
force_url_result boolean Force the result to be returned in URL form.
force_file_extension string Force the URL to be downloaded as a specific file extension (e.g. `.png`).
return_ocr_data boolean If True, return OCR data in the result. Defaults to False.
return_images array Whether to return images for the specified block types. 'page' returns full page images. By default, no images are returned.
embed_pdf_metadata boolean If True, embed OCR metadata into the returned PDF. Defaults to False.
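embed_pdf_metadata_dpi integer Render DPI used when rasterizing the source PDF before embedding the OCR text layer (only applies when `embed_pdf_metadata` is True). Lower values produce dramatically smaller output PDFs; higher values preserve more detail when zoomed past 200%. Defaults to 100 (good for on-screen viewing); raise toward the source scan DPI for crisper output. Min 50, max 250.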
persist_results boolean If True, persist the results indefinitely. Defaults to False.
tenant_throttling object Per-tenant throttling for multi-tenant applications. Tag each request with your tenant's id to bound how much of your account's concurrency a single tenant can consume. Account-level throttles still apply.
timeout number The timeout for the job in seconds.
page_range object The page range to process (1-indexed). By default, the entire document is processed. For spreadsheets, you can also provide a list of sheet names.
document_password string Password to decrypt password-protected documents.
hybrid_vpc object Hybrid VPC request-scoped settings.
View JSON Schema on GitHub

JSON Schema

reducto-settings.json Raw ↑
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://raw.githubusercontent.com/api-evangelist/reducto/refs/heads/main/json-schema/reducto-settings.json",
  "title": "Settings",
  "description": "Reducto Settings schema",
  "properties": {
    "ocr_system": {
      "type": "string",
      "enum": [
        "standard",
        "legacy"
      ],
      "title": "Ocr System",
      "description": "Standard is our best multilingual OCR system. Legacy only supports germanic languages and is available for backwards compatibility.",
      "default": "standard"
    },
    "extraction_mode": {
      "type": "string",
      "enum": [
        "ocr",
        "hybrid"
      ],
      "title": "Extraction Mode",
      "description": "The mode to use for text extraction from PDFs. OCR mode uses optical character recognition only. Hybrid mode combines OCR with embedded PDF text for best accuracy (default).",
      "default": "hybrid"
    },
    "force_url_result": {
      "type": "boolean",
      "title": "Force Url Result",
      "description": "Force the result to be returned in URL form.",
      "default": false
    },
    "force_file_extension": {
      "type": [
        "string",
        "null"
      ],
      "title": "Force File Extension",
      "description": "Force the URL to be downloaded as a specific file extension (e.g. `.png`)."
    },
    "return_ocr_data": {
      "type": "boolean",
      "title": "Return Ocr Data",
      "description": "If True, return OCR data in the result. Defaults to False.",
      "default": false
    },
    "return_images": {
      "type": "array",
      "items": {
        "type": "string",
        "enum": [
          "figure",
          "table",
          "page"
        ]
      },
      "title": "Return Images",
      "description": "Whether to return images for the specified block types. 'page' returns full page images. By default, no images are returned.",
      "default": []
    },
    "embed_pdf_metadata": {
      "type": "boolean",
      "title": "Embed Pdf Metadata",
      "description": "If True, embed OCR metadata into the returned PDF. Defaults to False.",
      "default": false
    },
    "embed_pdf_metadata_dpi": {
      "type": "integer",
      "maximum": 250,
      "minimum": 50,
      "title": "Embed Pdf Metadata Dpi",
      "description": "Render DPI used when rasterizing the source PDF before embedding the OCR text layer (only applies when ``embed_pdf_metadata`` is True). Lower values produce dramatically smaller output PDFs; higher values preserve more detail when zoomed past 200%. Defaults to 100 (good for on-screen viewing); raise toward the source scan DPI for crisper output. Min 50, max 250.",
      "default": 100
    },
    "persist_results": {
      "type": "boolean",
      "title": "Persist Results",
      "description": "If True, persist the results indefinitely. Defaults to False.",
      "default": false
    },
    "tenant_throttling": {
      "anyOf": [
        {
          "$ref": "#/components/schemas/TenantThrottling"
        },
        {
          "type": "null"
        }
      ],
      "description": "Per-tenant throttling for multi-tenant applications. Tag each request with your tenant's id to bound how much of your account's concurrency a single tenant can consume. Account-level throttles still apply."
    },
    "timeout": {
      "type": [
        "number",
        "null"
      ],
      "title": "Timeout",
      "description": "The timeout for the job in seconds."
    },
    "page_range": {
      "anyOf": [
        {
          "$ref": "#/components/schemas/PageRange"
        },
        {
          "type": "array",
          "items": {
            "$ref": "#/components/schemas/PageRange"
          }
        },
        {
          "type": "array",
          "items": {
            "type": "integer"
          }
        },
        {
          "type": "array",
          "items": {
            "type": "string"
          }
        },
        {
          "type": "null"
        }
      ],
      "title": "Page Range",
      "description": "The page range to process (1-indexed). By default, the entire document is processed. For spreadsheets, you can also provide a list of sheet names."
    },
    "document_password": {
      "type": [
        "string",
        "null"
      ],
      "title": "Document Password",
      "description": "Password to decrypt password-protected documents."
    },
    "hybrid_vpc": {
      "$ref": "#/components/schemas/HybridVpcSettings",
      "description": "Hybrid VPC request-scoped settings.",
      "default": {}
    }
  },
  "type": "object"
}