Reducto · Schema
Settings
Reducto Settings schema
Document Parsing, PDF, OCR, Data Extraction, AI, Machine Learning, Document Intelligence, Structured Data
Properties
| Name | Type | Description |
|---|---|---|
| ocr_system | string | Standard is our best multilingual OCR system. Legacy only supports germanic languages and is available for backwards compatibility. |
| extraction_mode | string | The mode to use for text extraction from PDFs. OCR mode uses optical character recognition only. Hybrid mode combines OCR with embedded PDF text for best accuracy (default). |
| force_url_result | boolean | Force the result to be returned in URL form. |
| force_file_extension | string | Force the URL to be downloaded as a specific file extension (e.g. `.png`). |
| return_ocr_data | boolean | If True, return OCR data in the result. Defaults to False. |
| return_images | array | Whether to return images for the specified block types. 'page' returns full page images. By default, no images are returned. |
| embed_pdf_metadata | boolean | If True, embed OCR metadata into the returned PDF. Defaults to False. |
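| embed_pdf_metadata_dpi | integer | Render DPI used when rasterizing the source PDF before embedding the OCR text layer (only applies when ``embed_pdf_metadata`` is True). Lower values produce dramatically smaller output PDFs; higher values preserve more detail when zoomed past 200%. Defaults to 100 (good for on-screen viewing); raise toward the source scan DPI for crisper output. Min 50, max 250. |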
| persist_results | boolean | If True, persist the results indefinitely. Defaults to False. |
| tenant_throttling | object | Per-tenant throttling for multi-tenant applications. Tag each request with your tenant's id to bound how much of your account's concurrency a single tenant can consume. Account-level throttles still apply. |
| timeout | number | The timeout for the job in seconds. |
| page_range | object | The page range to process (1-indexed). By default, the entire document is processed. For spreadsheets, you can also provide a list of sheet names. |
| document_password | string | Password to decrypt password-protected documents. |
| hybrid_vpc | object | Hybrid VPC request-scoped settings. |
JSON Schema
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://raw.githubusercontent.com/api-evangelist/reducto/refs/heads/main/json-schema/reducto-settings.json",
  "$comment": "Nullable fields are expressed with a \"null\" type because the OpenAPI 3.0 `nullable` keyword is not part of JSON Schema 2020-12. The $ref targets under #/components/schemas (TenantThrottling, PageRange, HybridVpcSettings) are not defined inside this schema object and must be supplied by the enclosing API description before standalone validation.",
  "title": "Settings",
  "description": "Reducto Settings schema",
  "type": "object",
  "properties": {
    "ocr_system": {
      "type": "string",
      "enum": [
        "standard",
        "legacy"
      ],
      "title": "Ocr System",
      "description": "Standard is our best multilingual OCR system. Legacy only supports germanic languages and is available for backwards compatibility.",
      "default": "standard"
    },
    "extraction_mode": {
      "type": "string",
      "enum": [
        "ocr",
        "hybrid"
      ],
      "title": "Extraction Mode",
      "description": "The mode to use for text extraction from PDFs. OCR mode uses optical character recognition only. Hybrid mode combines OCR with embedded PDF text for best accuracy (default).",
      "default": "hybrid"
    },
    "force_url_result": {
      "type": "boolean",
      "title": "Force Url Result",
      "description": "Force the result to be returned in URL form.",
      "default": false
    },
    "force_file_extension": {
      "type": [
        "string",
        "null"
      ],
      "title": "Force File Extension",
      "description": "Force the URL to be downloaded as a specific file extension (e.g. `.png`)."
    },
    "return_ocr_data": {
      "type": "boolean",
      "title": "Return Ocr Data",
      "description": "If True, return OCR data in the result. Defaults to False.",
      "default": false
    },
    "return_images": {
      "type": "array",
      "items": {
        "type": "string",
        "enum": [
          "figure",
          "table",
          "page"
        ]
      },
      "title": "Return Images",
      "description": "Whether to return images for the specified block types. 'page' returns full page images. By default, no images are returned.",
      "default": []
    },
    "embed_pdf_metadata": {
      "type": "boolean",
      "title": "Embed Pdf Metadata",
      "description": "If True, embed OCR metadata into the returned PDF. Defaults to False.",
      "default": false
    },
    "embed_pdf_metadata_dpi": {
      "type": "integer",
      "minimum": 50,
      "maximum": 250,
      "title": "Embed Pdf Metadata Dpi",
      "description": "Render DPI used when rasterizing the source PDF before embedding the OCR text layer (only applies when ``embed_pdf_metadata`` is True). Lower values produce dramatically smaller output PDFs; higher values preserve more detail when zoomed past 200%. Defaults to 100 (good for on-screen viewing); raise toward the source scan DPI for crisper output. Min 50, max 250.",
      "default": 100
    },
    "persist_results": {
      "type": "boolean",
      "title": "Persist Results",
      "description": "If True, persist the results indefinitely. Defaults to False.",
      "default": false
    },
    "tenant_throttling": {
      "anyOf": [
        {
          "$ref": "#/components/schemas/TenantThrottling"
        },
        {
          "type": "null"
        }
      ],
      "description": "Per-tenant throttling for multi-tenant applications. Tag each request with your tenant's id to bound how much of your account's concurrency a single tenant can consume. Account-level throttles still apply."
    },
    "timeout": {
      "type": [
        "number",
        "null"
      ],
      "title": "Timeout",
      "description": "The timeout for the job in seconds."
    },
    "page_range": {
      "anyOf": [
        {
          "$ref": "#/components/schemas/PageRange"
        },
        {
          "type": "array",
          "items": {
            "$ref": "#/components/schemas/PageRange"
          }
        },
        {
          "type": "array",
          "items": {
            "type": "integer"
          }
        },
        {
          "type": "array",
          "items": {
            "type": "string"
          }
        },
        {
          "type": "null"
        }
      ],
      "title": "Page Range",
      "description": "The page range to process (1-indexed). By default, the entire document is processed. For spreadsheets, you can also provide a list of sheet names."
    },
    "document_password": {
      "type": [
        "string",
        "null"
      ],
      "title": "Document Password",
      "description": "Password to decrypt password-protected documents."
    },
    "hybrid_vpc": {
      "$ref": "#/components/schemas/HybridVpcSettings",
      "description": "Hybrid VPC request-scoped settings.",
      "default": {}
    }
  }
}