Amazon Textract · Schema

DocumentAnalysis

Schema for an Amazon Textract document analysis response containing detected blocks of text, tables, forms, and layout elements extracted from a document.

Document Processing · Machine Learning · OCR

Properties

| Name | Type | Description |
| --- | --- | --- |
| DocumentMetadata | object | Metadata about the analyzed document. |
| Blocks | array | The items detected in the document, including text lines, words, tables, and form elements. |
| AnalyzeDocumentModelVersion | string | The version of the model used to analyze the document. |
View JSON Schema on GitHub

JSON Schema

amazon-textract-documentanalysis-schema.json Raw ↑
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "$id": "https://aws.amazon.com/textract/schemas/documentanalysis",
  "title": "DocumentAnalysis",
  "description": "Schema for an Amazon Textract document analysis response containing detected blocks of text, tables, forms, and layout elements extracted from a document.",
  "type": "object",
  "properties": {
    "DocumentMetadata": {
      "type": "object",
      "description": "Metadata about the analyzed document.",
      "properties": {
        "Pages": {
          "type": "integer",
          "description": "The number of pages detected in the document.",
          "minimum": 1
        }
      },
      "required": ["Pages"]
    },
    "Blocks": {
      "type": "array",
      "description": "The items detected in the document, including text lines, words, tables, and form elements.",
      "items": {
        "type": "object",
        "properties": {
          "BlockType": {
            "type": "string",
            "description": "The type of text item detected.",
            "enum": [
              "KEY_VALUE_SET",
              "PAGE",
              "LINE",
              "WORD",
              "TABLE",
              "CELL",
              "SELECTION_ELEMENT",
              "MERGED_CELL",
              "TITLE",
              "QUERY",
              "QUERY_RESULT",
              "SIGNATURE",
              "TABLE_TITLE",
              "TABLE_FOOTER",
              "LAYOUT_TEXT",
              "LAYOUT_TITLE",
              "LAYOUT_HEADER",
              "LAYOUT_FOOTER",
              "LAYOUT_SECTION_HEADER",
              "LAYOUT_PAGE_NUMBER",
              "LAYOUT_LIST",
              "LAYOUT_FIGURE",
              "LAYOUT_TABLE",
              "LAYOUT_KEY_VALUE"
            ]
          },
          "Confidence": {
            "type": "number",
            "description": "The confidence that Amazon Textract has in the accuracy of the detected block.",
            "minimum": 0,
            "maximum": 100
          },
          "Text": {
            "type": "string",
            "description": "The word or line of text that is recognized by Amazon Textract."
          },
          "TextType": {
            "type": "string",
            "description": "The kind of text detected.",
            "enum": ["HANDWRITING", "PRINTED"]
          },
          "RowIndex": {
            "type": "integer",
            "description": "The row in which a table cell is located.",
            "minimum": 1
          },
          "ColumnIndex": {
            "type": "integer",
            "description": "The column in which a table cell appears.",
            "minimum": 1
          },
          "RowSpan": {
            "type": "integer",
            "description": "The number of rows that a table cell spans.",
            "minimum": 1
          },
          "ColumnSpan": {
            "type": "integer",
            "description": "The number of columns that a table cell spans.",
            "minimum": 1
          },
          "Geometry": {
            "type": "object",
            "description": "The location of the detected block on the document page.",
            "properties": {
              "BoundingBox": {
                "type": "object",
                "description": "An axis-aligned bounding box for the detected block.",
                "properties": {
                  "Width": {
                    "type": "number",
                    "description": "The width of the bounding box as a ratio of the overall document page width."
                  },
                  "Height": {
                    "type": "number",
                    "description": "The height of the bounding box as a ratio of the overall document page height."
                  },
                  "Left": {
                    "type": "number",
                    "description": "The left coordinate of the bounding box as a ratio of the overall document page width."
                  },
                  "Top": {
                    "type": "number",
                    "description": "The top coordinate of the bounding box as a ratio of the overall document page height."
                  }
                },
                "required": ["Width", "Height", "Left", "Top"]
              },
              "Polygon": {
                "type": "array",
                "description": "A fine-grained polygon around the detected block.",
                "items": {
                  "type": "object",
                  "properties": {
                    "X": {
                      "type": "number"
                    },
                    "Y": {
                      "type": "number"
                    }
                  },
                  "required": ["X", "Y"]
                }
              }
            }
          },
          "Id": {
            "type": "string",
            "description": "The identifier for the recognized text block."
          },
          "Relationships": {
            "type": "array",
            "description": "A list of relationship objects that describe how blocks are related to each other.",
            "items": {
              "type": "object",
              "properties": {
                "Type": {
                  "type": "string",
                  "description": "The type of relationship.",
                  "enum": ["VALUE", "CHILD", "COMPLEX_FEATURES", "MERGED_CELL", "TITLE", "ANSWER", "TABLE", "TABLE_TITLE", "TABLE_FOOTER"]
                },
                "Ids": {
                  "type": "array",
                  "description": "An array of IDs for related blocks.",
                  "items": {
                    "type": "string"
                  }
                }
              },
              "required": ["Type", "Ids"]
            }
          },
          "EntityTypes": {
            "type": "array",
            "description": "The entity types that apply to the block, such as KEY or VALUE.",
            "items": {
              "type": "string",
              "enum": ["KEY", "VALUE", "COLUMN_HEADER", "TABLE_TITLE", "TABLE_FOOTER", "TABLE_SECTION_TITLE", "TABLE_SUMMARY", "STRUCTURED_TABLE", "SEMI_STRUCTURED_TABLE"]
            }
          },
          "SelectionStatus": {
            "type": "string",
            "description": "The selection status of a selection element.",
            "enum": ["SELECTED", "NOT_SELECTED"]
          },
          "Page": {
            "type": "integer",
            "description": "The page on which the block was detected.",
            "minimum": 1
          },
          "Query": {
            "type": "object",
            "description": "The query that was used for query-based analysis.",
            "properties": {
              "Text": {
                "type": "string"
              },
              "Alias": {
                "type": "string"
              },
              "Pages": {
                "type": "array",
                "items": {
                  "type": "string"
                }
              }
            },
            "required": ["Text"]
          }
        },
        "required": ["BlockType"]
      }
    },
    "AnalyzeDocumentModelVersion": {
      "type": "string",
      "description": "The version of the model used to analyze the document."
    }
  },
  "required": ["DocumentMetadata", "Blocks"]
}