Datafold · Schema
Datafold API Schemas
JSON Schema definitions extracted from the Datafold OpenAPI specification
Data Reliability · Data Diff · Data Quality · Column-Level Lineage · Data Pipelines · CI/CD Integration · Anomaly Detection · Data Observability · Data Migrations
JSON Schema
{
"$schema": "https://json-schema.org/draft/2020-12",
"title": "Datafold API Schemas",
"description": "JSON Schema definitions extracted from the Datafold OpenAPI specification",
"schemas": {
"APIPrDetails": {
"properties": {
"base_branch": {
"title": "Base Branch",
"type": "string"
},
"base_sha": {
"title": "Base Sha",
"type": "string"
},
"pr_branch": {
"title": "Pr Branch",
"type": "string"
},
"pr_number": {
"title": "Pr Number",
"type": "integer"
},
"pr_sha": {
"title": "Pr Sha",
"type": "string"
}
},
"required": [
"pr_number",
"pr_branch",
"base_branch",
"pr_sha",
"base_sha"
],
"title": "APIPrDetails",
"type": "object"
},
"AWSS3Config": {
"properties": {
"bucket_name": {
"title": "Bucket Name",
"type": "string"
},
"key_id": {
"anyOf": [
{
"maxLength": 1024,
"type": "string"
},
{
"type": "null"
}
],
"title": "Key Id"
},
"materialize_max_rows": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"title": "Materialize Max Rows"
},
"materialize_path": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Materialize Path"
},
"region": {
"title": "Region",
"type": "string"
},
"secret": {
"anyOf": [
{
"format": "password",
"type": "string",
"writeOnly": true
},
{
"type": "null"
}
],
"title": "Secret"
}
},
"required": [
"bucket_name",
"key_id",
"region"
],
"title": "AWSS3Config",
"type": "object"
},
"AbsoluteColumnTolerance": {
"properties": {
"type": {
"const": "absolute",
"default": "absolute",
"description": "The type of Column Tolerance.",
"title": "Type",
"type": "string"
},
"value": {
"description": "Value of Column Tolerance.",
"title": "Value",
"type": "number"
}
},
"required": [
"value"
],
"title": "Absolute",
"type": "object"
},
"AbsoluteThreshold": {
"properties": {
"max": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"description": "Maximum value for the absolute threshold.",
"title": "Max"
},
"min": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"description": "Minimum value for the absolute threshold.",
"title": "Min"
},
"type": {
"const": "absolute",
"title": "Type",
"type": "string"
}
},
"required": [
"type"
],
"title": "Absolute",
"type": "object"
},
"AnomalyDetectionThreshold": {
"properties": {
"sensitivity": {
"description": "Sensitivity level for anomaly detection, ranging from 0 to 100.",
"maximum": 100.0,
"minimum": 0.0,
"title": "Sensitivity",
"type": "integer"
},
"type": {
"const": "automatic",
"title": "Type",
"type": "string"
}
},
"required": [
"type",
"sensitivity"
],
"title": "Anomaly Detection",
"type": "object"
},
"ApiCIDependency": {
"properties": {
"data_source_id": {
"title": "Data Source Id",
"type": "integer"
},
"data_source_type": {
"title": "Data Source Type",
"type": "string"
},
"item_type": {
"title": "Item Type",
"type": "string"
},
"name": {
"title": "Name",
"type": "string"
},
"path": {
"items": {
"type": "string"
},
"title": "Path",
"type": "array"
},
"popularity": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"title": "Popularity"
},
"primary_key": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Primary Key"
},
"query_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Query Type"
},
"raw_sql": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Raw Sql"
},
"remote_id": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Remote Id"
},
"table_name": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Table Name"
},
"uid": {
"title": "Uid",
"type": "string"
}
},
"required": [
"uid",
"item_type",
"name",
"path",
"data_source_id",
"data_source_type"
],
"title": "ApiCIDependency",
"type": "object"
},
"ApiCiRun": {
"properties": {
"base_branch": {
"title": "Base Branch",
"type": "string"
},
"base_sha": {
"title": "Base Sha",
"type": "string"
},
"id": {
"title": "Id",
"type": "integer"
},
"pr_branch": {
"title": "Pr Branch",
"type": "string"
},
"pr_num": {
"title": "Pr Num",
"type": "string"
},
"pr_sha": {
"title": "Pr Sha",
"type": "string"
},
"source": {
"title": "Source",
"type": "string"
},
"status": {
"title": "Status",
"type": "string"
}
},
"required": [
"id",
"base_branch",
"base_sha",
"pr_branch",
"pr_sha",
"pr_num",
"status",
"source"
],
"title": "ApiCiRun",
"type": "object"
},
"ApiColumnDiffStat": {
"properties": {
"column_name": {
"title": "Column Name",
"type": "string"
},
"match": {
"description": "Percentage of cells that matched (0-100)",
"title": "Match",
"type": "number"
},
"values_different": {
"description": "Count of different cells observed in the sample",
"title": "Values Different",
"type": "integer"
},
"values_different_estimate": {
"anyOf": [
{
"$ref": "#/components/schemas/ApiEstimatedIntValue"
},
{
"type": "null"
}
],
"description": "Statistical estimate of differences for the full dataset. Only present when sampling was applied. Includes observed value, extrapolated estimate, confidence level, and confidence interval bounds."
}
},
"required": [
"column_name",
"match",
"values_different"
],
"title": "ApiColumnDiffStat",
"type": "object"
},
"ApiColumnProfileStats": {
"properties": {
"distinct": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"description": "Approximate distinct value count",
"title": "Distinct"
},
"histogram_bins": {
"default": 0,
"description": "Number of histogram bins available (0 if no histogram)",
"title": "Histogram Bins",
"type": "integer"
},
"max": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"description": "Maximum value (formatted as string)",
"title": "Max"
},
"min": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"description": "Minimum value (formatted as string)",
"title": "Min"
},
"nulls": {
"description": "Number of NULL values",
"title": "Nulls",
"type": "integer"
},
"top_values_count": {
"default": 0,
"description": "Number of top enum values available (0 if no enums)",
"title": "Top Values Count",
"type": "integer"
},
"total": {
"description": "Total number of rows in exclusive set",
"title": "Total",
"type": "integer"
}
},
"required": [
"total",
"nulls"
],
"title": "ApiColumnProfileStats",
"type": "object"
},
"ApiColumnRule": {
"properties": {
"column": {
"anyOf": [
{
"minLength": 1,
"type": "string"
},
{
"type": "null"
}
],
"description": "Column name the rule applies to. When column remapping is enabled, this refers to the dataset-A (canonical) column name \u2014 the same name the comparator operates on after remapping. Mutually exclusive with `type`. Exactly one of `column` or `type` must be set.",
"title": "Column"
},
"equal_if": {
"description": "SQL boolean expression OR-ed into the value comparator for the matching column(s). Use {a} and {b} as placeholders for the side-A and side-B column references. Multiple rules matching the same column are all OR-ed together.",
"title": "Equal If",
"type": "string"
},
"type": {
"anyOf": [
{
"$ref": "#/components/schemas/DatatypeName"
},
{
"type": "null"
}
],
"description": "Type-based selector \u2014 matches all columns whose Datatype.typename equals this value. The rule is applied to every column of that type after data lands in DuckDB post-fetch. Mutually exclusive with `column`."
}
},
"required": [
"equal_if"
],
"title": "ApiColumnRule",
"type": "object"
},
"ApiCrossDataDiffSummaryForDone": {
"properties": {
"pks": {
"anyOf": [
{
"$ref": "#/components/schemas/ApiDataDiffSummaryPKs"
},
{
"type": "null"
}
]
},
"status": {
"enum": [
"done",
"success"
],
"title": "Status",
"type": "string"
},
"values": {
"anyOf": [
{
"$ref": "#/components/schemas/ApiDataDiffSummaryValues"
},
{
"type": "null"
}
]
}
},
"required": [
"status"
],
"title": "ApiCrossDataDiffSummaryForDone",
"type": "object"
},
"ApiDataDiffCancelled": {
"properties": {
"id": {
"title": "Id",
"type": "integer"
},
"status": {
"$ref": "#/components/schemas/JobStatus"
}
},
"required": [
"id",
"status"
],
"title": "ApiDataDiffCancelled",
"type": "object"
},
"ApiDataDiffData": {
"properties": {
"archived": {
"default": false,
"title": "Archived",
"type": "boolean"
},
"bisection_factor": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"title": "Bisection Factor"
},
"bisection_threshold": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"title": "Bisection Threshold"
},
"column_mapping": {
"anyOf": [
{
"items": {
"maxItems": 2,
"minItems": 2,
"prefixItems": [
{
"type": "string"
},
{
"type": "string"
}
],
"type": "array"
},
"type": "array"
},
{
"type": "null"
}
],
"description": "Map columns with different names between datasets. List of [column_in_A, column_in_B] pairs.",
"title": "Column Mapping"
},
"column_rules": {
"anyOf": [
{
"items": {
"$ref": "#/components/schemas/ApiColumnRule"
},
"type": "array"
},
{
"type": "null"
}
],
"description": "Extra equality clauses that extend the in-memory diff value comparator. Useful for treating specific value pairs (e.g. legacy sentinels vs NULL) as non-differences. See `ApiColumnRule` for entry shape.",
"title": "Column Rules"
},
"columns_to_compare": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"description": "Columns to compare between datasets. If set, only these columns are diffed (primary key columns are always included). Column names must match the dataset schema.",
"title": "Columns To Compare"
},
"compare_duplicates": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"description": "Compare rows with duplicate primary keys. Defaults to true.",
"title": "Compare Duplicates"
},
"data_source1_id": {
"description": "ID of the first data source (Dataset A).",
"title": "Data Source1 Id",
"type": "integer"
},
"data_source1_session_parameters": {
"anyOf": [
{
"additionalProperties": true,
"type": "object"
},
{
"type": "null"
}
],
"description": "Snowflake session parameters for Dataset A, e.g. {\"QUERY_TAG\": \"datadiff\", \"WAREHOUSE\": \"COMPUTE_WH\"}.",
"title": "Data Source1 Session Parameters"
},
"data_source2_id": {
"description": "ID of the second data source (Dataset B). Can be the same as data_source1_id.",
"title": "Data Source2 Id",
"type": "integer"
},
"data_source2_session_parameters": {
"anyOf": [
{
"additionalProperties": true,
"type": "object"
},
{
"type": "null"
}
],
"description": "Snowflake session parameters for Dataset B.",
"title": "Data Source2 Session Parameters"
},
"datetime_tolerance": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"description": "Datetime precision for comparison. 0=seconds, 1=tenths, 2=hundredths, 3=milliseconds, 4=tenth-ms, 5=hundredth-ms, 6=microseconds.",
"title": "Datetime Tolerance"
},
"diff_tolerance": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"description": "Default tolerance for float comparisons. In absolute mode: values within this distance are equal. In relative mode: fraction of difference allowed.",
"title": "Diff Tolerance"
},
"diff_tolerances_per_column": {
"anyOf": [
{
"items": {
"$ref": "#/components/schemas/ColumnTolerance"
},
"type": "array"
},
{
"type": "null"
}
],
"description": "Per-column tolerance overrides. Each entry: {column_name, tolerance_value (>= 0), tolerance_mode: 'absolute'|'relative'}.",
"title": "Diff Tolerances Per Column"
},
"download_limit": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"title": "Download Limit"
},
"exclude_columns": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"description": "Columns to exclude from comparison. Ignored if include_columns is set.",
"title": "Exclude Columns"
},
"file1": {
"anyOf": [
{
"format": "uri",
"minLength": 1,
"type": "string"
},
{
"type": "null"
}
],
"description": "File URL for Dataset A (s3://, gs://, abfss://, https://). Mutually exclusive with table1 and query1. Requires file1_options.",
"title": "File1"
},
"file1_options": {
"anyOf": [
{
"discriminator": {
"mapping": {
"csv": "#/components/schemas/CSVFileOptions",
"excel": "#/components/schemas/ExcelFileOptions",
"parquet": "#/components/schemas/ParquetFileOptions"
},
"propertyName": "file_type"
},
"oneOf": [
{
"$ref": "#/components/schemas/CSVFileOptions"
},
{
"$ref": "#/components/schemas/ExcelFileOptions"
},
{
"$ref": "#/components/schemas/ParquetFileOptions"
}
]
},
{
"type": "null"
}
],
"description": "File format options for file1 (file_type, delimiter, sheet, skip rows).",
"title": "File1 Options"
},
"file2": {
"anyOf": [
{
"format": "uri",
"minLength": 1,
"type": "string"
},
{
"type": "null"
}
],
"description": "File URL for Dataset B (s3://, gs://, abfss://, https://). Mutually exclusive with table2 and query2. Requires file2_options.",
"title": "File2"
},
"file2_options": {
"anyOf": [
{
"discriminator": {
"mapping": {
"csv": "#/components/schemas/CSVFileOptions",
"excel": "#/components/schemas/ExcelFileOptions",
"parquet": "#/components/schemas/ParquetFileOptions"
},
"propertyName": "file_type"
},
"oneOf": [
{
"$ref": "#/components/schemas/CSVFileOptions"
},
{
"$ref": "#/components/schemas/ExcelFileOptions"
},
{
"$ref": "#/components/schemas/ParquetFileOptions"
}
]
},
{
"type": "null"
}
],
"description": "File format options for file2 (file_type, delimiter, sheet, skip rows).",
"title": "File2 Options"
},
"filter1": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"description": "SQL WHERE clause for Dataset A (omit the WHERE keyword), e.g. 'status = 1'.",
"title": "Filter1"
},
"filter2": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"description": "SQL WHERE clause for Dataset B (omit the WHERE keyword), e.g. 'status = 1'.",
"title": "Filter2"
},
"include_columns": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"description": "Explicit list of columns to compare. If set, only these columns are diffed.",
"title": "Include Columns"
},
"infer_pk": {
"default": false,
"description": "Infer primary key columns automatically before running the diff. When true, pk_columns may be omitted; the inferred PK is stored back on the diff. Supported configurations: in-db diffs (all input types); cross-db diffs with query inputs on both sides (PK is inferred inside DuckDB after fetch). Not supported: cross-db with table inputs, or any file-based diff \u2014 use POST /api/v1/data_sources/{id}/guess_pk and pass pk_columns explicitly. For in-db query inputs, materialization is auto-forced.",
"title": "Infer Pk",
"type": "boolean"
},
"infer_pk_avoid_names": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"description": "Column names to skip during PK inference. Matched case-insensitively against actual column names (exact match, no glob/regex). Requires infer_pk=true and pk_columns empty; rejected with 422 otherwise.",
"title": "Infer Pk Avoid Names"
},
"infer_pk_avoid_types": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"description": "Extra column types to skip during PK inference, on top of the built-in defaults (float, timestamp, array, boolean, binary, unsupported). Use canonical type names (e.g. 'integer', 'text') or 'db:<RAW_TYPE>' to target a raw warehouse type (e.g. 'db:JSONB'). Requires infer_pk=true and pk_columns empty; rejected with 422 otherwise.",
"title": "Infer Pk Avoid Types"
},
"materialization_destination_id": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"description": "Data source ID where materialized diff results are stored.",
"title": "Materialization Destination Id"
},
"materialize_dataset1": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"description": "Materialize Dataset A before diffing. Improves speed for heavy queries, filtered non-indexed columns, or transformed primary keys.",
"title": "Materialize Dataset1"
},
"materialize_dataset2": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"description": "Materialize Dataset B before diffing. Same use cases as materialize_dataset1.",
"title": "Materialize Dataset2"
},
"materialize_without_sampling": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Skip sampling when materializing results.",
"title": "Materialize Without Sampling"
},
"per_column_diff_limit": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"title": "Per Column Diff Limit"
},
"pk_columns": {
"description": "Column names that uniquely identify rows, e.g. ['id'] or ['tenant_id', 'order_id']. Must match actual column names in both datasets. Leave empty and set infer_pk=True to auto-detect.",
"items": {
"type": "string"
},
"title": "Pk Columns",
"type": "array"
},
"purged": {
"default": false,
"title": "Purged",
"type": "boolean"
},
"query1": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"description": "SQL query for Dataset A. Mutually exclusive with table1 and file1.",
"title": "Query1"
},
"query2": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"description": "SQL query for Dataset B. Mutually exclusive with table2 and file2.",
"title": "Query2"
},
"run_profiles": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"description": "Run column profiling on diff results.",
"title": "Run Profiles"
},
"sampling_confidence": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"description": "Sampling confidence level, between 0 and 100 exclusive. Common values: 90, 95, 99, 99.5, 99.9. Use with sampling_tolerance.",
"title": "Sampling Confidence"
},
"sampling_max_rows": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"description": "Maximum number of rows to sample (absolute count). Alternative to tolerance+confidence and sampling_ratio.",
"title": "Sampling Max Rows"
},
"sampling_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"description": "Sample this fraction of rows. Value between 0 and 1 exclusive (e.g. 0.1 = 10% of rows). Alternative to tolerance+confidence.",
"title": "Sampling Ratio"
},
"sampling_threshold": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"description": "Minimum row count to activate sampling. Sampling is disabled if the largest table has fewer rows than this.",
"title": "Sampling Threshold"
},
"sampling_tolerance": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"description": "Sampling tolerance: max fraction of rows with PK errors before sampling is disabled. Value between 0 and 1 exclusive (e.g. 0.001 = 0.1%). Use with sampling_confidence.",
"title": "Sampling Tolerance"
},
"sort_array_columns": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"description": "Sort array elements before comparison to ignore insertion order differences.",
"title": "Sort Array Columns"
},
"table1": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"description": "Table path for Dataset A as a list of path components, e.g. ['schema', 'table'] or ['database', 'schema', 'table']. Mutually exclusive with query1 and file1.",
"title": "Table1"
},
"table2": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"description": "Table path for Dataset B as a list of path components, e.g. ['schema', 'table'] or ['database', 'schema', 'table']. Mutually exclusive with query2 and file2.",
"title": "Table2"
},
"table_modifiers": {
"anyOf": [
{
"items": {
"$ref": "#/components/schemas/TableModifiers"
},
"type": "array"
},
{
"type": "null"
}
],
"description": "Table-level modifiers. Allowed values: 'case_insensitive_strings' (ignore string case), 'null_equals_empty_string' (treat NULL and empty string as equal when one warehouse uses NULL and another uses '' for missing text).",
"title": "Table Modifiers"
},
"tags": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"description": "Tags for organizing and filtering diffs.",
"title": "Tags"
},
"time_aggregate": {
"anyOf": [
{
"$ref": "#/components/schemas/TimeAggregateEnum"
},
{
"type": "null"
}
],
"description": "Time aggregation level when using time_column."
},
"time_column": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"description": "Column name used for time-based filtering or aggregation.",
"title": "Time Column"
},
"time_interval_end": {
"anyOf": [
{
"format": "date-time",
"type": "string"
},
{
"type": "null"
}
],
"title": "Time Interval End"
},
"time_interval_start": {
"anyOf": [
{
"format": "date-time",
"type": "string"
},
{
"type": "null"
}
],
"title": "Time Interval Start"
},
"time_travel_point1": {
"anyOf": [
{
# --- truncated at 32 KB (479 KB total) ---
# Full source: https://raw.githubusercontent.com/api-evangelist/datafold/refs/heads/main/json-schema/datafold-schemas.json