Dataiku · Schema

Dataiku DSS Dataset

A dataset in Dataiku DSS representing a structured data source or output, including its schema, connection parameters, format configuration, and flow integration settings.

Analytics, Artificial Intelligence, Data Platform, Data Science, Machine Learning

Properties

Name Type Description
projectKey string Project key that this dataset belongs to
name string Dataset name, unique within the project
type string Type of the dataset, determining the underlying storage or connection
managed boolean Whether the dataset is managed by DSS (output of a recipe) or external
schema object Dataset schema defining columns and their types
formatType string Data format for file-based datasets
formatParams object Format-specific parameters
params object Type-specific connection and access parameters
partitioning object Partitioning configuration for the dataset
flowOptions object Options for how this dataset behaves in the flow
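metrics object Metrics computation settings
checks object Data quality check settings
creationTag object Version tracking information
versionTag object Version tracking information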
View JSON Schema on GitHub

JSON Schema

dataiku-dataset-schema.json Raw ↑
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://doc.dataiku.com/schemas/dataiku/dataset.json",
  "title": "Dataiku DSS Dataset",
  "description": "A dataset in Dataiku DSS representing a structured data source or output, including its schema, connection parameters, format configuration, and flow integration settings.",
  "type": "object",
  "required": ["name", "type"],
  "properties": {
    "projectKey": {
      "type": "string",
      "description": "Project key that this dataset belongs to"
    },
    "name": {
      "type": "string",
      "description": "Dataset name, unique within the project",
      "minLength": 1,
      "maxLength": 256
    },
    "type": {
      "type": "string",
      "enum": [
        "Filesystem",
        "UploadedFiles",
        "PostgreSQL",
        "MySQL",
        "Oracle",
        "SQLServer",
        "Redshift",
        "BigQuery",
        "Snowflake",
        "Synapse",
        "Teradata",
        "S3",
        "GCS",
        "Azure",
        "HDFS",
        "Hive",
        "MongoDB",
        "Elasticsearch",
        "Cassandra",
        "HTTP",
        "FTP",
        "SCP",
        "Twitter",
        "Inline",
        "StatsDB",
        "JobsDB",
        "JDBC"
      ],
      "description": "Type of the dataset, determining the underlying storage or connection"
    },
    "managed": {
      "type": "boolean",
      "description": "Whether the dataset is managed by DSS (output of a recipe) or external"
    },
    "schema": {
      "$ref": "#/$defs/Schema"
    },
    "formatType": {
      "type": "string",
      "enum": ["csv", "parquet", "json", "avro", "orc", "excel", "xml"],
      "description": "Data format for file-based datasets"
    },
    "formatParams": {
      "type": "object",
      "description": "Format-specific parameters",
      "properties": {
        "separator": {
          "type": "string",
          "description": "Column separator for CSV files"
        },
        "style": {
          "type": "string",
          "description": "CSV style (e.g., excel, unix, escaped)"
        },
        "quoteChar": {
          "type": "string",
          "description": "Quote character for CSV"
        },
        "escapeChar": {
          "type": "string",
          "description": "Escape character for CSV"
        },
        "parseHeaderRow": {
          "type": "boolean",
          "description": "Whether to parse the first row as header"
        },
        "charset": {
          "type": "string",
          "description": "Character encoding (e.g., utf8, latin1)"
        },
        "compress": {
          "type": "string",
          "description": "Compression type (e.g., gz, bz2, snappy)"
        }
      }
    },
    "params": {
      "type": "object",
      "description": "Type-specific connection and access parameters",
      "properties": {
        "connection": {
          "type": "string",
          "description": "DSS connection name for database-backed datasets"
        },
        "table": {
          "type": "string",
          "description": "Database table name"
        },
        "schema": {
          "type": "string",
          "description": "Database schema name"
        },
        "catalog": {
          "type": "string",
          "description": "Database catalog name"
        },
        "path": {
          "type": "string",
          "description": "File path for file-based datasets"
        },
        "bucket": {
          "type": "string",
          "description": "Cloud storage bucket name"
        },
        "notReadyIfEmpty": {
          "type": "boolean",
          "description": "Consider dataset not ready if it contains no data"
        }
      }
    },
    "partitioning": {
      "$ref": "#/$defs/Partitioning"
    },
    "flowOptions": {
      "type": "object",
      "description": "Options for how this dataset behaves in the flow",
      "properties": {
        "virtualizable": {
          "type": "boolean",
          "description": "Whether the dataset can be virtualized"
        },
        "rebuildBehavior": {
          "type": "string",
          "enum": ["NORMAL", "WRITE_PROTECTED", "NO_REBUILD"],
          "description": "Rebuild behavior for the dataset"
        },
        "crossProjectBuildBehavior": {
          "type": "string",
          "enum": ["DEFAULT", "BUILD", "NO_BUILD"],
          "description": "Build behavior when accessed from another project"
        }
      }
    },
    "metrics": {
      "$ref": "#/$defs/MetricsSettings"
    },
    "checks": {
      "$ref": "#/$defs/ChecksSettings"
    },
    "creationTag": {
      "$ref": "#/$defs/VersionTag"
    },
    "versionTag": {
      "$ref": "#/$defs/VersionTag"
    }
  },
  "$defs": {
    "Schema": {
      "type": "object",
      "description": "Dataset schema defining columns and their types",
      "properties": {
        "columns": {
          "type": "array",
          "items": {
            "$ref": "#/$defs/Column"
          },
          "description": "Ordered list of columns"
        },
        "userModified": {
          "type": "boolean",
          "description": "Whether the schema was manually modified by a user"
        }
      }
    },
    "Column": {
      "type": "object",
      "description": "A column in a dataset schema",
      "required": ["name", "type"],
      "properties": {
        "name": {
          "type": "string",
          "description": "Column name"
        },
        "type": {
          "type": "string",
          "enum": [
            "string",
            "bigint",
            "int",
            "smallint",
            "tinyint",
            "double",
            "float",
            "boolean",
            "date",
            "array",
            "map",
            "object",
            "geopoint",
            "geometry"
          ],
          "description": "Column data type"
        },
        "meaning": {
          "type": "string",
          "description": "Semantic meaning assigned to this column (e.g., Email, URL, IPAddress)"
        },
        "maxLength": {
          "type": "integer",
          "minimum": -1,
          "description": "Maximum length for string columns (-1 for unlimited)"
        },
        "comment": {
          "type": "string",
          "description": "Documentation comment for the column"
        }
      }
    },
    "Partitioning": {
      "type": "object",
      "description": "Partitioning configuration for the dataset",
      "properties": {
        "dimensions": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "name": {
                "type": "string",
                "description": "Partition dimension name"
              },
              "type": {
                "type": "string",
                "enum": ["value", "time"],
                "description": "Partition dimension type"
              },
              "params": {
                "type": "object",
                "description": "Dimension-specific parameters",
                "properties": {
                  "period": {
                    "type": "string",
                    "enum": ["YEAR", "MONTH", "DAY", "HOUR"],
                    "description": "Time period for time-based partitions"
                  }
                }
              }
            }
          },
          "description": "Partition dimensions"
        },
        "filePathPattern": {
          "type": "string",
          "description": "File path pattern for file-based partitioned datasets"
        }
      }
    },
    "MetricsSettings": {
      "type": "object",
      "description": "Metrics computation settings",
      "properties": {
        "probes": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "type": {
                "type": "string",
                "description": "Metric probe type"
              },
              "enabled": {
                "type": "boolean",
                "description": "Whether this probe is enabled"
              },
              "computeOnBuildMode": {
                "type": "string",
                "enum": ["NO", "PARTITION", "WHOLE_DATASET"],
                "description": "When to compute the metric during builds"
              }
            }
          }
        },
        "displayedState": {
          "type": "object",
          "description": "Display state for metrics"
        }
      }
    },
    "ChecksSettings": {
      "type": "object",
      "description": "Data quality check settings",
      "properties": {
        "checks": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "type": {
                "type": "string",
                "description": "Check type"
              },
              "name": {
                "type": "string",
                "description": "Check name"
              },
              "meta": {
                "type": "object",
                "description": "Check metadata"
              },
              "params": {
                "type": "object",
                "description": "Check parameters"
              }
            }
          }
        }
      }
    },
    "VersionTag": {
      "type": "object",
      "description": "Version tracking information",
      "properties": {
        "versionNumber": {
          "type": "integer",
          "minimum": 0,
          "description": "Sequential version number"
        },
        "lastModifiedBy": {
          "type": "object",
          "properties": {
            "login": {
              "type": "string",
              "description": "Login of the user who made the modification"
            }
          }
        },
        "lastModifiedOn": {
          "type": "string",
          "format": "date-time",
          "description": "Timestamp of the last modification"
        }
      }
    }
  }
}