Dataiku · Schema
Dataiku DSS Dataset
A dataset in Dataiku DSS representing a structured data source or output, including its schema, connection parameters, format configuration, and flow integration settings.
Analytics · Artificial Intelligence · Data Platform · Data Science · Machine Learning
Properties
| Name | Type | Description |
|---|---|---|
| projectKey | string | Project key that this dataset belongs to |
| name | string | Dataset name, unique within the project |
| type | string | Type of the dataset, determining the underlying storage or connection |
| managed | boolean | Whether the dataset is managed by DSS (output of a recipe) or external |
| schema | object | Dataset schema defining columns and their types |
| formatType | string | Data format for file-based datasets |
| formatParams | object | Format-specific parameters |
| params | object | Type-specific connection and access parameters |
| partitioning | object | Partitioning configuration for the dataset |
| flowOptions | object | Options for how this dataset behaves in the flow |
| metrics | object | Metrics computation settings |
| checks | object | Data quality check settings |
| creationTag | object | Version tracking information |
| versionTag | object | Version tracking information |
JSON Schema
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://doc.dataiku.com/schemas/dataiku/dataset.json",
"title": "Dataiku DSS Dataset",
"description": "A dataset in Dataiku DSS representing a structured data source or output, including its schema, connection parameters, format configuration, and flow integration settings.",
"type": "object",
"required": ["name", "type"],
"properties": {
"projectKey": {
"type": "string",
"description": "Project key that this dataset belongs to"
},
"name": {
"type": "string",
"description": "Dataset name, unique within the project",
"minLength": 1,
"maxLength": 256
},
"type": {
"type": "string",
"enum": [
"Filesystem",
"UploadedFiles",
"PostgreSQL",
"MySQL",
"Oracle",
"SQLServer",
"Redshift",
"BigQuery",
"Snowflake",
"Synapse",
"Teradata",
"S3",
"GCS",
"Azure",
"HDFS",
"Hive",
"MongoDB",
"Elasticsearch",
"Cassandra",
"HTTP",
"FTP",
"SCP",
"Twitter",
"Inline",
"StatsDB",
"JobsDB",
"JDBC"
],
"description": "Type of the dataset, determining the underlying storage or connection"
},
"managed": {
"type": "boolean",
"description": "Whether the dataset is managed by DSS (output of a recipe) or external"
},
"schema": {
"$ref": "#/$defs/Schema"
},
"formatType": {
"type": "string",
"enum": ["csv", "parquet", "json", "avro", "orc", "excel", "xml"],
"description": "Data format for file-based datasets"
},
"formatParams": {
"type": "object",
"description": "Format-specific parameters",
"properties": {
"separator": {
"type": "string",
"description": "Column separator for CSV files"
},
"style": {
"type": "string",
"description": "CSV style (e.g., excel, unix, escaped)"
},
"quoteChar": {
"type": "string",
"description": "Quote character for CSV"
},
"escapeChar": {
"type": "string",
"description": "Escape character for CSV"
},
"parseHeaderRow": {
"type": "boolean",
"description": "Whether to parse the first row as header"
},
"charset": {
"type": "string",
"description": "Character encoding (e.g., utf8, latin1)"
},
"compress": {
"type": "string",
"description": "Compression type (e.g., gz, bz2, snappy)"
}
}
},
"params": {
"type": "object",
"description": "Type-specific connection and access parameters",
"properties": {
"connection": {
"type": "string",
"description": "DSS connection name for database-backed datasets"
},
"table": {
"type": "string",
"description": "Database table name"
},
"schema": {
"type": "string",
"description": "Database schema name"
},
"catalog": {
"type": "string",
"description": "Database catalog name"
},
"path": {
"type": "string",
"description": "File path for file-based datasets"
},
"bucket": {
"type": "string",
"description": "Cloud storage bucket name"
},
"notReadyIfEmpty": {
"type": "boolean",
"description": "Consider dataset not ready if it contains no data"
}
}
},
"partitioning": {
"$ref": "#/$defs/Partitioning"
},
"flowOptions": {
"type": "object",
"description": "Options for how this dataset behaves in the flow",
"properties": {
"virtualizable": {
"type": "boolean",
"description": "Whether the dataset can be virtualized"
},
"rebuildBehavior": {
"type": "string",
"enum": ["NORMAL", "WRITE_PROTECTED", "NO_REBUILD"],
"description": "Rebuild behavior for the dataset"
},
"crossProjectBuildBehavior": {
"type": "string",
"enum": ["DEFAULT", "BUILD", "NO_BUILD"],
"description": "Build behavior when accessed from another project"
}
}
},
"metrics": {
"$ref": "#/$defs/MetricsSettings"
},
"checks": {
"$ref": "#/$defs/ChecksSettings"
},
"creationTag": {
"$ref": "#/$defs/VersionTag"
},
"versionTag": {
"$ref": "#/$defs/VersionTag"
}
},
"$defs": {
"Schema": {
"type": "object",
"description": "Dataset schema defining columns and their types",
"properties": {
"columns": {
"type": "array",
"items": {
"$ref": "#/$defs/Column"
},
"description": "Ordered list of columns"
},
"userModified": {
"type": "boolean",
"description": "Whether the schema was manually modified by a user"
}
}
},
"Column": {
"type": "object",
"description": "A column in a dataset schema",
"required": ["name", "type"],
"properties": {
"name": {
"type": "string",
"description": "Column name"
},
"type": {
"type": "string",
"enum": [
"string",
"bigint",
"int",
"smallint",
"tinyint",
"double",
"float",
"boolean",
"date",
"array",
"map",
"object",
"geopoint",
"geometry"
],
"description": "Column data type"
},
"meaning": {
"type": "string",
"description": "Semantic meaning assigned to this column (e.g., Email, URL, IPAddress)"
},
"maxLength": {
"type": "integer",
"minimum": -1,
"description": "Maximum length for string columns (-1 for unlimited)"
},
"comment": {
"type": "string",
"description": "Documentation comment for the column"
}
}
},
"Partitioning": {
"type": "object",
"description": "Partitioning configuration for the dataset",
"properties": {
"dimensions": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "Partition dimension name"
},
"type": {
"type": "string",
"enum": ["value", "time"],
"description": "Partition dimension type"
},
"params": {
"type": "object",
"description": "Dimension-specific parameters",
"properties": {
"period": {
"type": "string",
"enum": ["YEAR", "MONTH", "DAY", "HOUR"],
"description": "Time period for time-based partitions"
}
}
}
}
},
"description": "Partition dimensions"
},
"filePathPattern": {
"type": "string",
"description": "File path pattern for file-based partitioned datasets"
}
}
},
"MetricsSettings": {
"type": "object",
"description": "Metrics computation settings",
"properties": {
"probes": {
"type": "array",
"items": {
"type": "object",
"properties": {
"type": {
"type": "string",
"description": "Metric probe type"
},
"enabled": {
"type": "boolean",
"description": "Whether this probe is enabled"
},
"computeOnBuildMode": {
"type": "string",
"enum": ["NO", "PARTITION", "WHOLE_DATASET"],
"description": "When to compute the metric during builds"
}
}
}
},
"displayedState": {
"type": "object",
"description": "Display state for metrics"
}
}
},
"ChecksSettings": {
"type": "object",
"description": "Data quality check settings",
"properties": {
"checks": {
"type": "array",
"items": {
"type": "object",
"properties": {
"type": {
"type": "string",
"description": "Check type"
},
"name": {
"type": "string",
"description": "Check name"
},
"meta": {
"type": "object",
"description": "Check metadata"
},
"params": {
"type": "object",
"description": "Check parameters"
}
}
}
}
}
},
"VersionTag": {
"type": "object",
"description": "Version tracking information",
"properties": {
"versionNumber": {
"type": "integer",
"minimum": 0,
"description": "Sequential version number"
},
"lastModifiedBy": {
"type": "object",
"properties": {
"login": {
"type": "string",
"description": "Login of the user who made the modification"
}
}
},
"lastModifiedOn": {
"type": "string",
"format": "date-time",
"description": "Timestamp of the last modification"
}
}
}
}
}