Hugging Face · Schema

Hugging Face Dataset

Schema for a dataset hosted on the Hugging Face Hub, including metadata, structure, splits, and repository information.

Properties

Name Type Description
_id string Internal unique identifier for the dataset
id string Dataset repository ID in the format author/dataset-name or dataset-name
author string Author or organization that owns the dataset
sha string Latest Git commit SHA of the dataset repository
lastModified string Timestamp of the last modification
createdAt string Timestamp when the dataset was created
private boolean Whether the dataset is private
disabled boolean Whether the dataset has been disabled
gated object Access gating configuration
tags array Tags associated with the dataset
downloads integer Number of downloads in the last 30 days
likes integer Number of likes/favorites
description string Short description of the dataset
citation string Citation text for the dataset (BibTeX format)
siblings array Files in the dataset repository
cardData object Parsed metadata from the dataset card YAML front matter
View JSON Schema on GitHub

JSON Schema

hugging-face-dataset-schema.json Raw ↑
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://huggingface.co/schemas/dataset.json",
  "title": "Hugging Face Dataset",
  "description": "Schema for a dataset hosted on the Hugging Face Hub, including metadata, structure, splits, and repository information.",
  "type": "object",
  "required": [
    "id"
  ],
  "properties": {
    "_id": {
      "type": "string",
      "description": "Internal unique identifier for the dataset"
    },
    "id": {
      "type": "string",
      "description": "Dataset repository ID in the format author/dataset-name or dataset-name",
      "examples": [
        "squad",
        "glue",
        "mozilla-foundation/common_voice_17_0",
        "tatsu-lab/alpaca"
      ]
    },
    "author": {
      "type": "string",
      "description": "Author or organization that owns the dataset"
    },
    "sha": {
      "type": "string",
      "description": "Latest Git commit SHA of the dataset repository",
      "pattern": "^[0-9a-f]{40}$"
    },
    "lastModified": {
      "type": "string",
      "format": "date-time",
      "description": "Timestamp of the last modification"
    },
    "createdAt": {
      "type": "string",
      "format": "date-time",
      "description": "Timestamp when the dataset was created"
    },
    "private": {
      "type": "boolean",
      "description": "Whether the dataset is private",
      "default": false
    },
    "disabled": {
      "type": "boolean",
      "description": "Whether the dataset has been disabled",
      "default": false
    },
    "gated": {
      "oneOf": [
        {
          "type": "boolean"
        },
        {
          "type": "string",
          "enum": [
            "auto",
            "manual"
          ]
        }
      ],
      "description": "Access gating configuration"
    },
    "tags": {
      "type": "array",
      "items": {
        "type": "string"
      },
      "description": "Tags associated with the dataset"
    },
    "downloads": {
      "type": "integer",
      "description": "Number of downloads in the last 30 days",
      "minimum": 0
    },
    "likes": {
      "type": "integer",
      "description": "Number of likes/favorites",
      "minimum": 0
    },
    "description": {
      "type": "string",
      "description": "Short description of the dataset"
    },
    "citation": {
      "type": "string",
      "description": "Citation text for the dataset (BibTeX format)"
    },
    "siblings": {
      "type": "array",
      "items": {
        "type": "object",
        "description": "A single file entry in the dataset repository",
        "properties": {
          "rfilename": {
            "type": "string",
            "description": "Relative file path within the repository"
          },
          "size": {
            "type": "integer",
            "description": "File size in bytes",
            "minimum": 0
          },
          "blobId": {
            "type": "string",
            "description": "Git blob ID"
          },
          "lfs": {
            "type": "object",
            "description": "Git LFS metadata for the file",
            "properties": {
              "sha256": {
                "type": "string",
                "description": "SHA-256 digest of the LFS object as 64 lowercase hexadecimal characters",
                "pattern": "^[0-9a-f]{64}$"
              },
              "size": {
                "type": "integer",
                "description": "Size of the LFS object in bytes",
                "minimum": 0
              },
              "pointerSize": {
                "type": "integer",
                "description": "Size of the LFS pointer file in bytes",
                "minimum": 0
              }
            }
          }
        }
      },
      "description": "Files in the dataset repository"
    },
    "cardData": {
      "type": "object",
      "description": "Parsed metadata from the dataset card YAML front matter",
      "properties": {
        "language": {
          "oneOf": [
            {
              "type": "string"
            },
            {
              "type": "array",
              "items": {
                "type": "string"
              }
            }
          ],
          "description": "Language(s) of the dataset"
        },
        "license": {
          "oneOf": [
            {
              "type": "string"
            },
            {
              "type": "array",
              "items": {
                "type": "string"
              }
            }
          ],
          "description": "License identifier(s)",
          "examples": [
            "apache-2.0",
            "mit",
            "cc-by-4.0",
            "cc-by-sa-4.0"
          ]
        },
        "multilinguality": {
          "type": "array",
          "items": {
            "type": "string",
            "enum": [
              "monolingual",
              "multilingual",
              "translation",
              "other"
            ]
          }
        },
        "size_categories": {
          "type": "array",
          "items": {
            "type": "string",
            "enum": [
              "n<1K",
              "1K<n<10K",
              "10K<n<100K",
              "100K<n<1M",
              "1M<n<10M",
              "10M<n<100M",
              "100M<n<1B",
              "1B<n<10B",
              "10B<n<100B",
              "100B<n<1T",
              "n>1T",
              "n>10B"
            ]
          },
          "description": "Size category of the dataset, bucketed by number of examples (n<1K through n>1T); the non-standard value n>10B is accepted only for backward compatibility"
        },
        "task_categories": {
          "type": "array",
          "items": {
            "type": "string"
          },
          "description": "Task categories the dataset supports",
          "examples": [
            [
              "text-classification",
              "question-answering",
              "summarization",
              "translation",
              "text-generation"
            ]
          ]
        },
        "task_ids": {
          "type": "array",
          "items": {
            "type": "string"
          },
          "description": "Specific task IDs (more granular than task_categories)"
        },
        "paperswithcode_id": {
          "type": "string",
          "description": "Papers With Code dataset identifier"
        },
        "pretty_name": {
          "type": "string",
          "description": "Human-readable display name"
        },
        "configs": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "config_name": {
                "type": "string",
                "description": "Configuration/subset name"
              },
              "data_files": {
                "anyOf": [
                  {
                    "type": "string",
                    "description": "A single file path or glob pattern"
                  },
                  {
                    "type": "array",
                    "description": "A list of file paths or glob patterns",
                    "items": {
                      "type": "string"
                    }
                  },
                  {
                    "type": "array",
                    "description": "A list of per-split entries",
                    "items": {
                      "type": "object",
                      "properties": {
                        "split": {
                          "type": "string",
                          "description": "Name of the split these files belong to"
                        },
                        "path": {
                          "oneOf": [
                            {
                              "type": "string"
                            },
                            {
                              "type": "array",
                              "items": {
                                "type": "string"
                              }
                            }
                          ],
                          "description": "File path(s) or glob pattern(s) for the split"
                        }
                      }
                    }
                  }
                ],
                "description": "Data file locations for this config"
              },
              "default": {
                "type": "boolean",
                "description": "Whether this is the default configuration"
              }
            }
          },
          "description": "Dataset loading configurations"
        },
        "dataset_info": {
          "oneOf": [
            {
              "$ref": "#/$defs/DatasetInfoEntry"
            },
            {
              "type": "array",
              "items": {
                "$ref": "#/$defs/DatasetInfoEntry"
              }
            }
          ],
          "description": "Detailed structural information about the dataset"
        },
        "train-eval-index": {
          "type": "array",
          "items": {
            "type": "object",
            "properties": {
              "config": {
                "type": "string"
              },
              "task": {
                "type": "string"
              },
              "task_id": {
                "type": "string"
              },
              "splits": {
                "type": "object"
              },
              "col_mapping": {
                "type": "object"
              },
              "metrics": {
                "type": "array",
                "items": {
                  "type": "object"
                }
              }
            }
          },
          "description": "AutoTrain evaluation configuration"
        }
      }
    }
  },
  "$defs": {
    "DatasetInfoEntry": {
      "type": "object",
      "description": "Structural information (features, splits, sizes) for one dataset configuration",
      "properties": {
        "config_name": {
          "type": "string",
          "description": "Configuration name"
        },
        "features": {
          "type": "array",
          "items": {
            "$ref": "#/$defs/Feature"
          },
          "description": "Dataset feature (column) definitions"
        },
        "splits": {
          "type": "array",
          "items": {
            "type": "object",
            "description": "Name and size statistics of a single split",
            "properties": {
              "name": {
                "type": "string",
                "description": "Split name (e.g., train, test, validation)"
              },
              "num_bytes": {
                "type": "integer",
                "description": "Size of the split in bytes",
                "minimum": 0
              },
              "num_examples": {
                "type": "integer",
                "description": "Number of examples in the split",
                "minimum": 0
              }
            }
          },
          "description": "Data splits"
        },
        "download_size": {
          "type": "integer",
          "description": "Total download size in bytes",
          "minimum": 0
        },
        "dataset_size": {
          "type": "integer",
          "description": "Total dataset size in bytes (uncompressed)",
          "minimum": 0
        }
      }
    },
    "Feature": {
      "type": "object",
      "properties": {
        "name": {
          "type": "string",
          "description": "Feature/column name"
        },
        "dtype": {
          "type": "string",
          "description": "Data type (e.g., string, int32, float64, bool)",
          "examples": [
            "string",
            "int32",
            "int64",
            "float32",
            "float64",
            "bool"
          ]
        },
        "struct": {
          "type": "array",
          "items": {
            "$ref": "#/$defs/Feature"
          },
          "description": "Nested struct fields"
        },
        "sequence": {
          "oneOf": [
            {
              "type": "string"
            },
            {
              "$ref": "#/$defs/Feature"
            }
          ],
          "description": "Sequence element type"
        },
        "class_label": {
          "type": "object",
          "properties": {
            "names": {
              "type": "object",
              "additionalProperties": {
                "type": "string"
              },
              "description": "Mapping from integer labels to string names"
            }
          },
          "description": "Class label metadata"
        },
        "_type": {
          "type": "string",
          "description": "Internal type identifier",
          "enum": [
            "Value",
            "ClassLabel",
            "Sequence",
            "Image",
            "Audio",
            "Translation",
            "TranslationVariableLanguages",
            "Array2D",
            "Array3D",
            "Array4D",
            "Array5D"
          ]
        }
      }
    }
  }
}