Evals · Schema

EvalDataset

A collection of EvalCases plus metadata describing its provenance, license, splits, and the task it targets. Maps directly to Hugging Face datasets (MMLU, HumanEval, GAIA), LangSmith datasets, Braintrust datasets, and DeepEval EvaluationDatasets.

Evals · LLM Evaluation · AI Quality · Benchmarks · LLM as a Judge · Observability · Agent Evaluation · RAG Evaluation · Test-Driven AI

Properties

Name	Type	Description
id	string
name	string
description	string
version	string
task	string	What the dataset is for.
source	string	Canonical URL where the dataset is published.
license	string
splits	object	Named subsets (train/dev/test, or domain-specific splits).
case_count	integer	Total number of cases across all splits.
tags	array
created	string
modified	string

View JSON Schema on GitHub

JSON Schema

{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://raw.githubusercontent.com/api-evangelist/evals/refs/heads/main/json-schema/evals-dataset-schema.json",
  "title": "EvalDataset",
  "description": "A collection of EvalCases plus metadata describing its provenance, license, splits, and the task it targets. Maps directly to Hugging Face datasets (MMLU, HumanEval, GAIA), LangSmith datasets, Braintrust datasets, and DeepEval EvaluationDatasets.",
  "type": "object",
  "properties": {
    "id": {
      "type": "string",
      "description": "Identifier for the dataset.",
      "example": "ds_support_faq_2026q2"
    },
    "name": {
      "type": "string",
      "description": "Human-readable name of the dataset.",
      "example": "Customer Support FAQ 2026 Q2"
    },
    "description": {
      "type": "string",
      "description": "Free-text summary of what the dataset contains.",
      "example": "Curated and human-verified set of 240 customer-support questions and reference answers for evaluating the support RAG pipeline."
    },
    "version": {
      "type": "string",
      "description": "Version label of the dataset.",
      "example": "2026.05.0"
    },
    "task": {
      "type": "string",
      "description": "What the dataset is for.",
      "enum": ["qa", "rag", "code_generation", "summarization", "classification", "agent_task", "safety", "multi_turn_dialogue", "knowledge", "reasoning"],
      "example": "rag"
    },
    "source": {
      "type": "string",
      "format": "uri",
      "description": "Canonical URL where the dataset is published.",
      "example": "https://huggingface.co/datasets/cais/mmlu"
    },
    "license": {
      "type": "string",
      "description": "License under which the dataset is distributed.",
      "example": "MIT"
    },
    "splits": {
      "type": "object",
      "description": "Named subsets (train/dev/test, or domain-specific splits).",
      "additionalProperties": {
        "type": "object",
        "description": "Metadata for a single named split.",
        "properties": {
          "count": {
            "type": "integer",
            "minimum": 0,
            "description": "Number of cases in this split.",
            "example": 240
          },
          "uri": {
            "type": "string",
            "format": "uri",
            "description": "URI associated with this split."
          }
        }
      },
      "example": {
        "train": { "count": 180 },
        "test": { "count": 60 }
      }
    },
    "case_count": {
      "type": "integer",
      "minimum": 0,
      "description": "Total number of cases across all splits.",
      "example": 240
    },
    "tags": {
      "type": "array",
      "description": "Labels attached to the dataset.",
      "items": { "type": "string" },
      "example": ["rag", "support", "en-US"]
    },
    "created": {
      "type": "string",
      "format": "date-time",
      "description": "Date-time at which the dataset was created."
    },
    "modified": {
      "type": "string",
      "format": "date-time",
      "description": "Date-time at which the dataset was last modified."
    }
  },
  "required": ["id", "name", "task"]
}