Evals · Schema

EvalDataset

A collection of EvalCases plus metadata describing its provenance, license, splits, and the task it targets. Maps directly to Hugging Face datasets (MMLU, HumanEval, GAIA), LangSmith datasets, Braintrust datasets, and DeepEval EvaluationDatasets.

Evals · LLM Evaluation · AI Quality · Benchmarks · LLM as a Judge · Observability · Agent Evaluation · RAG Evaluation · Test-Driven AI

Properties

Name Type Description
id string
name string
description string
version string
task string What the dataset is for.
source string Canonical URL where the dataset is published.
license string
splits object Named subsets (train/dev/test, or domain-specific splits).
case_count integer Total number of cases across all splits.
tags array
created string
modified string
View JSON Schema on GitHub

JSON Schema

evals-dataset-schema.json Raw ↑
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://raw.githubusercontent.com/api-evangelist/evals/refs/heads/main/json-schema/evals-dataset-schema.json",
  "title": "EvalDataset",
  "description": "A collection of EvalCases plus metadata describing its provenance, license, splits, and the task it targets. Maps directly to Hugging Face datasets (MMLU, HumanEval, GAIA), LangSmith datasets, Braintrust datasets, and DeepEval EvaluationDatasets.",
  "type": "object",
  "properties": {
    "id": {
      "type": "string",
      "examples": ["ds_support_faq_2026q2"]
    },
    "name": {
      "type": "string",
      "examples": ["Customer Support FAQ 2026 Q2"]
    },
    "description": {
      "type": "string",
      "examples": ["Curated and human-verified set of 240 customer-support questions and reference answers for evaluating the support RAG pipeline."]
    },
    "version": {
      "type": "string",
      "examples": ["2026.05.0"]
    },
    "task": {
      "type": "string",
      "description": "What the dataset is for.",
      "enum": ["qa", "rag", "code_generation", "summarization", "classification", "agent_task", "safety", "multi_turn_dialogue", "knowledge", "reasoning"],
      "examples": ["rag"]
    },
    "source": {
      "type": "string",
      "format": "uri",
      "description": "Canonical URL where the dataset is published.",
      "examples": ["https://huggingface.co/datasets/cais/mmlu"]
    },
    "license": {
      "type": "string",
      "examples": ["MIT"]
    },
    "splits": {
      "type": "object",
      "description": "Named subsets (train/dev/test, or domain-specific splits).",
      "additionalProperties": {
        "type": "object",
        "properties": {
          "count": { "type": "integer", "examples": [240] },
          "uri": { "type": "string", "format": "uri" }
        }
      },
      "examples": [
        {
          "train": { "count": 180 },
          "test": { "count": 60 }
        }
      ]
    },
    "case_count": {
      "type": "integer",
      "description": "Total number of cases across all splits.",
      "examples": [240]
    },
    "tags": {
      "type": "array",
      "items": { "type": "string" },
      "examples": [["rag", "support", "en-US"]]
    },
    "created": {
      "type": "string",
      "format": "date-time"
    },
    "modified": {
      "type": "string",
      "format": "date-time"
    }
  },
  "required": ["id", "name", "task"]
}