Evals · Schema

EvalSuite

A named collection of eval cases plus the scorers and grading policy that run against them. An eval suite is the unit that gets versioned, attached to a CI pipeline, and re-executed across models — analogous to a test suite in software testing.

Evals · LLM Evaluation · AI Quality · Benchmarks · LLM as a Judge · Observability · Agent Evaluation · RAG Evaluation · Test-Driven AI

Properties

Name Type Description
id string
name string
description string
version string
dataset_id string Identifier of the dataset of eval cases this suite is bound to.
scorers array Scorers run on every case in the suite.
policy object Aggregation and pass/fail policy at the suite level.
tags array
created string
modified string
View JSON Schema on GitHub

JSON Schema

evals-eval-suite-schema.json Raw ↑
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://raw.githubusercontent.com/api-evangelist/evals/refs/heads/main/json-schema/evals-eval-suite-schema.json",
  "title": "EvalSuite",
  "description": "A named collection of eval cases plus the scorers and grading policy that run against them. An eval suite is the unit that gets versioned, attached to a CI pipeline, and re-executed across models — analogous to a test suite in software testing.",
  "type": "object",
  "properties": {
    "id": {
      "type": "string",
      "description": "Identifier of the eval suite.",
      "examples": ["suite_rag_faq_v3"]
    },
    "name": {
      "type": "string",
      "description": "Name of the eval suite.",
      "examples": ["Support FAQ RAG Suite"]
    },
    "description": {
      "type": "string",
      "description": "Free-text description of what the suite evaluates.",
      "examples": ["End-to-end evaluation of the customer-support RAG pipeline across 240 representative questions."]
    },
    "version": {
      "type": "string",
      "description": "Version of the eval suite.",
      "examples": ["3.2.0"]
    },
    "dataset_id": {
      "type": "string",
      "description": "Identifier of the dataset of eval cases this suite is bound to.",
      "examples": ["ds_support_faq_2026q2"]
    },
    "scorers": {
      "type": "array",
      "description": "Scorers run on every case in the suite.",
      "items": {
        "type": "object",
        "properties": {
          "id": {
            "type": "string",
            "description": "Identifier of the scorer.",
            "examples": ["scorer_faithfulness_v2"]
          },
          "name": {
            "type": "string",
            "description": "Name of the scorer.",
            "examples": ["faithfulness"]
          },
          "type": {
            "type": "string",
            "description": "Kind of scorer.",
            "enum": ["code", "llm_judge", "human", "heuristic", "reference_based", "reference_free", "pairwise"]
          },
          "threshold": {
            "type": "number",
            "description": "Pass/fail threshold for this scorer on this suite.",
            "examples": [0.8]
          }
        },
        "required": ["name", "type"]
      }
    },
    "policy": {
      "type": "object",
      "description": "Aggregation and pass/fail policy at the suite level.",
      "properties": {
        "aggregation": {
          "type": "string",
          "description": "How scores are aggregated at the suite level.",
          "enum": ["mean", "median", "pass_rate", "min", "max"],
          "examples": ["mean"]
        },
        "fail_on_threshold": {
          "type": "boolean",
          "description": "When true, the suite fails CI if any scorer falls below its threshold.",
          "examples": [true]
        }
      }
    },
    "tags": {
      "type": "array",
      "description": "Tags attached to the eval suite.",
      "items": { "type": "string" },
      "examples": [["rag", "production", "support"]]
    },
    "created": {
      "type": "string",
      "format": "date-time",
      "description": "Date-time at which the suite was created.",
      "examples": ["2026-04-01T00:00:00Z"]
    },
    "modified": {
      "type": "string",
      "format": "date-time",
      "description": "Date-time at which the suite was modified.",
      "examples": ["2026-05-15T11:24:00Z"]
    }
  },
  "required": ["id", "name", "dataset_id", "scorers"]
}