Evals · Schema
EvalSuite
A named collection of eval cases plus the scorers and grading policy that run against them. An eval suite is the unit that gets versioned, attached to a CI pipeline, and re-executed across models — analogous to a test suite in software testing.
Evals · LLM Evaluation · AI Quality · Benchmarks · LLM as a Judge · Observability · Agent Evaluation · RAG Evaluation · Test-Driven AI
Properties
| Name | Type | Description |
|---|---|---|
| id | string | |
| name | string | |
| description | string | |
| version | string | |
| dataset_id | string | Identifier of the dataset of eval cases this suite is bound to. |
| scorers | array | Scorers run on every case in the suite. |
| policy | object | Aggregation and pass/fail policy at the suite level. |
| tags | array | |
| created | string | |
| modified | string | |
JSON Schema
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://raw.githubusercontent.com/api-evangelist/evals/refs/heads/main/json-schema/evals-eval-suite-schema.json",
  "title": "EvalSuite",
  "description": "A named collection of eval cases plus the scorers and grading policy that run against them. An eval suite is the unit that gets versioned, attached to a CI pipeline, and re-executed across models — analogous to a test suite in software testing.",
  "type": "object",
  "properties": {
    "id": {
      "type": "string",
      "examples": ["suite_rag_faq_v3"]
    },
    "name": {
      "type": "string",
      "examples": ["Support FAQ RAG Suite"]
    },
    "description": {
      "type": "string",
      "examples": ["End-to-end evaluation of the customer-support RAG pipeline across 240 representative questions."]
    },
    "version": {
      "type": "string",
      "examples": ["3.2.0"]
    },
    "dataset_id": {
      "type": "string",
      "description": "Identifier of the dataset of eval cases this suite is bound to.",
      "examples": ["ds_support_faq_2026q2"]
    },
    "scorers": {
      "type": "array",
      "description": "Scorers run on every case in the suite.",
      "items": {
        "type": "object",
        "properties": {
          "id": {
            "type": "string",
            "examples": ["scorer_faithfulness_v2"]
          },
          "name": {
            "type": "string",
            "examples": ["faithfulness"]
          },
          "type": {
            "type": "string",
            "enum": ["code", "llm_judge", "human", "heuristic", "reference_based", "reference_free", "pairwise"]
          },
          "threshold": {
            "type": "number",
            "description": "Pass/fail threshold for this scorer on this suite.",
            "examples": [0.8]
          }
        },
        "required": ["name", "type"]
      }
    },
    "policy": {
      "type": "object",
      "description": "Aggregation and pass/fail policy at the suite level.",
      "properties": {
        "aggregation": {
          "type": "string",
          "enum": ["mean", "median", "pass_rate", "min", "max"],
          "examples": ["mean"]
        },
        "fail_on_threshold": {
          "type": "boolean",
          "description": "When true, the suite fails CI if any scorer falls below its threshold.",
          "examples": [true]
        }
      }
    },
    "tags": {
      "type": "array",
      "items": { "type": "string" },
      "examples": [["rag", "production", "support"]]
    },
    "created": {
      "type": "string",
      "format": "date-time",
      "examples": ["2026-04-01T00:00:00Z"]
    },
    "modified": {
      "type": "string",
      "format": "date-time",
      "examples": ["2026-05-15T11:24:00Z"]
    }
  },
  "required": ["id", "name", "dataset_id", "scorers"]
}