Volcano · Schema
Volcano Job
Schema for the Volcano Job (vcjob) custom resource definition. A Volcano Job defines a batch workload with multiple task types, lifecycle policies, gang scheduling requirements, plugin integration, and queue assignment for Kubernetes batch processing.
Batch Processing · Cloud Native · HPC · Incubating · Kubernetes · Scheduling · Machine Learning
Properties
| Name | Type | Description |
|---|---|---|
| apiVersion | string | API version of the Volcano Job resource. |
| kind | string | Resource kind. |
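| metadata | object | Standard Kubernetes object metadata. |
| spec | object | Specification of a Volcano Job defining workload structure, tasks, scheduling requirements, and lifecycle policies. |
| status | object | Observed status of a Volcano Job including its lifecycle phase, retry count, and per-task pod counts. |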
JSON Schema
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://volcano.sh/schemas/job.json",
"title": "Volcano Job",
"description": "Schema for the Volcano Job (vcjob) custom resource definition. A Volcano Job defines a batch workload with multiple task types, lifecycle policies, gang scheduling requirements, plugin integration, and queue assignment for Kubernetes batch processing.",
"type": "object",
"required": ["apiVersion", "kind", "metadata", "spec"],
"properties": {
"apiVersion": {
"type": "string",
"description": "API version of the Volcano Job resource.",
"const": "batch.volcano.sh/v1alpha1"
},
"kind": {
"type": "string",
"description": "Resource kind.",
"const": "Job"
},
"metadata": {
"$ref": "#/$defs/ObjectMeta"
},
"spec": {
"$ref": "#/$defs/JobSpec"
},
"status": {
"$ref": "#/$defs/JobStatus"
}
},
"$defs": {
"JobSpec": {
"type": "object",
"description": "Specification of a Volcano Job defining workload structure, tasks, scheduling requirements, and lifecycle policies.",
"properties": {
"schedulerName": {
"type": "string",
"description": "Name of the scheduler responsible for this job. Defaults to 'volcano'.",
"default": "volcano"
},
"minAvailable": {
"type": "integer",
"description": "Minimum number of pods that must be schedulable simultaneously for gang scheduling. If fewer pods are available, none are scheduled.",
"minimum": 0
},
"tasks": {
"type": "array",
"description": "List of task groups composing the job. Each task defines a pod template, replica count, and task-level policies.",
"items": {
"$ref": "#/$defs/TaskSpec"
}
},
"policies": {
"type": "array",
"description": "Job-level lifecycle policies controlling how the job reacts to events like pod failures or completions.",
"items": {
"$ref": "#/$defs/LifecyclePolicy"
}
},
"plugins": {
"type": "object",
"description": "Map of plugin names to argument arrays. Plugins inject environment variables and sidecar containers for ML frameworks. Common plugins include 'tensorflow', 'pytorch', 'mpi', 'svc', and 'env'.",
"additionalProperties": {
"type": "array",
"items": {
"type": "string"
}
},
"examples": [
{"svc": [], "env": []},
{"pytorch": ["--master=master", "--worker=worker"]},
{"mpi": ["--master=mpimaster", "--worker=mpiworker"]}
]
},
"queue": {
"type": "string",
"description": "Name of the Volcano Queue to submit this job to. Determines scheduling priority and resource quota constraints."
},
"priorityClassName": {
"type": "string",
"description": "Kubernetes PriorityClass name controlling this job's scheduling priority."
},
"maxRetry": {
"type": "integer",
"description": "Maximum number of retry attempts before the job is permanently failed.",
"minimum": 0
},
"ttlSecondsAfterFinished": {
"type": "integer",
"description": "Seconds after completion before the job is automatically garbage-collected. Omit to disable automatic cleanup.",
"minimum": 0
},
"runningDuration": {
"type": "string",
"description": "Maximum allowed running duration for the job (e.g. '2h', '30m'). Job is terminated if it exceeds this duration.",
"examples": ["1h", "30m", "2h30m"]
},
"volumes": {
"type": "array",
"description": "Volumes to be mounted into job task containers.",
"items": {
"type": "object"
}
}
}
},
"TaskSpec": {
"type": "object",
"description": "A named task group within a Volcano Job, defining a set of pods with a shared pod template and optional task-level policies.",
"required": ["name", "replicas", "template"],
"properties": {
"name": {
"type": "string",
"description": "Unique name of the task within the job. Used to reference the task in plugins and dependencies. Must consist of lowercase alphanumeric characters or '-', and must start and end with an alphanumeric character; single-character names are allowed.",
"maxLength": 63,
"pattern": "^[a-z0-9]([a-z0-9-]*[a-z0-9])?$"
},
"replicas": {
"type": "integer",
"description": "Number of pod replicas for this task.",
"minimum": 1
},
"template": {
"type": "object",
"description": "Kubernetes pod template spec for pods in this task."
},
"policies": {
"type": "array",
"description": "Lifecycle policies for this task, overriding job-level policies for events affecting this task's pods.",
"items": {
"$ref": "#/$defs/LifecyclePolicy"
}
},
"topologyPolicy": {
"type": "string",
"description": "NUMA topology policy for resource allocation. Controls how CPU and memory are assigned relative to NUMA nodes.",
"enum": ["none", "best-effort", "restricted", "single-numa-node"]
},
"maxRetry": {
"type": "integer",
"description": "Maximum retry attempts for this task before it is marked as failed.",
"minimum": 0
},
"dependsOn": {
"type": "object",
"description": "Task dependency configuration controlling execution order within the job.",
"properties": {
"name": {
"type": "array",
"description": "Names of tasks that must successfully complete before this task starts.",
"items": {
"type": "string"
}
},
"iteration": {
"type": "string",
"description": "Dependency evaluation strategy.",
"enum": ["any", "all"]
}
}
}
}
},
"LifecyclePolicy": {
"type": "object",
"description": "A lifecycle policy rule defining an automated action to take when a specific event occurs during job or task execution.",
"properties": {
"action": {
"type": "string",
"description": "Action to execute when the policy condition is met.",
"enum": [
"AbortJob",
"RestartJob",
"RestartTask",
"TerminateJob",
"CompleteJob",
"ResumeJob",
"SyncJob",
"EnqueueJob"
]
},
"event": {
"type": "string",
"description": "Single event type that triggers this policy.",
"enum": [
"PodFailed",
"PodEvicted",
"PodPending",
"PodRunning",
"PodSucceeded",
"TaskCompleted",
"AnyEvent",
"CommandIssued",
"JobUnknown",
"JobUpdated",
"OutOfSync"
]
},
"events": {
"type": "array",
"description": "Multiple event types that each independently trigger this policy.",
"items": {
"type": "string"
}
},
"exitCode": {
"type": "integer",
"description": "Container exit code that triggers this policy when a pod exits with this code."
},
"timeout": {
"type": "string",
"description": "Duration after which the policy fires if the triggering condition persists.",
"examples": ["30s", "5m", "1h"]
}
}
},
"JobStatus": {
"type": "object",
"description": "Observed status of a Volcano Job including its lifecycle phase, retry count, and per-task pod counts.",
"properties": {
"state": {
"type": "object",
"description": "Current phase and transition details of the job.",
"properties": {
"phase": {
"type": "string",
"description": "Current lifecycle phase of the job.",
"enum": [
"Pending",
"Aborting",
"Aborted",
"Running",
"Restarting",
"Completing",
"Completed",
"Terminating",
"Terminated",
"Failed"
]
},
"reason": {
"type": "string",
"description": "Machine-readable reason for the current phase."
},
"message": {
"type": "string",
"description": "Human-readable description of the current state."
},
"lastTransitionTime": {
"type": "string",
"format": "date-time",
"description": "Timestamp of the most recent phase transition."
}
}
},
"minAvailable": {
"type": "integer",
"description": "Minimum pod count required for this job."
},
"retryCount": {
"type": "integer",
"format": "int32",
"description": "Number of times the job has been retried so far."
},
"runningDuration": {
"type": "string",
"description": "Duration the job has been in the Running phase."
},
"taskStatusCount": {
"type": "object",
"description": "Per-task breakdown of pod counts by Kubernetes pod phase.",
"additionalProperties": {
"type": "object",
"properties": {
"phase": {
"type": "object",
"description": "Map of Kubernetes pod phase names to pod counts.",
"additionalProperties": {
"type": "integer"
}
}
}
}
},
"conditions": {
"type": "array",
"description": "Detailed conditions describing the current state of the job.",
"items": {
"type": "object",
"properties": {
"type": {
"type": "string"
},
"status": {
"type": "string",
"enum": ["True", "False", "Unknown"]
},
"transitionID": {
"type": "string"
},
"lastTransitionTime": {
"type": "string",
"format": "date-time"
},
"reason": {
"type": "string"
},
"message": {
"type": "string"
}
}
}
}
}
},
"ObjectMeta": {
"type": "object",
"description": "Standard Kubernetes object metadata.",
"required": ["name"],
"properties": {
"name": {
"type": "string",
"description": "Name of the job, unique within its namespace. Must consist of lowercase alphanumeric characters, '-' or '.', and must start and end with an alphanumeric character; single-character names are allowed.",
"maxLength": 253,
"pattern": "^[a-z0-9]([a-z0-9.-]*[a-z0-9])?$"
},
"namespace": {
"type": "string",
"description": "Namespace the job belongs to."
},
"labels": {
"type": "object",
"description": "Labels for organizing and selecting the job.",
"additionalProperties": {
"type": "string"
}
},
"annotations": {
"type": "object",
"description": "Non-identifying metadata for the job.",
"additionalProperties": {
"type": "string"
}
}
}
}
},
"examples": [
{
"apiVersion": "batch.volcano.sh/v1alpha1",
"kind": "Job",
"metadata": {
"name": "pytorch-training",
"namespace": "default"
},
"spec": {
"minAvailable": 3,
"schedulerName": "volcano",
"queue": "training",
"plugins": {
"pytorch": ["--master=master", "--worker=worker"],
"svc": [],
"env": []
},
"policies": [
{
"event": "PodEvicted",
"action": "RestartJob"
}
],
"maxRetry": 3,
"tasks": [
{
"name": "master",
"replicas": 1,
"template": {
"spec": {
"containers": [
{
"name": "pytorch",
"image": "pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime",
"resources": {
"requests": {"cpu": "4", "memory": "8Gi"},
"limits": {"cpu": "4", "memory": "8Gi", "nvidia.com/gpu": "1"}
}
}
]
}
}
},
{
"name": "worker",
"replicas": 2,
"template": {
"spec": {
"containers": [
{
"name": "pytorch",
"image": "pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime",
"resources": {
"requests": {"cpu": "4", "memory": "8Gi"},
"limits": {"cpu": "4", "memory": "8Gi", "nvidia.com/gpu": "2"}
}
}
]
}
}
}
]
}
}
]
}