Volcano · Schema

Volcano Job

Schema for the Volcano Job (vcjob) custom resource definition. A Volcano Job defines a batch workload with multiple task types, lifecycle policies, gang scheduling requirements, plugin integration, and queue assignment for Kubernetes batch processing.

Batch ProcessingCloud NativeHPCIncubatingKubernetesSchedulingMachine Learning

Properties

Name Type Description
apiVersion string API version of the Volcano Job resource.
kind string Resource kind.
metadata object
spec object
status object
View JSON Schema on GitHub

JSON Schema

volcano-job-schema.json Raw ↑
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://volcano.sh/schemas/job.json",
  "title": "Volcano Job",
  "description": "Schema for the Volcano Job (vcjob) custom resource definition. A Volcano Job defines a batch workload with multiple task types, lifecycle policies, gang scheduling requirements, plugin integration, and queue assignment for Kubernetes batch processing.",
  "type": "object",
  "required": ["apiVersion", "kind", "metadata", "spec"],
  "properties": {
    "apiVersion": {
      "type": "string",
      "description": "API version of the Volcano Job resource.",
      "const": "batch.volcano.sh/v1alpha1"
    },
    "kind": {
      "type": "string",
      "description": "Resource kind.",
      "const": "Job"
    },
    "metadata": {
      "$ref": "#/$defs/ObjectMeta"
    },
    "spec": {
      "$ref": "#/$defs/JobSpec"
    },
    "status": {
      "$ref": "#/$defs/JobStatus"
    }
  },
  "$defs": {
    "JobSpec": {
      "type": "object",
      "description": "Specification of a Volcano Job defining workload structure, tasks, scheduling requirements, and lifecycle policies.",
      "properties": {
        "schedulerName": {
          "type": "string",
          "description": "Name of the scheduler responsible for this job. Defaults to 'volcano'.",
          "default": "volcano"
        },
        "minAvailable": {
          "type": "integer",
          "description": "Minimum number of pods that must be schedulable simultaneously for gang scheduling. If fewer pods are available, none are scheduled.",
          "minimum": 0
        },
        "tasks": {
          "type": "array",
          "description": "List of task groups composing the job. Each task defines a pod template, replica count, and task-level policies.",
          "items": {
            "$ref": "#/$defs/TaskSpec"
          }
        },
        "policies": {
          "type": "array",
          "description": "Job-level lifecycle policies controlling how the job reacts to events like pod failures or completions.",
          "items": {
            "$ref": "#/$defs/LifecyclePolicy"
          }
        },
        "plugins": {
          "type": "object",
          "description": "Map of plugin names to argument arrays. Plugins inject environment variables and sidecar containers for ML frameworks. Common plugins include 'tensorflow', 'pytorch', 'mpi', 'svc', and 'env'.",
          "additionalProperties": {
            "type": "array",
            "items": {
              "type": "string"
            }
          },
          "examples": [
            {"svc": [], "env": []},
            {"pytorch": ["--master=master", "--worker=worker"]},
            {"mpi": ["--master=mpimaster", "--worker=mpiworker"]}
          ]
        },
        "queue": {
          "type": "string",
          "description": "Name of the Volcano Queue to submit this job to. Determines scheduling priority and resource quota constraints."
        },
        "priorityClassName": {
          "type": "string",
          "description": "Kubernetes PriorityClass name controlling this job's scheduling priority."
        },
        "maxRetry": {
          "type": "integer",
          "description": "Maximum number of retry attempts before the job is permanently failed.",
          "minimum": 0
        },
        "ttlSecondsAfterFinished": {
          "type": "integer",
          "description": "Seconds after completion before the job is automatically garbage-collected. Omit to disable automatic cleanup.",
          "minimum": 0
        },
        "runningDuration": {
          "type": "string",
          "description": "Maximum allowed running duration for the job (e.g. '2h', '30m'). Job is terminated if it exceeds this duration.",
          "examples": ["1h", "30m", "2h30m"]
        },
        "volumes": {
          "type": "array",
          "description": "Volumes to be mounted into job task containers.",
          "items": {
            "type": "object"
          }
        }
      }
    },
    "TaskSpec": {
      "type": "object",
      "description": "A named task group within a Volcano Job, defining a set of pods with a shared pod template and optional task-level policies.",
      "required": ["name", "replicas", "template"],
      "properties": {
        "name": {
          "type": "string",
          "description": "Unique name of the task within the job. Used to reference the task in plugins and dependencies. Must be a lowercase DNS-1123 label: alphanumerics and '-', starting and ending with an alphanumeric. A single alphanumeric character is valid.",
          "minLength": 1,
          "maxLength": 63,
          "pattern": "^[a-z0-9]([a-z0-9-]*[a-z0-9])?$"
        },
        "replicas": {
          "type": "integer",
          "description": "Number of pod replicas for this task.",
          "minimum": 1
        },
        "template": {
          "type": "object",
          "description": "Kubernetes pod template spec for pods in this task."
        },
        "policies": {
          "type": "array",
          "description": "Lifecycle policies for this task, overriding job-level policies for events affecting this task's pods.",
          "items": {
            "$ref": "#/$defs/LifecyclePolicy"
          }
        },
        "topologyPolicy": {
          "type": "string",
          "description": "NUMA topology policy for resource allocation. Controls how CPU and memory are assigned relative to NUMA nodes.",
          "enum": ["none", "best-effort", "restricted", "single-numa-node"]
        },
        "maxRetry": {
          "type": "integer",
          "description": "Maximum retry attempts for this task before it is marked as failed.",
          "minimum": 0
        },
        "dependsOn": {
          "type": "object",
          "description": "Task dependency configuration controlling execution order within the job.",
          "properties": {
            "name": {
              "type": "array",
              "description": "Names of tasks that must successfully complete before this task starts.",
              "items": {
                "type": "string"
              }
            },
            "iteration": {
              "type": "string",
              "description": "Dependency evaluation strategy.",
              "enum": ["any", "all"]
            }
          }
        }
      }
    },
    "LifecyclePolicy": {
      "type": "object",
      "description": "A lifecycle policy rule defining an automated action to take when a specific event occurs during job or task execution.",
      "properties": {
        "action": {
          "type": "string",
          "description": "Action to execute when the policy condition is met.",
          "enum": [
            "AbortJob",
            "RestartJob",
            "RestartTask",
            "TerminateJob",
            "CompleteJob",
            "ResumeJob",
            "SyncJob",
            "EnqueueJob"
          ]
        },
        "event": {
          "type": "string",
          "description": "Single event type that triggers this policy. The wildcard '*' matches any event.",
          "enum": [
            "*",
            "PodFailed",
            "PodEvicted",
            "PodPending",
            "PodRunning",
            "PodSucceeded",
            "TaskCompleted",
            "TaskFailed",
            "Unknown",
            "AnyEvent",
            "CommandIssued",
            "JobUnknown",
            "JobUpdated",
            "OutOfSync"
          ]
        },
        "events": {
          "type": "array",
          "description": "Multiple event types that each independently trigger this policy. Each entry accepts the same values as 'event'.",
          "items": {
            "type": "string",
            "enum": [
              "*",
              "PodFailed",
              "PodEvicted",
              "PodPending",
              "PodRunning",
              "PodSucceeded",
              "TaskCompleted",
              "TaskFailed",
              "Unknown",
              "AnyEvent",
              "CommandIssued",
              "JobUnknown",
              "JobUpdated",
              "OutOfSync"
            ]
          }
        },
        "exitCode": {
          "type": "integer",
          "description": "Container exit code that triggers this policy when a pod exits with this code."
        },
        "timeout": {
          "type": "string",
          "description": "Duration after which the policy fires if the triggering condition persists.",
          "examples": ["30s", "5m", "1h"]
        }
      }
    },
    "JobStatus": {
      "type": "object",
      "description": "Observed status of a Volcano Job including its lifecycle phase, retry count, and per-task pod counts.",
      "properties": {
        "state": {
          "type": "object",
          "description": "Current phase and transition details of the job.",
          "properties": {
            "phase": {
              "type": "string",
              "description": "Current lifecycle phase of the job.",
              "enum": [
                "Pending",
                "Aborting",
                "Aborted",
                "Running",
                "Restarting",
                "Completing",
                "Completed",
                "Terminating",
                "Terminated",
                "Failed"
              ]
            },
            "reason": {
              "type": "string",
              "description": "Machine-readable reason for the current phase."
            },
            "message": {
              "type": "string",
              "description": "Human-readable description of the current state."
            },
            "lastTransitionTime": {
              "type": "string",
              "format": "date-time",
              "description": "Timestamp of the most recent phase transition."
            }
          }
        },
        "minAvailable": {
          "type": "integer",
          "description": "Minimum pod count required for this job."
        },
        "retryCount": {
          "type": "integer",
          "format": "int32",
          "description": "Number of times the job has been retried so far."
        },
        "runningDuration": {
          "type": "string",
          "description": "Duration the job has been in the Running phase."
        },
        "taskStatusCount": {
          "type": "object",
          "description": "Per-task breakdown of pod counts by Kubernetes pod phase.",
          "additionalProperties": {
            "type": "object",
            "properties": {
              "phase": {
                "type": "object",
                "description": "Map of Kubernetes pod phase names to pod counts.",
                "additionalProperties": {
                  "type": "integer"
                }
              }
            }
          }
        },
        "conditions": {
          "type": "array",
          "description": "Detailed conditions describing the current state of the job.",
          "items": {
            "type": "object",
            "properties": {
              "type": {
                "type": "string"
              },
              "status": {
                "type": "string",
                "enum": ["True", "False", "Unknown"]
              },
              "transitionID": {
                "type": "string"
              },
              "lastTransitionTime": {
                "type": "string",
                "format": "date-time"
              },
              "reason": {
                "type": "string"
              },
              "message": {
                "type": "string"
              }
            }
          }
        }
      }
    },
    "ObjectMeta": {
      "type": "object",
      "description": "Standard Kubernetes object metadata.",
      "required": ["name"],
      "properties": {
        "name": {
          "type": "string",
          "description": "Name of the job, unique within its namespace. Must be a lowercase DNS-1123 subdomain: one or more dot-separated labels, each made of alphanumerics and '-' and starting and ending with an alphanumeric. A single alphanumeric character is valid.",
          "minLength": 1,
          "maxLength": 253,
          "pattern": "^[a-z0-9]([a-z0-9-]*[a-z0-9])?(\\.[a-z0-9]([a-z0-9-]*[a-z0-9])?)*$"
        },
        "namespace": {
          "type": "string",
          "description": "Namespace the job belongs to."
        },
        "labels": {
          "type": "object",
          "description": "Labels for organizing and selecting the job.",
          "additionalProperties": {
            "type": "string"
          }
        },
        "annotations": {
          "type": "object",
          "description": "Non-identifying metadata for the job.",
          "additionalProperties": {
            "type": "string"
          }
        }
      }
    }
  },
  "examples": [
    {
      "apiVersion": "batch.volcano.sh/v1alpha1",
      "kind": "Job",
      "metadata": {
        "name": "pytorch-training",
        "namespace": "default"
      },
      "spec": {
        "minAvailable": 3,
        "schedulerName": "volcano",
        "queue": "training",
        "plugins": {
          "pytorch": ["--master=master", "--worker=worker"],
          "svc": [],
          "env": []
        },
        "policies": [
          {
            "event": "PodEvicted",
            "action": "RestartJob"
          }
        ],
        "maxRetry": 3,
        "tasks": [
          {
            "name": "master",
            "replicas": 1,
            "template": {
              "spec": {
                "containers": [
                  {
                    "name": "pytorch",
                    "image": "pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime",
                    "resources": {
                      "requests": {"cpu": "4", "memory": "8Gi"},
                      "limits": {"cpu": "4", "memory": "8Gi", "nvidia.com/gpu": "1"}
                    }
                  }
                ]
              }
            }
          },
          {
            "name": "worker",
            "replicas": 2,
            "template": {
              "spec": {
                "containers": [
                  {
                    "name": "pytorch",
                    "image": "pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime",
                    "resources": {
                      "requests": {"cpu": "4", "memory": "8Gi"},
                      "limits": {"cpu": "4", "memory": "8Gi", "nvidia.com/gpu": "2"}
                    }
                  }
                ]
              }
            }
          }
        ]
      }
    }
  ]
}