Triton Inference Server · Schema
Triton Inference Server Model
A machine learning model managed by NVIDIA Triton Inference Server, including its configuration, input/output tensor definitions, batching strategy, and deployment settings.
AI · Deep Learning · Inference · Machine Learning · Model Serving · NVIDIA · Open Source
Properties
| Name | Type | Description |
|---|---|---|
| name | string | Unique name of the model within the model repository |
| platform | string | Framework platform of the model |
| backend | string | Backend used by the model for inference execution |
| version_policy | object | Policy for selecting which model versions are available for inference |
| max_batch_size | integer | Maximum batch size supported by the model. A value of 0 means batching is disabled. |
| input | array | Input tensor definitions for the model |
| output | array | Output tensor definitions for the model |
| instance_group | array | Instance group configurations specifying how model instances are deployed across devices |
| dynamic_batching | object | Dynamic batching configuration for combining multiple inference requests into a single batch |
| sequence_batching | object | Sequence batching configuration for stateful models that process ordered sequences of requests |
| ensemble_scheduling | object | Ensemble model scheduling configuration defining a pipeline of models |
| parameters | object | Custom key-value parameters for the model |
| model_warmup | array | Warmup configurations to pre-heat the model after loading |
| optimization | object | Model optimization settings |
| response_cache | object | Response cache configuration |
JSON Schema
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://developer.nvidia.com/schemas/triton/model.json",
"title": "Triton Inference Server Model",
"description": "A machine learning model managed by NVIDIA Triton Inference Server, including its configuration, input/output tensor definitions, batching strategy, and deployment settings.",
"type": "object",
"required": ["name"],
"properties": {
"name": {
"type": "string",
"description": "Unique name of the model within the model repository",
"minLength": 1
},
"platform": {
"type": "string",
"description": "Framework platform of the model",
"enum": [
"tensorrt_plan",
"tensorflow_graphdef",
"tensorflow_savedmodel",
"onnxruntime_onnx",
"pytorch_libtorch",
"python",
"ensemble"
]
},
"backend": {
"type": "string",
"description": "Backend used by the model for inference execution",
"examples": ["tensorrt", "tensorflow", "onnxruntime", "pytorch", "python", "openvino", "fil"]
},
"version_policy": {
"$ref": "#/$defs/VersionPolicy"
},
"max_batch_size": {
"type": "integer",
"minimum": 0,
"description": "Maximum batch size supported by the model. A value of 0 means batching is disabled."
},
"input": {
"type": "array",
"description": "Input tensor definitions for the model",
"items": {
"$ref": "#/$defs/TensorConfig"
}
},
"output": {
"type": "array",
"description": "Output tensor definitions for the model",
"items": {
"$ref": "#/$defs/TensorConfig"
}
},
"instance_group": {
"type": "array",
"description": "Instance group configurations specifying how model instances are deployed across devices",
"items": {
"$ref": "#/$defs/InstanceGroup"
}
},
"dynamic_batching": {
"$ref": "#/$defs/DynamicBatching"
},
"sequence_batching": {
"$ref": "#/$defs/SequenceBatching"
},
"ensemble_scheduling": {
"$ref": "#/$defs/EnsembleScheduling"
},
"parameters": {
"type": "object",
"description": "Custom key-value parameters for the model",
"additionalProperties": {
"type": "object",
"properties": {
"string_value": {
"type": "string"
}
}
}
},
"model_warmup": {
"type": "array",
"description": "Warmup configurations to pre-heat the model after loading",
"items": {
"$ref": "#/$defs/ModelWarmup"
}
},
"optimization": {
"$ref": "#/$defs/Optimization"
},
"response_cache": {
"type": "object",
"description": "Response cache configuration",
"properties": {
"enable": {
"type": "boolean",
"description": "Whether response caching is enabled for this model"
}
}
}
},
"$defs": {
"VersionPolicy": {
"type": "object",
"description": "Policy for selecting which model versions are available for inference",
"properties": {
"latest": {
"type": "object",
"description": "Serve the N most recent versions",
"properties": {
"num_versions": {
"type": "integer",
"minimum": 1,
"description": "Number of latest versions to serve"
}
}
},
"all": {
"type": "object",
"description": "Serve all available versions"
},
"specific": {
"type": "object",
"description": "Serve only the specified versions",
"properties": {
"versions": {
"type": "array",
"items": {
"type": "integer"
},
"description": "List of specific version numbers to serve"
}
}
}
}
},
"TensorConfig": {
"type": "object",
"description": "Configuration for a model input or output tensor",
"required": ["name", "data_type", "dims"],
"properties": {
"name": {
"type": "string",
"description": "Name of the tensor"
},
"data_type": {
"type": "string",
"description": "Data type of the tensor elements",
"enum": [
"TYPE_BOOL",
"TYPE_UINT8",
"TYPE_UINT16",
"TYPE_UINT32",
"TYPE_UINT64",
"TYPE_INT8",
"TYPE_INT16",
"TYPE_INT32",
"TYPE_INT64",
"TYPE_FP16",
"TYPE_FP32",
"TYPE_FP64",
"TYPE_STRING",
"TYPE_BF16"
]
},
"dims": {
"type": "array",
"description": "Tensor dimensions. Use -1 for variable-length dimensions.",
"items": {
"type": "integer"
}
},
"reshape": {
"type": "object",
"description": "Optional reshape configuration for the tensor",
"properties": {
"shape": {
"type": "array",
"items": {
"type": "integer"
}
}
}
},
"is_shape_tensor": {
"type": "boolean",
"description": "Whether this tensor is a shape tensor",
"default": false
},
"allow_ragged_batch": {
"type": "boolean",
"description": "Whether ragged batching is allowed for this tensor",
"default": false
}
}
},
"InstanceGroup": {
"type": "object",
"description": "Defines a group of model instances deployed on specific devices",
"properties": {
"name": {
"type": "string",
"description": "Name of the instance group"
},
"kind": {
"type": "string",
"description": "Device type for the instance group",
"enum": ["KIND_AUTO", "KIND_GPU", "KIND_CPU", "KIND_MODEL"],
"default": "KIND_AUTO"
},
"count": {
"type": "integer",
"minimum": 1,
"description": "Number of instances in this group",
"default": 1
},
"gpus": {
"type": "array",
"description": "GPU device IDs to use for this instance group",
"items": {
"type": "integer",
"minimum": 0
}
},
"rate_group": {
"type": "integer",
"description": "Rate limiter group assignment"
},
"rate_limit": {
"type": "object",
"description": "Rate limiting configuration for the instance group",
"properties": {
"resources": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"global": {
"type": "boolean"
},
"count": {
"type": "integer"
}
}
}
}
}
}
}
},
"DynamicBatching": {
"type": "object",
"description": "Dynamic batching configuration for combining multiple inference requests into a single batch",
"properties": {
"preferred_batch_size": {
"type": "array",
"description": "Preferred batch sizes for dynamic batching",
"items": {
"type": "integer",
"minimum": 1
}
},
"max_queue_delay_microseconds": {
"type": "integer",
"minimum": 0,
"description": "Maximum delay in microseconds to wait for forming a preferred batch"
},
"preserve_ordering": {
"type": "boolean",
"description": "Whether to preserve the ordering of responses",
"default": false
},
"priority_levels": {
"type": "integer",
"minimum": 0,
"description": "Number of priority levels for request scheduling"
},
"default_priority_level": {
"type": "integer",
"description": "Default priority level for requests"
},
"default_queue_policy": {
"$ref": "#/$defs/QueuePolicy"
},
"priority_queue_policy": {
"type": "object",
"description": "Per-priority-level queue policies",
"additionalProperties": {
"$ref": "#/$defs/QueuePolicy"
}
}
}
},
"QueuePolicy": {
"type": "object",
"description": "Queue management policy for inference requests",
"properties": {
"timeout_action": {
"type": "string",
"enum": ["REJECT", "DELAY"],
"description": "Action to take when a request times out in the queue"
},
"default_timeout_microseconds": {
"type": "integer",
"minimum": 0,
"description": "Default timeout in microseconds for queued requests"
},
"allow_timeout_override": {
"type": "boolean",
"description": "Whether requests can override the default timeout"
},
"max_queue_size": {
"type": "integer",
"minimum": 0,
"description": "Maximum number of requests in the queue"
}
}
},
"SequenceBatching": {
"type": "object",
"description": "Sequence batching configuration for stateful models that process ordered sequences of requests",
"properties": {
"max_sequence_idle_microseconds": {
"type": "integer",
"minimum": 0,
"description": "Maximum idle time for a sequence before it is automatically ended"
},
"control_input": {
"type": "array",
"description": "Control inputs for sequence management",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"control": {
"type": "array",
"items": {
"type": "object",
"properties": {
"kind": {
"type": "string",
"enum": [
"CONTROL_SEQUENCE_START",
"CONTROL_SEQUENCE_READY",
"CONTROL_SEQUENCE_END",
"CONTROL_SEQUENCE_CORRID"
]
},
"int32_false_true": {
"type": "array",
"items": { "type": "integer" }
},
"fp32_false_true": {
"type": "array",
"items": { "type": "number" }
},
"bool_false_true": {
"type": "array",
"items": { "type": "boolean" }
}
}
}
}
}
}
},
"state": {
"type": "array",
"description": "Implicit state configurations for the sequence",
"items": {
"type": "object",
"properties": {
"input_name": { "type": "string" },
"output_name": { "type": "string" },
"data_type": { "type": "string" },
"dims": {
"type": "array",
"items": { "type": "integer" }
}
}
}
}
}
},
"EnsembleScheduling": {
"type": "object",
"description": "Ensemble model scheduling configuration defining a pipeline of models",
"properties": {
"step": {
"type": "array",
"description": "Steps in the ensemble pipeline",
"items": {
"type": "object",
"required": ["model_name"],
"properties": {
"model_name": {
"type": "string",
"description": "Name of the model in this step"
},
"model_version": {
"type": "integer",
"description": "Version of the model to use (-1 for latest)"
},
"input_map": {
"type": "object",
"description": "Mapping from ensemble tensor names to step model input names",
"additionalProperties": { "type": "string" }
},
"output_map": {
"type": "object",
"description": "Mapping from step model output names to ensemble tensor names",
"additionalProperties": { "type": "string" }
}
}
}
}
}
},
"ModelWarmup": {
"type": "object",
"description": "Model warmup configuration for pre-heating the model",
"properties": {
"name": {
"type": "string",
"description": "Name of the warmup configuration"
},
"batch_size": {
"type": "integer",
"minimum": 1,
"description": "Batch size to use for warmup"
},
"inputs": {
"type": "object",
"description": "Input data specifications for warmup",
"additionalProperties": {
"type": "object",
"properties": {
"data_type": { "type": "string" },
"dims": {
"type": "array",
"items": { "type": "integer" }
},
"zero_data": { "type": "boolean" },
"random_data": { "type": "boolean" }
}
}
},
"count": {
"type": "integer",
"minimum": 1,
"description": "Number of warmup iterations"
}
}
},
"Optimization": {
"type": "object",
"description": "Model optimization settings",
"properties": {
"priority": {
"type": "string",
"enum": ["PRIORITY_DEFAULT", "PRIORITY_MIN", "PRIORITY_MAX"],
"description": "Optimization priority"
},
"execution_accelerators": {
"type": "object",
"description": "Execution accelerator configurations",
"properties": {
"gpu_execution_accelerator": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": { "type": "string" },
"parameters": {
"type": "object",
"additionalProperties": { "type": "string" }
}
}
}
},
"cpu_execution_accelerator": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": { "type": "string" },
"parameters": {
"type": "object",
"additionalProperties": { "type": "string" }
}
}
}
}
}
},
"input_pinned_memory": {
"type": "object",
"properties": {
"enable": { "type": "boolean" }
}
},
"output_pinned_memory": {
"type": "object",
"properties": {
"enable": { "type": "boolean" }
}
},
"gather_kernel_buffer_threshold": {
"type": "integer",
"description": "Threshold for using gather kernel for input tensor copy"
},
"eager_batching": {
"type": "boolean",
"description": "Whether to use eager batching"
}
}
}
}
}