Triton Inference Server Model

A machine learning model managed by NVIDIA Triton Inference Server, including its configuration, input/output tensor definitions, batching strategy, and deployment settings.

AI, Deep Learning, Inference, Machine Learning, Model Serving, NVIDIA, Open Source

Properties

Name Type Description
name string Unique name of the model within the model repository
platform string Framework platform of the model
backend string Backend used by the model for inference execution
version_policy object Policy for selecting which model versions are available for inference
max_batch_size integer Maximum batch size supported by the model. A value of 0 means batching is disabled.
input array Input tensor definitions for the model
output array Output tensor definitions for the model
instance_group array Instance group configurations specifying how model instances are deployed across devices
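dynamic_batching object Dynamic batching configuration for combining multiple inference requests into a single batch
sequence_batching object Sequence batching configuration for stateful models that process ordered sequences of requests
ensemble_scheduling object Ensemble model scheduling configuration defining a pipeline of models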
parameters object Custom key-value parameters for the model
model_warmup array Warmup configurations to pre-heat the model after loading
optimization object Model optimization settings
response_cache object Response cache configuration
View JSON Schema on GitHub

JSON Schema

triton-model-schema.json Raw ↑
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://developer.nvidia.com/schemas/triton/model.json",
  "title": "Triton Inference Server Model",
  "description": "A machine learning model managed by NVIDIA Triton Inference Server, including its configuration, input/output tensor definitions, batching strategy, and deployment settings.",
  "type": "object",
  "required": ["name"],
  "properties": {
    "name": {
      "type": "string",
      "description": "Unique name of the model within the model repository",
      "minLength": 1
    },
    "platform": {
      "type": "string",
      "description": "Framework platform of the model",
      "enum": [
        "tensorrt_plan",
        "tensorflow_graphdef",
        "tensorflow_savedmodel",
        "onnxruntime_onnx",
        "pytorch_libtorch",
        "python",
        "ensemble"
      ]
    },
    "backend": {
      "type": "string",
      "description": "Backend used by the model for inference execution",
      "examples": ["tensorrt", "tensorflow", "onnxruntime", "pytorch", "python", "openvino", "fil"]
    },
    "version_policy": {
      "$ref": "#/$defs/VersionPolicy"
    },
    "max_batch_size": {
      "type": "integer",
      "minimum": 0,
      "description": "Maximum batch size supported by the model. A value of 0 means batching is disabled."
    },
    "input": {
      "type": "array",
      "description": "Input tensor definitions for the model",
      "items": {
        "$ref": "#/$defs/TensorConfig"
      }
    },
    "output": {
      "type": "array",
      "description": "Output tensor definitions for the model",
      "items": {
        "$ref": "#/$defs/TensorConfig"
      }
    },
    "instance_group": {
      "type": "array",
      "description": "Instance group configurations specifying how model instances are deployed across devices",
      "items": {
        "$ref": "#/$defs/InstanceGroup"
      }
    },
    "dynamic_batching": {
      "$ref": "#/$defs/DynamicBatching"
    },
    "sequence_batching": {
      "$ref": "#/$defs/SequenceBatching"
    },
    "ensemble_scheduling": {
      "$ref": "#/$defs/EnsembleScheduling"
    },
    "parameters": {
      "type": "object",
      "description": "Custom key-value parameters for the model",
      "additionalProperties": {
        "type": "object",
        "properties": {
          "string_value": {
            "type": "string"
          }
        }
      }
    },
    "model_warmup": {
      "type": "array",
      "description": "Warmup configurations to pre-heat the model after loading",
      "items": {
        "$ref": "#/$defs/ModelWarmup"
      }
    },
    "optimization": {
      "$ref": "#/$defs/Optimization"
    },
    "response_cache": {
      "type": "object",
      "description": "Response cache configuration",
      "properties": {
        "enable": {
          "type": "boolean",
          "description": "Whether response caching is enabled for this model"
        }
      }
    }
  },
  "$defs": {
    "VersionPolicy": {
      "type": "object",
      "description": "Policy for selecting which model versions are available for inference",
      "properties": {
        "latest": {
          "type": "object",
          "description": "Serve the N most recent versions",
          "properties": {
            "num_versions": {
              "type": "integer",
              "minimum": 1,
              "description": "Number of latest versions to serve"
            }
          }
        },
        "all": {
          "type": "object",
          "description": "Serve all available versions"
        },
        "specific": {
          "type": "object",
          "description": "Serve only the specified versions",
          "properties": {
            "versions": {
              "type": "array",
              "items": {
                "type": "integer"
              },
              "description": "List of specific version numbers to serve"
            }
          }
        }
      }
    },
    "TensorConfig": {
      "type": "object",
      "description": "Configuration for a model input or output tensor",
      "required": ["name", "data_type", "dims"],
      "properties": {
        "name": {
          "type": "string",
          "description": "Name of the tensor"
        },
        "data_type": {
          "type": "string",
          "description": "Data type of the tensor elements",
          "enum": [
            "TYPE_BOOL",
            "TYPE_UINT8",
            "TYPE_UINT16",
            "TYPE_UINT32",
            "TYPE_UINT64",
            "TYPE_INT8",
            "TYPE_INT16",
            "TYPE_INT32",
            "TYPE_INT64",
            "TYPE_FP16",
            "TYPE_FP32",
            "TYPE_FP64",
            "TYPE_STRING",
            "TYPE_BF16"
          ]
        },
        "dims": {
          "type": "array",
          "description": "Tensor dimensions. Use -1 for variable-length dimensions.",
          "items": {
            "type": "integer"
          }
        },
        "reshape": {
          "type": "object",
          "description": "Optional reshape configuration for the tensor",
          "properties": {
            "shape": {
              "type": "array",
              "items": {
                "type": "integer"
              }
            }
          }
        },
        "is_shape_tensor": {
          "type": "boolean",
          "description": "Whether this tensor is a shape tensor",
          "default": false
        },
        "allow_ragged_batch": {
          "type": "boolean",
          "description": "Whether ragged batching is allowed for this tensor",
          "default": false
        }
      }
    },
    "InstanceGroup": {
      "type": "object",
      "description": "Defines a group of model instances deployed on specific devices",
      "properties": {
        "name": {
          "type": "string",
          "description": "Name of the instance group"
        },
        "kind": {
          "type": "string",
          "description": "Device type for the instance group",
          "enum": ["KIND_AUTO", "KIND_GPU", "KIND_CPU", "KIND_MODEL"],
          "default": "KIND_AUTO"
        },
        "count": {
          "type": "integer",
          "minimum": 1,
          "description": "Number of instances in this group",
          "default": 1
        },
        "gpus": {
          "type": "array",
          "description": "GPU device IDs to use for this instance group",
          "items": {
            "type": "integer",
            "minimum": 0
          }
        },
        "rate_group": {
          "type": "integer",
          "description": "Rate limiter group assignment"
        },
        "rate_limit": {
          "type": "object",
          "description": "Rate limiting configuration for the instance group",
          "properties": {
            "resources": {
              "type": "array",
              "items": {
                "type": "object",
                "properties": {
                  "name": {
                    "type": "string"
                  },
                  "global": {
                    "type": "boolean"
                  },
                  "count": {
                    "type": "integer"
                  }
                }
              }
            }
          }
        }
      }
    },
    "DynamicBatching": {
      "type": "object",
      "description": "Dynamic batching configuration for combining multiple inference requests into a single batch",
      "properties": {
        "preferred_batch_size": {
          "type": "array",
          "description": "Preferred batch sizes for dynamic batching",
          "items": {
            "type": "integer",
            "minimum": 1
          }
        },
        "max_queue_delay_microseconds": {
          "type": "integer",
          "minimum": 0,
          "description": "Maximum delay in microseconds to wait for forming a preferred batch"
        },
        "preserve_ordering": {
          "type": "boolean",
          "description": "Whether to preserve the ordering of responses",
          "default": false
        },
        "priority_levels": {
          "type": "integer",
          "minimum": 0,
          "description": "Number of priority levels for request scheduling"
        },
        "default_priority_level": {
          "type": "integer",
          "description": "Default priority level for requests"
        },
        "default_queue_policy": {
          "$ref": "#/$defs/QueuePolicy"
        },
        "priority_queue_policy": {
          "type": "object",
          "description": "Per-priority-level queue policies",
          "additionalProperties": {
            "$ref": "#/$defs/QueuePolicy"
          }
        }
      }
    },
    "QueuePolicy": {
      "type": "object",
      "description": "Queue management policy for inference requests",
      "properties": {
        "timeout_action": {
          "type": "string",
          "enum": ["REJECT", "DELAY"],
          "description": "Action to take when a request times out in the queue"
        },
        "default_timeout_microseconds": {
          "type": "integer",
          "minimum": 0,
          "description": "Default timeout in microseconds for queued requests"
        },
        "allow_timeout_override": {
          "type": "boolean",
          "description": "Whether requests can override the default timeout"
        },
        "max_queue_size": {
          "type": "integer",
          "minimum": 0,
          "description": "Maximum number of requests in the queue"
        }
      }
    },
    "SequenceBatching": {
      "type": "object",
      "description": "Sequence batching configuration for stateful models that process ordered sequences of requests",
      "properties": {
        "max_sequence_idle_microseconds": {
          "type": "integer",
          "minimum": 0,
          "description": "Maximum idle time for a sequence before it is automatically ended"
        },
        "control_input": {
          "type": "array",
          "description": "Control inputs for sequence management",
          "items": {
            "type": "object",
            "properties": {
              "name": {
                "type": "string"
              },
              "control": {
                "type": "array",
                "items": {
                  "type": "object",
                  "properties": {
                    "kind": {
                      "type": "string",
                      "enum": [
                        "CONTROL_SEQUENCE_START",
                        "CONTROL_SEQUENCE_READY",
                        "CONTROL_SEQUENCE_END",
                        "CONTROL_SEQUENCE_CORRID"
                      ]
                    },
                    "int32_false_true": {
                      "type": "array",
                      "items": { "type": "integer" }
                    },
                    "fp32_false_true": {
                      "type": "array",
                      "items": { "type": "number" }
                    },
                    "bool_false_true": {
                      "type": "array",
                      "items": { "type": "boolean" }
                    }
                  }
                }
              }
            }
          }
        },
        "state": {
          "type": "array",
          "description": "Implicit state configurations for the sequence",
          "items": {
            "type": "object",
            "properties": {
              "input_name": { "type": "string" },
              "output_name": { "type": "string" },
              "data_type": { "type": "string" },
              "dims": {
                "type": "array",
                "items": { "type": "integer" }
              }
            }
          }
        }
      }
    },
    "EnsembleScheduling": {
      "type": "object",
      "description": "Ensemble model scheduling configuration defining a pipeline of models",
      "properties": {
        "step": {
          "type": "array",
          "description": "Steps in the ensemble pipeline",
          "items": {
            "type": "object",
            "required": ["model_name"],
            "properties": {
              "model_name": {
                "type": "string",
                "description": "Name of the model in this step"
              },
              "model_version": {
                "type": "integer",
                "description": "Version of the model to use (-1 for latest)"
              },
              "input_map": {
                "type": "object",
                "description": "Mapping from ensemble tensor names to step model input names",
                "additionalProperties": { "type": "string" }
              },
              "output_map": {
                "type": "object",
                "description": "Mapping from step model output names to ensemble tensor names",
                "additionalProperties": { "type": "string" }
              }
            }
          }
        }
      }
    },
    "ModelWarmup": {
      "type": "object",
      "description": "Model warmup configuration for pre-heating the model",
      "properties": {
        "name": {
          "type": "string",
          "description": "Name of the warmup configuration"
        },
        "batch_size": {
          "type": "integer",
          "minimum": 1,
          "description": "Batch size to use for warmup"
        },
        "inputs": {
          "type": "object",
          "description": "Input data specifications for warmup",
          "additionalProperties": {
            "type": "object",
            "properties": {
              "data_type": { "type": "string" },
              "dims": {
                "type": "array",
                "items": { "type": "integer" }
              },
              "zero_data": { "type": "boolean" },
              "random_data": { "type": "boolean" }
            }
          }
        },
        "count": {
          "type": "integer",
          "minimum": 1,
          "description": "Number of warmup iterations"
        }
      }
    },
    "Optimization": {
      "type": "object",
      "description": "Model optimization settings",
      "properties": {
        "priority": {
          "type": "string",
          "enum": ["PRIORITY_DEFAULT", "PRIORITY_MIN", "PRIORITY_MAX"],
          "description": "Optimization priority"
        },
        "execution_accelerators": {
          "type": "object",
          "description": "Execution accelerator configurations",
          "properties": {
            "gpu_execution_accelerator": {
              "type": "array",
              "items": {
                "type": "object",
                "properties": {
                  "name": { "type": "string" },
                  "parameters": {
                    "type": "object",
                    "additionalProperties": { "type": "string" }
                  }
                }
              }
            },
            "cpu_execution_accelerator": {
              "type": "array",
              "items": {
                "type": "object",
                "properties": {
                  "name": { "type": "string" },
                  "parameters": {
                    "type": "object",
                    "additionalProperties": { "type": "string" }
                  }
                }
              }
            }
          }
        },
        "input_pinned_memory": {
          "type": "object",
          "properties": {
            "enable": { "type": "boolean" }
          }
        },
        "output_pinned_memory": {
          "type": "object",
          "properties": {
            "enable": { "type": "boolean" }
          }
        },
        "gather_kernel_buffer_threshold": {
          "type": "integer",
          "description": "Threshold for using gather kernel for input tensor copy"
        },
        "eager_batching": {
          "type": "boolean",
          "description": "Whether to use eager batching"
        }
      }
    }
  }
}