Scalable Inference Serving · Schema
Inference Request
Open Inference Protocol V2 inference request submitted to a model serving endpoint via HTTP POST. Compatible with KServe, NVIDIA Triton, BentoML, and other OIP-compliant servers.
AI · CNCF · Deployment · Inference · Kubernetes · LLM · Machine Learning · Model Serving · MLOps · Scalability
Properties
| Name | Type | Description |
|---|---|---|
| id | string | Optional request identifier that will be echoed back in the response for correlation. |
| parameters | object | Optional key/value parameters passed to the model's pre/post-processing pipeline. |
| inputs | array | Required. Input tensors for the inference request; at least one must be supplied. Each tensor specifies its name, shape, datatype, and data. |
| outputs | array | Optional list of output tensors to return. If omitted, all model outputs are returned. |
JSON Schema
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://raw.githubusercontent.com/api-evangelist/scalable-inference-serving/main/json-schema/kserve-inference-request-schema.json",
  "title": "Inference Request",
  "description": "Open Inference Protocol V2 inference request submitted to a model serving endpoint via HTTP POST. Compatible with KServe, NVIDIA Triton, BentoML, and other OIP-compliant servers.",
  "type": "object",
  "required": ["inputs"],
  "properties": {
    "id": {
      "type": "string",
      "description": "Optional request identifier that will be echoed back in the response for correlation.",
      "examples": ["req-a1b2c3d4-e5f6-7890-abcd-ef1234567890"]
    },
    "parameters": {
      "type": "object",
      "description": "Optional key/value parameters passed to the model's pre/post-processing pipeline.",
      "additionalProperties": true
    },
    "inputs": {
      "type": "array",
      "description": "Input tensors for the inference request. Each tensor specifies its name, shape, datatype, and data.",
      "minItems": 1,
      "items": {
        "$ref": "#/$defs/RequestInput"
      }
    },
    "outputs": {
      "type": "array",
      "description": "Optional list of output tensors to return. If omitted, all model outputs are returned.",
      "items": {
        "$ref": "#/$defs/RequestOutput"
      }
    }
  },
  "$defs": {
    "RequestInput": {
      "type": "object",
      "title": "Request Input",
      "description": "A single named input tensor for an inference request.",
      "required": ["name", "shape", "datatype", "data"],
      "properties": {
        "name": {
          "type": "string",
          "description": "Tensor name as defined in the model's input specification."
        },
        "shape": {
          "type": "array",
          "description": "Shape of the tensor. Use -1 for variable-length or batch dimensions.",
          "items": {"type": "integer"},
          "examples": [[1, 128]]
        },
        "datatype": {
          "$ref": "#/$defs/TensorDatatype"
        },
        "parameters": {
          "type": "object",
          "additionalProperties": true,
          "description": "Optional tensor-level parameters."
        },
        "data": {
          "description": "Tensor data in row-major order. Nested arrays or flat array acceptable.",
          "oneOf": [
            {"type": "array", "items": {}},
            {"type": "string", "description": "Base64-encoded binary data for the binary tensor data extension."}
          ]
        }
      }
    },
    "RequestOutput": {
      "type": "object",
      "title": "Request Output",
      "description": "Specifies which model output tensor to include in the response.",
      "required": ["name"],
      "properties": {
        "name": {
          "type": "string",
          "description": "Name of the output tensor to include in the response."
        },
        "parameters": {
          "type": "object",
          "additionalProperties": true,
          "description": "Optional output-level parameters."
        }
      }
    },
    "TensorDatatype": {
      "type": "string",
      "title": "Tensor Datatype",
      "description": "Data type of a tensor per the Open Inference Protocol specification.",
      "enum": ["BOOL", "UINT8", "UINT16", "UINT32", "UINT64", "INT8", "INT16", "INT32", "INT64", "FP16", "FP32", "FP64", "BYTES", "STRING"]
    }
  }
}