{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "#/components/schemas/ClientMessageAssistantSpeech",
"title": "ClientMessageAssistantSpeech",
"type": "object",
"properties": {
"phoneNumber": {
"description": "This is the phone number that the message is associated with.",
"oneOf": [
{
"$ref": "#/components/schemas/CreateByoPhoneNumberDTO",
"title": "ByoPhoneNumber"
},
{
"$ref": "#/components/schemas/CreateTwilioPhoneNumberDTO",
"title": "TwilioPhoneNumber"
},
{
"$ref": "#/components/schemas/CreateVonagePhoneNumberDTO",
"title": "VonagePhoneNumber"
},
{
"$ref": "#/components/schemas/CreateVapiPhoneNumberDTO",
"title": "VapiPhoneNumber"
},
{
"$ref": "#/components/schemas/CreateTelnyxPhoneNumberDTO",
"title": "TelnyxPhoneNumber"
}
]
},
"type": {
"type": "string",
"description": "This is the type of the message. \"assistant-speech\" is sent as assistant audio is being played.",
"enum": [
"assistant.speechStarted"
]
},
"text": {
"type": "string",
"description": "The full assistant text for the current turn. This is the complete text,\nnot an incremental delta \u2014 consumers should use `timing` metadata (e.g.\n`wordsSpoken`) to determine which portion has been spoken so far."
},
"turn": {
"type": "number",
"description": "This is the turn number of the assistant speech event (0-indexed)."
},
"source": {
"type": "string",
"description": "Indicates how the text was sourced.",
"enum": [
"model",
"force-say",
"custom-voice"
]
},
"timing": {
"description": "Optional timing metadata. Shape depends on `timing.type`:\n\n- `word-alignment` (ElevenLabs): per-character timing at playback\n cadence. words[] includes space entries. Best consumed by tracking\n a running character count: join timing.words, add to a char cursor,\n and highlight text up to that position. No interpolation needed.\n\n- `word-progress` (Minimax with voice.subtitleType: 'word'): cursor-\n based word count per TTS segment. Use wordsSpoken as the anchor,\n interpolate forward using segmentDurationMs or timing.words until\n the next event arrives.\n\nWhen absent, the event is a text-only fallback for providers without\nword-level timing (e.g. Cartesia, Deepgram, Azure). Text emits once\nper TTS chunk when audio is playing. Optionally interpolate a word\ncursor at ~3.5 words/sec between events for approximate tracking.",
"oneOf": [
{
"$ref": "#/components/schemas/AssistantSpeechWordAlignmentTiming",
"title": "WordAlignmentTiming"
},
{
"$ref": "#/components/schemas/AssistantSpeechWordProgressTiming",
"title": "WordProgressTiming"
}
],
"discriminator": {
"propertyName": "type"
}
},
"timestamp": {
"type": "number",
"description": "This is the timestamp of the message."
},
"call": {
"description": "This is the call that the message is associated with.",
"allOf": [
{
"$ref": "#/components/schemas/Call"
}
]
},
"customer": {
"description": "This is the customer that the message is associated with.",
"allOf": [
{
"$ref": "#/components/schemas/CreateCustomerDTO"
}
]
},
"assistant": {
"description": "This is the assistant that the message is associated with.",
"allOf": [
{
"$ref": "#/components/schemas/CreateAssistantDTO"
}
]
}
},
"required": [
"type",
"text"
]
}