Vapi · Schema

ServerMessageAssistantSpeech

AI · Voice Agents · Realtime · CPaaS

Properties

Name Type Description
phoneNumber object This is the phone number that the message is associated with.
type string This is the type of the message. "assistant.speechStarted" is sent as assistant audio is being played.
text string The full assistant text for the current turn. This is the complete text, not an incremental delta — consumers should use `timing` metadata (e.g. `wordsSpoken`) to determine which portion has been spoken so far.
turn number This is the turn number of the assistant speech event (0-indexed).
source string Indicates how the text was sourced.
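timing object Optional timing metadata. Shape depends on `timing.type`: - `word-alignment` (ElevenLabs): per-character timing at playback cadence. words[] includes space entries. Best consumed by tracking a running character count: join timing.words, add to a char cursor, and highlight text up to that position. No interpolation needed. - `word-progress` (Minimax with voice.subtitleType: 'word'): cursor-based word count per TTS segment. Use wordsSpoken as the anchor, interpolate forward using segmentDurationMs or timing.words until the next event arrives. When absent, the event is a text-only fallback for providers without word-level timing (e.g. Cartesia, Deepgram, Azure). Text emits once per TTS chunk when audio is playing. Optionally interpolate a word cursor at ~3.5 words/sec between events for approximate tracking.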
timestamp number This is the timestamp of the message.
artifact object This is a live version of the `call.artifact`. This matches what is stored on `call.artifact` after the call.
assistant object This is the assistant that the message is associated with.
customer object This is the customer that the message is associated with.
call object This is the call that the message is associated with.
chat object This is the chat object.
View JSON Schema on GitHub

JSON Schema

vapi-servermessageassistantspeech-schema.json Raw ↑
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "#/components/schemas/ServerMessageAssistantSpeech",
  "title": "ServerMessageAssistantSpeech",
  "type": "object",
  "properties": {
    "phoneNumber": {
      "description": "This is the phone number that the message is associated with.",
      "oneOf": [
        {
          "$ref": "#/components/schemas/CreateByoPhoneNumberDTO",
          "title": "ByoPhoneNumber"
        },
        {
          "$ref": "#/components/schemas/CreateTwilioPhoneNumberDTO",
          "title": "TwilioPhoneNumber"
        },
        {
          "$ref": "#/components/schemas/CreateVonagePhoneNumberDTO",
          "title": "VonagePhoneNumber"
        },
        {
          "$ref": "#/components/schemas/CreateVapiPhoneNumberDTO",
          "title": "VapiPhoneNumber"
        },
        {
          "$ref": "#/components/schemas/CreateTelnyxPhoneNumberDTO",
          "title": "TelnyxPhoneNumber"
        }
      ]
    },
    "type": {
      "type": "string",
      "description": "This is the type of the message. \"assistant-speech\" is sent as assistant audio is being played.",
      "enum": [
        "assistant.speechStarted"
      ]
    },
    "text": {
      "type": "string",
      "description": "The full assistant text for the current turn. This is the complete text,\nnot an incremental delta \u2014 consumers should use `timing` metadata (e.g.\n`wordsSpoken`) to determine which portion has been spoken so far."
    },
    "turn": {
      "type": "number",
      "description": "This is the turn number of the assistant speech event (0-indexed)."
    },
    "source": {
      "type": "string",
      "description": "Indicates how the text was sourced.",
      "enum": [
        "model",
        "force-say",
        "custom-voice"
      ]
    },
    "timing": {
      "description": "Optional timing metadata. Shape depends on `timing.type`:\n\n- `word-alignment` (ElevenLabs): per-character timing at playback\n  cadence. words[] includes space entries. Best consumed by tracking\n  a running character count: join timing.words, add to a char cursor,\n  and highlight text up to that position. No interpolation needed.\n\n- `word-progress` (Minimax with voice.subtitleType: 'word'): cursor-\n  based word count per TTS segment. Use wordsSpoken as the anchor,\n  interpolate forward using segmentDurationMs or timing.words until\n  the next event arrives.\n\nWhen absent, the event is a text-only fallback for providers without\nword-level timing (e.g. Cartesia, Deepgram, Azure). Text emits once\nper TTS chunk when audio is playing. Optionally interpolate a word\ncursor at ~3.5 words/sec between events for approximate tracking.",
      "oneOf": [
        {
          "$ref": "#/components/schemas/AssistantSpeechWordAlignmentTiming",
          "title": "WordAlignmentTiming"
        },
        {
          "$ref": "#/components/schemas/AssistantSpeechWordProgressTiming",
          "title": "WordProgressTiming"
        }
      ],
      "discriminator": {
        "propertyName": "type"
      }
    },
    "timestamp": {
      "type": "number",
      "description": "This is the timestamp of the message."
    },
    "artifact": {
      "description": "This is a live version of the `call.artifact`.\n\nThis matches what is stored on `call.artifact` after the call.",
      "allOf": [
        {
          "$ref": "#/components/schemas/Artifact"
        }
      ]
    },
    "assistant": {
      "description": "This is the assistant that the message is associated with.",
      "allOf": [
        {
          "$ref": "#/components/schemas/CreateAssistantDTO"
        }
      ]
    },
    "customer": {
      "description": "This is the customer that the message is associated with.",
      "allOf": [
        {
          "$ref": "#/components/schemas/CreateCustomerDTO"
        }
      ]
    },
    "call": {
      "description": "This is the call that the message is associated with.",
      "allOf": [
        {
          "$ref": "#/components/schemas/Call"
        }
      ]
    },
    "chat": {
      "description": "This is the chat object.",
      "allOf": [
        {
          "$ref": "#/components/schemas/Chat"
        }
      ]
    }
  },
  "required": [
    "type",
    "text"
  ]
}