LMNT · Schema

LMNT Speech Synthesis Request

Schema for a text-to-speech synthesis request to the LMNT Speech API.

text-to-speech · voice synthesis · voice cloning · audio streaming · conversational AI · low latency · real-time audio

Properties

Name	Type	Description
voice	string	Required. The ID of the voice to use for synthesis. Use the voices endpoint to list available voices.
text	string	Required. The text to synthesize into speech (maximum 5000 characters).
format	string	Audio output format: one of mp3, wav, webm, or aac. Defaults to mp3.
sample_rate	integer	Sample rate of the output audio in Hz: one of 8000, 16000, or 24000.
speed	number	Speaking speed multiplier, from 0.25 to 2.0. 1.0 is normal speed and is the default.
temperature	number	Expressiveness of the synthesized speech, from 0.0 to 1.0. Higher values produce more expressive output.
quality	string	Audio quality level for synthesis: one of low, medium, or high.
language	string	BCP-47 language tag for the synthesis language. LMNT supports 31 languages.
return_timestamps	boolean	If true, the response includes word-level timestamps. Defaults to false.
conversational	boolean	If true, optimizes for conversational speech patterns. Defaults to false.
model	string	The LMNT model to use for synthesis. Defaults to the latest Blizzard model.

View JSON Schema on GitHub

JSON Schema

{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://raw.githubusercontent.com/api-evangelist/lmnt/main/json-schema/lmnt-speech-synthesis-schema.json",
  "title": "LMNT Speech Synthesis Request",
  "description": "Schema for a text-to-speech synthesis request to the LMNT Speech API.",
  "type": "object",
  "required": ["voice", "text"],
  "properties": {
    "voice": {
      "type": "string",
      "description": "The ID of the voice to use for synthesis. Use the voices endpoint to list available voices."
    },
    "text": {
      "type": "string",
      "description": "The text to synthesize into speech.",
      "maxLength": 5000
    },
    "format": {
      "type": "string",
      "description": "Audio output format.",
      "enum": ["mp3", "wav", "webm", "aac"],
      "default": "mp3"
    },
    "sample_rate": {
      "type": "integer",
      "description": "Sample rate of the output audio in Hz.",
      "enum": [8000, 16000, 24000]
    },
    "speed": {
      "type": "number",
      "description": "Speaking speed multiplier. 1.0 is normal speed.",
      "minimum": 0.25,
      "maximum": 2.0,
      "default": 1.0
    },
    "temperature": {
      "type": "number",
      "description": "Expressiveness of the synthesized speech. Higher values produce more expressive output.",
      "minimum": 0.0,
      "maximum": 1.0
    },
    "quality": {
      "type": "string",
      "description": "Audio quality level for synthesis.",
      "enum": ["low", "medium", "high"]
    },
    "language": {
      "type": "string",
      "description": "BCP-47 language tag for the synthesis language. LMNT supports 31 languages.",
      "examples": ["en", "es", "fr", "de", "ja", "ko", "zh"]
    },
    "return_timestamps": {
      "type": "boolean",
      "description": "If true, the response includes word-level timestamps.",
      "default": false
    },
    "conversational": {
      "type": "boolean",
      "description": "If true, optimizes for conversational speech patterns.",
      "default": false
    },
    "model": {
      "type": "string",
      "description": "The LMNT model to use for synthesis. Defaults to the latest Blizzard model.",
      "examples": ["blizzard-2"]
    }
  }
}