feat(cartesia): sonic-3 (#3715)

davidzhao · web-flow · commit 84f225a64962 · 2025-10-25T11:11:04.000-07:00
diff --git a/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/models.py b/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/models.py
@@ -8,36 +8,72 @@
     # "pcm_alaw",
 ]
 
-TTSModels = Literal["sonic", "sonic-2", "sonic-lite", "sonic-preview", "sonic-turbo"]
+TTSModels = Literal["sonic", "sonic-2", "sonic-lite", "sonic-preview", "sonic-turbo", "sonic-3"]
 TTSLanguages = Literal["en", "es", "fr", "de", "pt", "zh", "ja"]
 TTSDefaultVoiceId = "f786b574-daa5-4673-aa0c-cbe3e8534c02"  # Katie - Friendly Fixer
 TTSVoiceSpeed = Literal["fastest", "fast", "normal", "slow", "slowest"]
+
+# up to date as of 2025-10-24, refer to https://docs.cartesia.ai/api-reference/tts/bytes#body-generation-config-emotion
 TTSVoiceEmotion = Literal[
-    "anger:lowest",
-    "anger:low",
-    "anger",
-    "anger:high",
-    "anger:highest",
-    "positivity:lowest",
-    "positivity:low",
-    "positivity",
-    "positivity:high",
-    "positivity:highest",
-    "surprise:lowest",
-    "surprise:low",
-    "surprise",
-    "surprise:high",
-    "surprise:highest",
-    "sadness:lowest",
-    "sadness:low",
-    "sadness",
-    "sadness:high",
-    "sadness:highest",
-    "curiosity:lowest",
-    "curiosity:low",
-    "curiosity",
-    "curiosity:high",
-    "curiosity:highest",
+    "Happy",
+    "Excited",
+    "Enthusiastic",
+    "Elated",
+    "Euphoric",
+    "Triumphant",
+    "Amazed",
+    "Surprised",
+    "Flirtatious",
+    "Joking/Comedic",
+    "Curious",
+    "Content",
+    "Peaceful",
+    "Serene",
+    "Calm",
+    "Grateful",
+    "Affectionate",
+    "Trust",
+    "Sympathetic",
+    "Anticipation",
+    "Mysterious",
+    "Angry",
+    "Mad",
+    "Outraged",
+    "Frustrated",
+    "Agitated",
+    "Threatened",
+    "Disgusted",
+    "Contempt",
+    "Envious",
+    "Sarcastic",
+    "Ironic",
+    "Sad",
+    "Dejected",
+    "Melancholic",
+    "Disappointed",
+    "Hurt",
+    "Guilty",
+    "Bored",
+    "Tired",
+    "Rejected",
+    "Nostalgic",
+    "Wistful",
+    "Apologetic",
+    "Hesitant",
+    "Insecure",
+    "Confused",
+    "Resigned",
+    "Anxious",
+    "Panicked",
+    "Alarmed",
+    "Scared",
+    "Neutral",
+    "Proud",
+    "Confident",
+    "Distant",
+    "Skeptical",
+    "Contemplative",
+    "Determined",
 ]
 
 # STT model definitions
@@ -89,3 +125,7 @@
     "or",
     "pa",
 ]
+
+
+def _is_sonic_3(model: str) -> bool:  # True when the model id starts with "sonic-3"
+    return model.startswith("sonic-3")
diff --git a/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/tts.py b/livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/tts.py
@@ -20,7 +20,7 @@
 import os
 import weakref
 from dataclasses import dataclass, replace
-from typing import Any, Optional, Union, cast
+from typing import Any, Union, cast
 
 import aiohttp
 
@@ -45,6 +45,7 @@
     TTSModels,
     TTSVoiceEmotion,
     TTSVoiceSpeed,
+    _is_sonic_3,
 )
 
 API_AUTH_HEADER = "X-API-Key"
@@ -62,6 +63,7 @@ class _TTSOptions:
     voice: str | list[float]
     speed: TTSVoiceSpeed | float | None
     emotion: list[TTSVoiceEmotion | str] | None
+    volume: float | None
     word_timestamps: bool
     api_key: str
     language: str
@@ -85,7 +87,8 @@ def __init__(
         encoding: TTSEncoding = "pcm_s16le",
         voice: str | list[float] = TTSDefaultVoiceId,
         speed: TTSVoiceSpeed | float | None = None,
-        emotion: list[TTSVoiceEmotion | str] | None = None,
+        emotion: TTSVoiceEmotion | str | list[TTSVoiceEmotion | str] | None = None,
+        volume: float | None = None,
         sample_rate: int = 24000,
         word_timestamps: bool = True,
         http_session: aiohttp.ClientSession | None = None,
@@ -104,8 +107,9 @@ def __init__(
             language (str, optional): The language code for synthesis. Defaults to "en".
             encoding (TTSEncoding, optional): The audio encoding format. Defaults to "pcm_s16le".
             voice (str | list[float], optional): The voice ID or embedding array.
-            speed (TTSVoiceSpeed | float, optional): Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
-            emotion (list[TTSVoiceEmotion], optional): Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
+            speed (TTSVoiceSpeed | float, optional): Speed of the speech. With sonic-3, the valid range is 0.6 to 2.0 (https://docs.cartesia.ai/api-reference/tts/bytes#body-generation-config-speed)
+            emotion (TTSVoiceEmotion | str | list[TTSVoiceEmotion | str], optional): Emotion of the speech. A single value is wrapped in a list; with sonic-3, only the first entry is sent (https://docs.cartesia.ai/api-reference/tts/bytes#body-generation-config-emotion)
+            volume (float, optional): Volume of the speech. With sonic-3, the valid range is 0.5 to 2.0
             sample_rate (int, optional): The audio sample rate in Hz. Defaults to 24000.
             word_timestamps (bool, optional): Whether to add word timestamps to the output. Defaults to True.
             api_key (str, optional): The Cartesia API key. If not provided, it will be read from the CARTESIA_API_KEY environment variable.
@@ -127,16 +131,8 @@ def __init__(
         if not cartesia_api_key:
             raise ValueError("CARTESIA_API_KEY must be set")
 
-        if speed or emotion:
-            if (
-                api_version != API_VERSION_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS
-                or model != MODEL_ID_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS
-            ):
-                logger.warning(
-                    f"speed and emotion controls are only supported for model '{MODEL_ID_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS}', and API version '{API_VERSION_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS}', "
-                    "see https://docs.cartesia.ai/developer-tools/changelog for details",
-                    extra={"model": model, "speed": speed, "emotion": emotion},
-                )
+        if isinstance(emotion, str):
+            emotion = [emotion]
 
         self._opts = _TTSOptions(
             model=model,
@@ -146,11 +142,16 @@ def __init__(
             voice=voice,
             speed=speed,
             emotion=emotion,
+            volume=volume,
             api_key=cartesia_api_key,
             base_url=base_url,
             word_timestamps=word_timestamps,
             api_version=api_version,
         )
+
+        if speed or emotion or volume:
+            self._check_generation_config()
+
         self._session = http_session
         self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse](
             connect_cb=self._connect_ws,
@@ -201,8 +202,9 @@ def update_options(
         model: NotGivenOr[TTSModels | str] = NOT_GIVEN,
         language: NotGivenOr[str] = NOT_GIVEN,
         voice: NotGivenOr[str | list[float]] = NOT_GIVEN,
-        speed: NotGivenOr[TTSVoiceSpeed | float | None] = NOT_GIVEN,
-        emotion: NotGivenOr[list[TTSVoiceEmotion | str] | None] = NOT_GIVEN,
+        speed: NotGivenOr[TTSVoiceSpeed | float] = NOT_GIVEN,
+        emotion: NotGivenOr[TTSVoiceEmotion | str | list[TTSVoiceEmotion | str]] = NOT_GIVEN,
+        volume: NotGivenOr[float] = NOT_GIVEN,
         api_version: NotGivenOr[str] = NOT_GIVEN,
     ) -> None:
         """
@@ -225,22 +227,17 @@ def update_options(
         if is_given(voice):
             self._opts.voice = cast(Union[str, list[float]], voice)
         if is_given(speed):
-            self._opts.speed = cast(Optional[Union[TTSVoiceSpeed, float]], speed)
+            self._opts.speed = cast(Union[TTSVoiceSpeed, float], speed)
         if is_given(emotion):
-            self._opts.emotion = emotion
+            emotion = [emotion] if isinstance(emotion, str) else emotion
+            self._opts.emotion = cast(list[Union[TTSVoiceEmotion, str]], emotion)
+        if is_given(volume):
+            self._opts.volume = volume
         if is_given(api_version):
             self._opts.api_version = api_version
 
         if speed or emotion:
-            if (
-                self._opts.api_version != API_VERSION_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS
-                or self._opts.model != MODEL_ID_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS
-            ):
-                logger.warning(
-                    f"speed and emotion controls are only supported for model '{MODEL_ID_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS}', and API version '{API_VERSION_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS}', "
-                    "see https://docs.cartesia.ai/developer-tools/changelog for details",
-                    extra={"model": self._opts.model, "speed": speed, "emotion": emotion},
-                )
+            self._check_generation_config()
 
     def synthesize(
         self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
@@ -261,6 +258,29 @@ async def aclose(self) -> None:
         self._streams.clear()
         await self._pool.aclose()
 
+    def _check_generation_config(self) -> None:
+        """Validate speed/volume for sonic-3; warn if another model/API version lacks controls."""
+        opts = self._opts
+        if _is_sonic_3(opts.model):
+            if opts.speed:
+                # non-numeric speed raises ValueError; ints are accepted (bool, an int subclass, is not)
+                if isinstance(opts.speed, bool) or not isinstance(opts.speed, (int, float)):
+                    raise ValueError("speed must be a float for sonic-3")
+                if not 0.6 <= opts.speed <= 2.0:
+                    logger.warning("speed must be between 0.6 and 2.0 for sonic-3")
+            if opts.volume is not None and not 0.5 <= opts.volume <= 2.0:
+                logger.warning("volume must be between 0.5 and 2.0 for sonic-3")
+        elif (
+            opts.api_version != API_VERSION_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS
+            or opts.model != MODEL_ID_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS
+        ):
+            # other models: warn unless both the model and the API version match the experimental pair
+            logger.warning(
+                f"speed and emotion controls are only supported for model '{MODEL_ID_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS}', and API version '{API_VERSION_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS}', "
+                "see https://docs.cartesia.ai/developer-tools/changelog for details",
+                extra={"model": opts.model, "speed": opts.speed, "emotion": opts.emotion},
+            )
+
 
 class ChunkedStream(tts.ChunkedStream):
     """Synthesize chunked text using the bytes endpoint"""
@@ -436,15 +456,16 @@ def _to_cartesia_options(opts: _TTSOptions, *, streaming: bool) -> dict[str, Any
         voice["mode"] = "embedding"
         voice["embedding"] = opts.voice
 
-    voice_controls: dict = {}
-    if opts.speed:
-        voice_controls["speed"] = opts.speed
+    if opts.api_version == API_VERSION_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS:
+        voice_controls: dict = {}
+        if opts.speed:
+            voice_controls["speed"] = opts.speed
 
-    if opts.emotion:
-        voice_controls["emotion"] = opts.emotion
+        if opts.emotion:
+            voice_controls["emotion"] = opts.emotion
 
-    if voice_controls:
-        voice["__experimental_controls"] = voice_controls
+        if voice_controls:
+            voice["__experimental_controls"] = voice_controls
 
     options: dict[str, Any] = {
         "model_id": opts.model,
@@ -456,6 +477,21 @@ def _to_cartesia_options(opts: _TTSOptions, *, streaming: bool) -> dict[str, Any
         },
         "language": opts.language,
     }
+
+    if opts.api_version > API_VERSION_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS and _is_sonic_3(
+        opts.model
+    ):
+        generation_config: dict[str, Any] = {}
+        if opts.speed:
+            generation_config["speed"] = opts.speed
+        if opts.emotion:
+            generation_config["emotion"] = opts.emotion[0]
+        if opts.volume:
+            generation_config["volume"] = opts.volume
+        if generation_config:
+            options["generation_config"] = generation_config
+
     if streaming:
         options["add_timestamps"] = opts.word_timestamps
+
     return options