Skip to content

Commit 84f225a

Browse files
authored
feat(cartesia): sonic-3 (#3715)
1 parent c007eef commit 84f225a

File tree

2 files changed

+136
-60
lines changed
  • livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia

2 files changed

+136
-60
lines changed

livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/models.py

Lines changed: 66 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -8,36 +8,72 @@
88
# "pcm_alaw",
99
]
1010

11-
TTSModels = Literal["sonic", "sonic-2", "sonic-lite", "sonic-preview", "sonic-turbo"]
11+
TTSModels = Literal["sonic", "sonic-2", "sonic-lite", "sonic-preview", "sonic-turbo", "sonic-3"]
1212
TTSLanguages = Literal["en", "es", "fr", "de", "pt", "zh", "ja"]
1313
TTSDefaultVoiceId = "f786b574-daa5-4673-aa0c-cbe3e8534c02" # Katie - Friendly Fixer
1414
TTSVoiceSpeed = Literal["fastest", "fast", "normal", "slow", "slowest"]
15+
16+
# up to date as of 2025-10-24, refer to https://docs.cartesia.ai/api-reference/tts/bytes#body-generation-config-emotion
1517
TTSVoiceEmotion = Literal[
16-
"anger:lowest",
17-
"anger:low",
18-
"anger",
19-
"anger:high",
20-
"anger:highest",
21-
"positivity:lowest",
22-
"positivity:low",
23-
"positivity",
24-
"positivity:high",
25-
"positivity:highest",
26-
"surprise:lowest",
27-
"surprise:low",
28-
"surprise",
29-
"surprise:high",
30-
"surprise:highest",
31-
"sadness:lowest",
32-
"sadness:low",
33-
"sadness",
34-
"sadness:high",
35-
"sadness:highest",
36-
"curiosity:lowest",
37-
"curiosity:low",
38-
"curiosity",
39-
"curiosity:high",
40-
"curiosity:highest",
18+
"Happy",
19+
"Excited",
20+
"Enthusiastic",
21+
"Elated",
22+
"Euphoric",
23+
"Triumphant",
24+
"Amazed",
25+
"Surprised",
26+
"Flirtatious",
27+
"Joking/Comedic",
28+
"Curious",
29+
"Content",
30+
"Peaceful",
31+
"Serene",
32+
"Calm",
33+
"Grateful",
34+
"Affectionate",
35+
"Trust",
36+
"Sympathetic",
37+
"Anticipation",
38+
"Mysterious",
39+
"Angry",
40+
"Mad",
41+
"Outraged",
42+
"Frustrated",
43+
"Agitated",
44+
"Threatened",
45+
"Disgusted",
46+
"Contempt",
47+
"Envious",
48+
"Sarcastic",
49+
"Ironic",
50+
"Sad",
51+
"Dejected",
52+
"Melancholic",
53+
"Disappointed",
54+
"Hurt",
55+
"Guilty",
56+
"Bored",
57+
"Tired",
58+
"Rejected",
59+
"Nostalgic",
60+
"Wistful",
61+
"Apologetic",
62+
"Hesitant",
63+
"Insecure",
64+
"Confused",
65+
"Resigned",
66+
"Anxious",
67+
"Panicked",
68+
"Alarmed",
69+
"Scared",
70+
"Neutral",
71+
"Proud",
72+
"Confident",
73+
"Distant",
74+
"Skeptical",
75+
"Contemplative",
76+
"Determined",
4177
]
4278

4379
# STT model definitions
@@ -89,3 +125,7 @@
89125
"or",
90126
"pa",
91127
]
128+
129+
130+
def _is_sonic_3(model: str) -> bool:
131+
return model.startswith("sonic-3")

livekit-plugins/livekit-plugins-cartesia/livekit/plugins/cartesia/tts.py

Lines changed: 70 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
import os
2121
import weakref
2222
from dataclasses import dataclass, replace
23-
from typing import Any, Optional, Union, cast
23+
from typing import Any, Union, cast
2424

2525
import aiohttp
2626

@@ -45,6 +45,7 @@
4545
TTSModels,
4646
TTSVoiceEmotion,
4747
TTSVoiceSpeed,
48+
_is_sonic_3,
4849
)
4950

5051
API_AUTH_HEADER = "X-API-Key"
@@ -62,6 +63,7 @@ class _TTSOptions:
6263
voice: str | list[float]
6364
speed: TTSVoiceSpeed | float | None
6465
emotion: list[TTSVoiceEmotion | str] | None
66+
volume: float | None
6567
word_timestamps: bool
6668
api_key: str
6769
language: str
@@ -85,7 +87,8 @@ def __init__(
8587
encoding: TTSEncoding = "pcm_s16le",
8688
voice: str | list[float] = TTSDefaultVoiceId,
8789
speed: TTSVoiceSpeed | float | None = None,
88-
emotion: list[TTSVoiceEmotion | str] | None = None,
90+
emotion: TTSVoiceEmotion | str | list[TTSVoiceEmotion | str] | None = None,
91+
volume: float | None = None,
8992
sample_rate: int = 24000,
9093
word_timestamps: bool = True,
9194
http_session: aiohttp.ClientSession | None = None,
@@ -104,8 +107,9 @@ def __init__(
104107
language (str, optional): The language code for synthesis. Defaults to "en".
105108
encoding (TTSEncoding, optional): The audio encoding format. Defaults to "pcm_s16le".
106109
voice (str | list[float], optional): The voice ID or embedding array.
107-
speed (TTSVoiceSpeed | float, optional): Voice Control - Speed (https://docs.cartesia.ai/user-guides/voice-control)
108-
emotion (list[TTSVoiceEmotion], optional): Voice Control - Emotion (https://docs.cartesia.ai/user-guides/voice-control)
110+
speed (TTSVoiceSpeed | float, optional): Speed of speech, with sonic-3, the value is valid between 0.6 and 2.0 (https://docs.cartesia.ai/api-reference/tts/bytes#body-generation-config-speed)
111+
emotion (list[TTSVoiceEmotion], optional): Emotion of the speech (https://docs.cartesia.ai/api-reference/tts/bytes#body-generation-config-emotion)
112+
volume (float, optional): Volume of the speech, with sonic-3, the value is valid between 0.5 and 2.0
109113
sample_rate (int, optional): The audio sample rate in Hz. Defaults to 24000.
110114
word_timestamps (bool, optional): Whether to add word timestamps to the output. Defaults to True.
111115
api_key (str, optional): The Cartesia API key. If not provided, it will be read from the CARTESIA_API_KEY environment variable.
@@ -127,16 +131,8 @@ def __init__(
127131
if not cartesia_api_key:
128132
raise ValueError("CARTESIA_API_KEY must be set")
129133

130-
if speed or emotion:
131-
if (
132-
api_version != API_VERSION_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS
133-
or model != MODEL_ID_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS
134-
):
135-
logger.warning(
136-
f"speed and emotion controls are only supported for model '{MODEL_ID_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS}', and API version '{API_VERSION_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS}', "
137-
"see https://docs.cartesia.ai/developer-tools/changelog for details",
138-
extra={"model": model, "speed": speed, "emotion": emotion},
139-
)
134+
if isinstance(emotion, str):
135+
emotion = [emotion]
140136

141137
self._opts = _TTSOptions(
142138
model=model,
@@ -146,11 +142,16 @@ def __init__(
146142
voice=voice,
147143
speed=speed,
148144
emotion=emotion,
145+
volume=volume,
149146
api_key=cartesia_api_key,
150147
base_url=base_url,
151148
word_timestamps=word_timestamps,
152149
api_version=api_version,
153150
)
151+
152+
if speed or emotion or volume:
153+
self._check_generation_config()
154+
154155
self._session = http_session
155156
self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse](
156157
connect_cb=self._connect_ws,
@@ -201,8 +202,9 @@ def update_options(
201202
model: NotGivenOr[TTSModels | str] = NOT_GIVEN,
202203
language: NotGivenOr[str] = NOT_GIVEN,
203204
voice: NotGivenOr[str | list[float]] = NOT_GIVEN,
204-
speed: NotGivenOr[TTSVoiceSpeed | float | None] = NOT_GIVEN,
205-
emotion: NotGivenOr[list[TTSVoiceEmotion | str] | None] = NOT_GIVEN,
205+
speed: NotGivenOr[TTSVoiceSpeed | float] = NOT_GIVEN,
206+
emotion: NotGivenOr[TTSVoiceEmotion | str | list[TTSVoiceEmotion | str]] = NOT_GIVEN,
207+
volume: NotGivenOr[float] = NOT_GIVEN,
206208
api_version: NotGivenOr[str] = NOT_GIVEN,
207209
) -> None:
208210
"""
@@ -225,22 +227,17 @@ def update_options(
225227
if is_given(voice):
226228
self._opts.voice = cast(Union[str, list[float]], voice)
227229
if is_given(speed):
228-
self._opts.speed = cast(Optional[Union[TTSVoiceSpeed, float]], speed)
230+
self._opts.speed = cast(Union[TTSVoiceSpeed, float], speed)
229231
if is_given(emotion):
230-
self._opts.emotion = emotion
232+
emotion = [emotion] if isinstance(emotion, str) else emotion
233+
self._opts.emotion = cast(list[Union[TTSVoiceEmotion, str]], emotion)
234+
if is_given(volume):
235+
self._opts.volume = volume
231236
if is_given(api_version):
232237
self._opts.api_version = api_version
233238

234239
if speed or emotion:
235-
if (
236-
self._opts.api_version != API_VERSION_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS
237-
or self._opts.model != MODEL_ID_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS
238-
):
239-
logger.warning(
240-
f"speed and emotion controls are only supported for model '{MODEL_ID_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS}', and API version '{API_VERSION_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS}', "
241-
"see https://docs.cartesia.ai/developer-tools/changelog for details",
242-
extra={"model": self._opts.model, "speed": speed, "emotion": emotion},
243-
)
240+
self._check_generation_config()
244241

245242
def synthesize(
246243
self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
@@ -261,6 +258,29 @@ async def aclose(self) -> None:
261258
self._streams.clear()
262259
await self._pool.aclose()
263260

261+
def _check_generation_config(self) -> None:
    """Validate the speed / emotion / volume options against the selected model.

    For sonic-3 models:
      * ``speed`` must be numeric; a warning is logged when it is outside
        the range 0.6 to 2.0.
      * ``volume`` logs a warning when it is outside the range 0.5 to 2.0.
      * a warning is logged when more than one emotion is configured.

    For every other model, a warning is logged unless both the model and the
    API version are the ones that support the experimental voice controls.

    Raises:
        ValueError: if the model is sonic-3 and ``speed`` is set to a
            non-numeric value (for example a ``TTSVoiceSpeed`` label such
            as ``"fast"``).
    """
    opts = self._opts

    if _is_sonic_3(opts.model):
        speed = opts.speed
        # bool is a subclass of int, so exclude it explicitly; plain ints
        # such as 1 or 2 are valid speeds and must not be rejected.
        speed_is_number = isinstance(speed, (int, float)) and not isinstance(speed, bool)
        if speed_is_number:
            if not 0.6 <= speed <= 2.0:
                logger.warning("speed must be between 0.6 and 2.0 for sonic-3")
        elif speed:
            # Any other truthy value (e.g. the "fast"/"slow" labels) is rejected.
            raise ValueError("speed must be a float for sonic-3")

        if opts.volume is not None and not 0.5 <= opts.volume <= 2.0:
            logger.warning("volume must be between 0.5 and 2.0 for sonic-3")

        # The request builder in this file forwards only the first emotion
        # for sonic-3, so surface that instead of dropping the rest silently.
        if opts.emotion and len(opts.emotion) > 1:
            logger.warning(
                "only the first emotion is used for sonic-3",
                extra={"model": opts.model, "emotion": opts.emotion},
            )
        return

    if (
        opts.api_version != API_VERSION_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS
        or opts.model != MODEL_ID_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS
    ):
        logger.warning(
            f"speed and emotion controls are only supported for model '{MODEL_ID_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS}', and API version '{API_VERSION_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS}', "
            "see https://docs.cartesia.ai/developer-tools/changelog for details",
            extra={
                "model": opts.model,
                "speed": opts.speed,
                "emotion": opts.emotion,
                # volume alone can trigger this check, so log it as well
                "volume": opts.volume,
            },
        )
283+
264284

265285
class ChunkedStream(tts.ChunkedStream):
266286
"""Synthesize chunked text using the bytes endpoint"""
@@ -436,15 +456,16 @@ def _to_cartesia_options(opts: _TTSOptions, *, streaming: bool) -> dict[str, Any
436456
voice["mode"] = "embedding"
437457
voice["embedding"] = opts.voice
438458

439-
voice_controls: dict = {}
440-
if opts.speed:
441-
voice_controls["speed"] = opts.speed
459+
if opts.api_version == API_VERSION_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS:
460+
voice_controls: dict = {}
461+
if opts.speed:
462+
voice_controls["speed"] = opts.speed
442463

443-
if opts.emotion:
444-
voice_controls["emotion"] = opts.emotion
464+
if opts.emotion:
465+
voice_controls["emotion"] = opts.emotion
445466

446-
if voice_controls:
447-
voice["__experimental_controls"] = voice_controls
467+
if voice_controls:
468+
voice["__experimental_controls"] = voice_controls
448469

449470
options: dict[str, Any] = {
450471
"model_id": opts.model,
@@ -456,6 +477,21 @@ def _to_cartesia_options(opts: _TTSOptions, *, streaming: bool) -> dict[str, Any
456477
},
457478
"language": opts.language,
458479
}
480+
481+
if opts.api_version > API_VERSION_WITH_EMBEDDINGS_AND_EXPERIMENTAL_CONTROLS and _is_sonic_3(
482+
opts.model
483+
):
484+
generation_config: dict[str, Any] = {}
485+
if opts.speed:
486+
generation_config["speed"] = opts.speed
487+
if opts.emotion:
488+
generation_config["emotion"] = opts.emotion[0]
489+
if opts.volume:
490+
generation_config["volume"] = opts.volume
491+
if generation_config:
492+
options["generation_config"] = generation_config
493+
459494
if streaming:
460495
options["add_timestamps"] = opts.word_timestamps
496+
461497
return options

0 commit comments

Comments
 (0)