Skip to content

Commit 237f4fd

Browse files
fully support ormsgpack (#518)
* fully support ormsgpack
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* dependency

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 0956e02 commit 237f4fd

File tree

5 files changed

+64
-77
lines changed

5 files changed

+64
-77
lines changed

pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ dependencies = [
4242
"funasr==1.1.5",
4343
"opencc-python-reimplemented==0.1.7",
4444
"silero-vad",
45+
"ormsgpack",
4546
]
4647

4748
[project.optional-dependencies]

tools/api.py

+1-33
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
from fish_speech.models.vqgan.modules.firefly import FireflyArchitecture
4040
from fish_speech.text.chn_text_norm.text import Text as ChnNormedText
4141
from fish_speech.utils import autocast_exclude_mps
42-
from tools.auto_rerank import batch_asr, calculate_wer, is_chinese, load_model
42+
from tools.commons import ServeReferenceAudio, ServeTTSRequest
4343
from tools.file import AUDIO_EXTENSIONS, audio_to_bytes, list_files, read_ref_text
4444
from tools.llama.generate import (
4545
GenerateRequest,
@@ -156,38 +156,6 @@ def decode_vq_tokens(
156156
routes = MultimethodRoutes(base_class=HttpView)
157157

158158

159-
class ServeReferenceAudio(BaseModel):
160-
audio: bytes
161-
text: str
162-
163-
164-
class ServeTTSRequest(BaseModel):
165-
text: str = "你说的对, 但是原神是一款由米哈游自主研发的开放世界手游."
166-
chunk_length: Annotated[int, conint(ge=100, le=300, strict=True)] = 200
167-
# Audio format
168-
format: Literal["wav", "pcm", "mp3"] = "wav"
169-
mp3_bitrate: Literal[64, 128, 192] = 128
170-
# References audios for in-context learning
171-
references: list[ServeReferenceAudio] = []
172-
# Reference id
173-
# For example, if you want use https://fish.audio/m/7f92f8afb8ec43bf81429cc1c9199cb1/
174-
# Just pass 7f92f8afb8ec43bf81429cc1c9199cb1
175-
reference_id: str | None = None
176-
# Normalize text for en & zh, this increase stability for numbers
177-
normalize: bool = True
178-
mp3_bitrate: Optional[int] = 64
179-
opus_bitrate: Optional[int] = -1000
180-
# Balance mode will reduce latency to 300ms, but may decrease stability
181-
latency: Literal["normal", "balanced"] = "normal"
182-
# not usually used below
183-
streaming: bool = False
184-
emotion: Optional[str] = None
185-
max_new_tokens: int = 1024
186-
top_p: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
187-
repetition_penalty: Annotated[float, Field(ge=0.9, le=2.0, strict=True)] = 1.2
188-
temperature: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
189-
190-
191159
def get_content_type(audio_format):
192160
if audio_format == "wav":
193161
return "audio/wav"

tools/commons.py

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
from typing import Annotated, Literal, Optional
2+
3+
from pydantic import BaseModel, Field, conint
4+
5+
6+
class ServeReferenceAudio(BaseModel):
7+
audio: bytes
8+
text: str
9+
10+
11+
class ServeTTSRequest(BaseModel):
12+
text: str
13+
chunk_length: Annotated[int, conint(ge=100, le=300, strict=True)] = 200
14+
# Audio format
15+
format: Literal["wav", "pcm", "mp3"] = "wav"
16+
mp3_bitrate: Literal[64, 128, 192] = 128
17+
# References audios for in-context learning
18+
references: list[ServeReferenceAudio] = []
19+
# Reference id
20+
# For example, if you want use https://fish.audio/m/7f92f8afb8ec43bf81429cc1c9199cb1/
21+
# Just pass 7f92f8afb8ec43bf81429cc1c9199cb1
22+
reference_id: str | None = None
23+
# Normalize text for en & zh, this increase stability for numbers
24+
normalize: bool = True
25+
mp3_bitrate: Optional[int] = 64
26+
opus_bitrate: Optional[int] = -1000
27+
# Balance mode will reduce latency to 300ms, but may decrease stability
28+
latency: Literal["normal", "balanced"] = "normal"
29+
# not usually used below
30+
streaming: bool = False
31+
emotion: Optional[str] = None
32+
max_new_tokens: int = 1024
33+
top_p: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
34+
repetition_penalty: Annotated[float, Field(ge=0.9, le=2.0, strict=True)] = 1.2
35+
temperature: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7

tools/msgpack_api.py

+1-35
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,7 @@
1-
from typing import Annotated, AsyncGenerator, Literal, Optional
2-
31
import httpx
42
import ormsgpack
5-
from pydantic import AfterValidator, BaseModel, Field, conint
6-
7-
8-
class ServeReferenceAudio(BaseModel):
9-
audio: bytes
10-
text: str
11-
12-
13-
class ServeTTSRequest(BaseModel):
14-
text: str
15-
chunk_length: Annotated[int, conint(ge=100, le=300, strict=True)] = 200
16-
# Audio format
17-
format: Literal["wav", "pcm", "mp3"] = "wav"
18-
mp3_bitrate: Literal[64, 128, 192] = 128
19-
# References audios for in-context learning
20-
references: list[ServeReferenceAudio] = []
21-
# Reference id
22-
# For example, if you want use https://fish.audio/m/7f92f8afb8ec43bf81429cc1c9199cb1/
23-
# Just pass 7f92f8afb8ec43bf81429cc1c9199cb1
24-
reference_id: str | None = None
25-
# Normalize text for en & zh, this increase stability for numbers
26-
normalize: bool = True
27-
mp3_bitrate: Optional[int] = 64
28-
opus_bitrate: Optional[int] = -1000
29-
# Balance mode will reduce latency to 300ms, but may decrease stability
30-
latency: Literal["normal", "balanced"] = "normal"
31-
# not usually used below
32-
streaming: bool = False
33-
emotion: Optional[str] = None
34-
max_new_tokens: int = 1024
35-
top_p: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
36-
repetition_penalty: Annotated[float, Field(ge=0.9, le=2.0, strict=True)] = 1.2
37-
temperature: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.7
383

4+
from tools.commons import ServeReferenceAudio, ServeTTSRequest
395

406
# priority: ref_id > references
417
request = ServeTTSRequest(

tools/post_api.py

+26-9
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
import argparse
22
import base64
33
import wave
4-
from pathlib import Path
54

5+
import ormsgpack
66
import pyaudio
77
import requests
88
from pydub import AudioSegment
99
from pydub.playback import play
1010

11+
from tools.commons import ServeReferenceAudio, ServeTTSRequest
1112
from tools.file import audio_to_bytes, read_ref_text
1213

1314

@@ -113,20 +114,26 @@ def parse_args():
113114
idstr: str | None = args.reference_id
114115
# priority: ref_id > [{text, audio},...]
115116
if idstr is None:
116-
base64_audios = [
117-
audio_to_bytes(ref_audio) for ref_audio in args.reference_audio
118-
]
119-
ref_texts = [read_ref_text(ref_text) for ref_text in args.reference_text]
117+
ref_audios = args.reference_audio
118+
ref_texts = args.reference_text
119+
if ref_audios is None:
120+
byte_audios = []
121+
else:
122+
byte_audios = [audio_to_bytes(ref_audio) for ref_audio in ref_audios]
123+
if ref_texts is None:
124+
ref_texts = []
125+
else:
126+
ref_texts = [read_ref_text(ref_text) for ref_text in ref_texts]
120127
else:
121-
base64_audios = []
128+
byte_audios = []
122129
ref_texts = []
123130
pass # in api.py
124131

125132
data = {
126133
"text": args.text,
127134
"references": [
128-
dict(text=ref_text, audio=ref_audio)
129-
for ref_text, ref_audio in zip(ref_texts, base64_audios)
135+
ServeReferenceAudio(audio=ref_audio, text=ref_text)
136+
for ref_text, ref_audio in zip(ref_texts, byte_audios)
130137
],
131138
"reference_id": idstr,
132139
"normalize": args.normalize,
@@ -143,7 +150,17 @@ def parse_args():
143150
"streaming": args.streaming,
144151
}
145152

146-
response = requests.post(args.url, json=data, stream=args.streaming)
153+
pydantic_data = ServeTTSRequest(**data)
154+
155+
response = requests.post(
156+
args.url,
157+
data=ormsgpack.packb(pydantic_data, option=ormsgpack.OPT_SERIALIZE_PYDANTIC),
158+
stream=args.streaming,
159+
headers={
160+
"authorization": "Bearer YOUR_API_KEY",
161+
"content-type": "application/msgpack",
162+
},
163+
)
147164

148165
if response.status_code == 200:
149166
if args.streaming:

0 commit comments

Comments
 (0)