Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

求助!使用TTS默认参数生成的语音感觉遇到逗号或句号没有停顿 #2011

Open
bsdgo opened this issue Mar 16, 2025 · 3 comments

Comments

@bsdgo
Copy link

bsdgo commented Mar 16, 2025

使用TTS默认参数, 模型使用 csukuangfj/sherpa-onnx-vits-zh-ll (Chinese, 5 speakers)

同样的 sid,自己跑生成的语音文件时长 5 秒,样例中的是 8 秒,仔细听下来在断句处都没有停顿

自己生成

官网样例

@csukuangfj
Copy link
Collaborator

请描述复现方法

@bsdgo
Copy link
Author

bsdgo commented Mar 17, 2025

请描述复现方法

使用 github.com/k2-fsa/sherpa-onnx-go 项目

main.go

package main

import (
	"context"
	"fmt"
	"sherpademo/sherpa"
	"time"
)

// modelDir is the directory prefix that main prepends to every model file path.
var modelDir = "./sherpa"

// main builds a Sherpa instance from the model files under modelDir,
// synthesizes one fixed Chinese sentence, and prints the path of the generated
// wav file, the returned error value and the elapsed time. The speech
// recognition part of the demo is kept as commented-out code.
func main() {
	// Disabled: load a test wav into memory as the recognition input.
	//testwav, err := os.Open(modelDir + "/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/0.wav")
	//if err != nil {
	//	panic(fmt.Sprintf("open wav: %s", err.Error()))
	//	return
	//}
	//defer testwav.Close()
	//bs, _ := io.ReadAll(testwav)
	//audio := &sherpa.RecognitionAudio{
	//	Buffer:    bytes.NewBuffer(bs),
	//	AudioType: "wav",
	//}

	// All model locations plus the TTS tuning knobs used by this demo.
	cfg := &sherpa.SherpaConfig{
		Decoder: modelDir + "/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg" +
			"-1.onnx",
		Encoder:         modelDir + "/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx",
		Joiner:          modelDir + "/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx",
		Tokens:          modelDir + "/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt",
		CtTransformer:   modelDir + "/sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12/model.onnx",
		VitsModel:       modelDir + "/sherpa-onnx-vits-zh-ll/model.onnx",
		VitsDictDir:     modelDir + "/sherpa-onnx-vits-zh-ll/dict",
		VitsLexicon:     modelDir + "/sherpa-onnx-vits-zh-ll/lexicon.txt",
		VitsTokens:      modelDir + "/sherpa-onnx-vits-zh-ll/tokens.txt",
		TtsRuleFsts:     modelDir + "/sherpa-onnx-vits-zh-ll/number.fst," + modelDir + "/sherpa-onnx-vits-zh-ll/new_heteronym.fst",
		VitsLengthScale: 1.0,
		Sid:             4,
		TtsNumThreads:   6,
	}
	engine := sherpa.NewSherpa(cfg)

	// Disabled: run recognition on the audio loaded above.
	//result, err := engine.HandleRecognize(context.Background(), audio)
	//if err != nil {
	//	panic(fmt.Errorf("handle recognize:%s", err.Error()))
	//}
	//println(fmt.Sprintf("result: %s", result))

	// Sentence to synthesize.
	const text = "小米的使命是,始终坚持做感动人心、价格厚道的好产品,让全球每个人都能享受科技带来的美好生活。"

	// Time only the synthesis call (model loading above is excluded).
	begin := time.Now()
	wavName, err := engine.HandleGenerateAudio(context.Background(), text)
	println(fmt.Sprintf("wav file name:%s", wavName), err)
	println(fmt.Sprintf("耗时:%v", time.Since(begin)))
	//play(wavName)
}

sherpa/sherpa.go

package sherpa

import (
	"bytes"
	"context"
	"encoding/binary"
	"fmt"
	sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
	wav "github.com/youpy/go-wav"
	"io"
	"os"
	"os/exec"
	"path"
	"strings"
)

const (
	// supportAudioType is the audio type that is handled without conversion;
	// it is also used as the file extension for temporary and generated files.
	supportAudioType = "wav"
	// CMD_AMR_TO_WAV is the ffmpeg argument template for converting an input
	// file to a 22050 Hz wav file. The first %s is the input path and the
	// second %s is the output path. Equivalent command line:
	// ffmpeg -i test.amr -y -ar 22050 test.wav
	CMD_AMR_TO_WAV = "-i %s -y -ar 22050 %s"
)

// SherpaOnnx bundles the sherpa-onnx handles created by NewSherpa together
// with the configuration they were built from.
type SherpaOnnx struct {
	// cfg is the configuration passed to NewSherpa.
	cfg               *SherpaConfig
	// recognizer is the streaming recognizer used by HandleRecognize.
	recognizer        *sherpa.OnlineRecognizer
	// tts is the offline text-to-speech engine used by HandleGenerateAudio.
	tts               *sherpa.OfflineTts
	// offlinePuncuation adds punctuation to the recognized text.
	offlinePuncuation *sherpa.OfflinePunctuation
	// ffmpeg is the command name passed to exec.Command when HandleRecognize
	// converts non-wav input.
	// NOTE(review): nothing in this file assigns this field — confirm where it
	// is expected to be set.
	ffmpeg            string
}

// SherpaConfig carries the model file locations and TTS settings that
// NewSherpa copies into the sherpa-onnx configuration structs.
type SherpaConfig struct {
	// Speech-to-text model (transducer encoder/decoder/joiner and tokens).
	Decoder string
	Encoder string
	Joiner  string
	Tokens  string

	// Punctuation model (CT-Transformer).
	CtTransformer string

	// TTS (VITS model files and synthesis settings).
	// TtsRuleFsts is passed through unchanged as the RuleFsts setting; Sid is
	// the speaker id passed to every Generate call.
	VitsModel       string
	VitsDictDir     string
	VitsLexicon     string
	VitsTokens      string
	TtsRuleFsts     string
	VitsLengthScale float32
	Sid             int
	TtsNumThreads   int
}

// RecognitionAudio is the input to HandleRecognize: the raw bytes of one audio
// file plus its type.
type RecognitionAudio struct {
	// Buffer holds the complete contents of the audio file.
	Buffer    *bytes.Buffer
	// AudioType is used as the temporary file extension; any value other than
	// "wav" triggers an ffmpeg conversion before recognition.
	AudioType string
}

// NewSherpa creates the streaming recognizer, the offline punctuation model
// and the offline VITS TTS engine described by cfg and returns them wrapped in
// a SherpaOnnx. Progress messages are written with println.
//
// The recognizer always runs greedy search on the CPU with one thread and
// 16 kHz / 80-dim features; only the model paths come from cfg. For TTS, the
// model paths, length scale, thread count and rule FSTs come from cfg; the
// remaining values are fixed here.
//
// NOTE(review): the ffmpeg field of the returned value is not populated here.
func NewSherpa(cfg *SherpaConfig) *SherpaOnnx {
	recognizerCfg := sherpa.OnlineRecognizerConfig{
		FeatConfig: sherpa.FeatureConfig{SampleRate: 16000, FeatureDim: 80},
		ModelConfig: sherpa.OnlineModelConfig{
			Transducer: sherpa.OnlineTransducerModelConfig{
				Encoder: cfg.Encoder,
				Decoder: cfg.Decoder,
				Joiner:  cfg.Joiner,
			},
			Tokens:     cfg.Tokens,
			NumThreads: 1,
			Provider:   "cpu",
		},
		DecodingMethod: "greedy_search",
		MaxActivePaths: 4,
	}
	punctCfg := sherpa.OfflinePunctuationConfig{
		Model: sherpa.OfflinePunctuationModelConfig{
			CtTransformer: cfg.CtTransformer,
		},
	}

	println("Initializing recognizer (may take several seconds)")
	recognizer := sherpa.NewOnlineRecognizer(&recognizerCfg)
	println("Recognizer created!")

	punctuation := sherpa.NewOfflinePunctuation(&punctCfg)

	println("Initializing tts (may take several seconds)")
	vitsCfg := sherpa.OfflineTtsVitsModelConfig{
		Model:       cfg.VitsModel,
		Lexicon:     cfg.VitsLexicon,
		Tokens:      cfg.VitsTokens,
		NoiseScale:  0.667,
		NoiseScaleW: 0.8,
		LengthScale: cfg.VitsLengthScale,
		DictDir:     cfg.VitsDictDir,
	}
	ttsCfg := sherpa.OfflineTtsConfig{
		Model: sherpa.OfflineTtsModelConfig{
			Vits:       vitsCfg,
			NumThreads: cfg.TtsNumThreads,
			Debug:      0,
			Provider:   "cpu",
		},
		MaxNumSentences: 1,
		// NOTE(review): SilenceScale presumably scales the pauses in the
		// synthesized audio; a value of 0.2 may be why pauses at punctuation
		// sound very short — confirm against the sherpa-onnx documentation.
		SilenceScale: 0.2,
		RuleFsts:     cfg.TtsRuleFsts,
	}
	tts := sherpa.NewOfflineTts(&ttsCfg)
	println("Tts created!")

	return &SherpaOnnx{
		cfg:               cfg,
		recognizer:        recognizer,
		tts:               tts,
		offlinePuncuation: punctuation,
	}
}

// HandleRecognize transcribes the audio held in audio.Buffer and returns the
// recognized text with punctuation added.
//
// The buffer is first written to a temporary file. When audio.AudioType is not
// "wav", that file is converted with ffmpeg (resampled to 22050 Hz) into a
// second temporary wav file. Both temporary files are closed and removed
// before the function returns.
//
// ctx is only forwarded to readWave.
//
// An error is returned when audio or its buffer is nil, when a temporary file
// cannot be created or written, when the ffmpeg conversion fails (the error
// wraps the exec error and includes ffmpeg's output), or when the wav file
// cannot be parsed.
func (s *SherpaOnnx) HandleRecognize(ctx context.Context, audio *RecognitionAudio) (textResult string, err error) {
	if audio == nil || audio.Buffer == nil {
		return "", fmt.Errorf("recognize: audio or audio buffer is nil")
	}

	// 1. Write the in-memory audio to a temporary file.
	srcFile, err := os.CreateTemp(os.TempDir(), "*."+audio.AudioType)
	if err != nil {
		return "", err
	}
	// Deferred calls run last-in first-out: the file is closed, then removed.
	// The Close error is ignored because the file is a throwaway.
	defer os.Remove(srcFile.Name())
	defer srcFile.Close()

	if _, err = io.Copy(srcFile, audio.Buffer); err != nil {
		return "", err
	}

	// 2. Convert to wav when the input is in another format.
	wavFile := srcFile
	if audio.AudioType != supportAudioType {
		tmpWav, cerr := os.CreateTemp(os.TempDir(), "*."+supportAudioType)
		if cerr != nil {
			// Return the error explicitly; a bare return here would report
			// success with an empty result.
			return "", cerr
		}
		defer os.Remove(tmpWav.Name())
		defer tmpWav.Close()

		// Fall back to looking up "ffmpeg" on PATH when no binary is configured.
		ffmpegBin := s.ffmpeg
		if ffmpegBin == "" {
			ffmpegBin = "ffmpeg"
		}
		// Same arguments as the CMD_AMR_TO_WAV template, passed as separate
		// elements so that paths containing spaces are not split apart.
		out, cerr := exec.Command(ffmpegBin,
			"-i", srcFile.Name(), "-y", "-ar", "22050", tmpWav.Name(),
		).CombinedOutput()
		if cerr != nil {
			return "", fmt.Errorf("convert %s audio to wav: %w: %s",
				audio.AudioType, cerr, strings.TrimSpace(string(out)))
		}
		wavFile = tmpWav
	}

	// io.Copy leaves the offset of srcFile at the end of the data; rewind so
	// that a sequential reader starts at the wav header.
	if _, err = wavFile.Seek(0, io.SeekStart); err != nil {
		return "", err
	}

	// 3. Run recognition.
	stream := sherpa.NewOnlineStream(s.recognizer)
	defer sherpa.DeleteOnlineStream(stream)

	samples, sampleRate, err := readWave(ctx, wavFile)
	if err != nil {
		return "", err
	}
	stream.AcceptWaveform(sampleRate, samples)

	// Append 0.3 s of zero-valued samples after the real audio.
	tailPadding := make([]float32, int(float32(sampleRate)*0.3))
	stream.AcceptWaveform(sampleRate, tailPadding)

	for s.recognizer.IsReady(stream) {
		s.recognizer.Decode(stream)
	}

	// 4. Collect the result and add punctuation.
	textResult = s.recognizer.GetResult(stream).Text
	textResult = s.offlinePuncuation.AddPunct(textResult)
	return textResult, nil
}

// readWave parses file as a wav file and returns its samples converted to
// float32 together with the sample rate from the wav header.
//
// Only PCM (audio format 1), single-channel, 16-bit files are accepted; any
// other format yields an error. Errors from reading the header or the sample
// data wrap the underlying error. ctx is currently unused.
func readWave(ctx context.Context, file *os.File) (samples []float32, sampleRate int, err error) {
	reader := wav.NewReader(file)
	format, err := reader.Format()
	if err != nil {
		return nil, 0, fmt.Errorf("failed to read wave format: %w", err)
	}

	if format.AudioFormat != 1 {
		return nil, 0, fmt.Errorf("Support only PCM format. Given: %v\n", format.AudioFormat)
	}

	if format.NumChannels != 1 {
		return nil, 0, fmt.Errorf("Support only 1 channel wave file. Given: %v\n", format.NumChannels)
	}

	if format.BitsPerSample != 16 {
		return nil, 0, fmt.Errorf("Support only 16-bit per sample. Given: %v\n", format.BitsPerSample)
	}

	// Duration is called only so that it initializes reader.Size. Its error is
	// checked because reader.Size is read immediately afterwards.
	if _, err = reader.Duration(); err != nil {
		return nil, 0, fmt.Errorf("failed to locate wave data: %w", err)
	}

	// io.ReadFull keeps reading until buf is full, so a short Read is not
	// mistaken for a truncated file.
	buf := make([]byte, reader.Size)
	n, err := io.ReadFull(reader, buf)
	if err != nil {
		return nil, 0, fmt.Errorf("Failed to read %v bytes. Returned %v bytes: %w", reader.Size, n, err)
	}

	samples, err = samplesInt16ToFloat(buf)
	if err != nil {
		return nil, 0, err
	}

	return samples, int(format.SampleRate), nil
}

// samplesInt16ToFloat converts little-endian signed 16-bit samples into
// float32 values by dividing each sample by 32768, which maps the int16 range
// onto [-1, 1).
//
// A trailing odd byte in inSamples is ignored. The returned error is always
// nil; it is kept so that existing callers continue to compile.
func samplesInt16ToFloat(inSamples []byte) ([]float32, error) {
	outSamples := make([]float32, len(inSamples)/2)

	for i := range outSamples {
		// Decode directly from the byte slice instead of creating a reader
		// and going through reflection-based binary.Read for every sample.
		s16 := int16(binary.LittleEndian.Uint16(inSamples[2*i:]))
		outSamples[i] = float32(s16) / 32768
	}

	return outSamples, nil
}

// HandleGenerateAudio synthesizes text with the configured speaker id
// (cfg.Sid), passing 1 as the last argument to Generate, and saves the audio
// as "generate_wav.wav" in the OS temp directory.
//
// It returns the path of the saved file, or an error when saving fails. Every
// call writes to the same path. ctx is currently unused.
func (s *SherpaOnnx) HandleGenerateAudio(ctx context.Context, text string) (result string, err error) {
	generated := s.tts.Generate(text, s.cfg.Sid, 1)

	outPath := path.Join(os.TempDir(), "generate_wav."+supportAudioType)
	if saved := generated.Save(outPath); !saved {
		return "", fmt.Errorf("save audio failed")
	}
	return outPath, nil
}


@csukuangfj
Copy link
Collaborator

你直接用我们提供的代码例子,能否复现?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants