-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
95 lines (74 loc) · 3.92 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os
from TTS.api import TTS
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from src.custom_tts import CustomTTS
from src.thirdparty.whisper_client import FasterWhisperClient
from src.thirdparty.coqui_imported import main as coqui_main
from src.diff_ljspeech_metadata_csv import main as diff_ljspeech_main
# Project root, resolved from the process's current working directory
# (so the script must be launched from the repository root).
ROOT_DIR = os.path.abspath(os.curdir)
# Default speaker/dataset directory name under lib/assets/training_data.
speaker="combined-portal2-wheatley"
def main():
    """Synthesize a sample sentence in the configured speaker's voice.

    Collects the speaker's reference wavs and feeds a fixed test sentence
    through CustomTTS using the local XTTSv2 checkpoint in ``model/xttsv2``.
    """
    # Earlier pipeline stages (dataset diffing, resampling, transcription),
    # disabled while iterating on inference:
    # diff_ljspeech_main(f"{ROOT_DIR}/lib/assets/training_data/single_channel_wavs")
    # coqui_main(ROOT_DIR, "single_channel_wavs", "Wheatley_bw_a4_2nd_first_test_solve02")
    # print("instantiating whisper client")
    # whisper_client = FasterWhisperClient()
    # print("instantiated whisper client")
    # whisper_client.transcribe_directory(f"{ROOT_DIR}/out/single_channel_wavs")
    # print("transcribed directory")
    speaker_wavs = get_speaker_wavs(speaker, ROOT_DIR)
    # audacity_client = Audacity()
    # audacity.label_sounds(audacity_client, speaker_wavs[0], ROOT_DIR)
    # train_model(speaker)
    # print(speaker_wavs)
    # generate_tts("On dark and lonely nights, George Bush is want to stare longingly into the moon while wearing his custom made fur-suit. The monster inside of him howls!?!?!? RAWWWRRR!!!! His little toes are cold in the snow of the first summer frost. He desperately seeks to find the one piece.", speaker_wavs)
    model_dir = "model/xttsv2"
    # NOTE(review): only the first reference wav is passed to CustomTTS;
    # confirm whether multi-reference cloning is intended here.
    custom_tts = CustomTTS(f"{model_dir}/config.json", model_dir, speaker_wavs[0])
    custom_tts.text_to_speech("On dark and lonely nights, George Bush is want to stare longingly into the moon while wearing his custom made fur-suit. The monster inside of him howls!?!?!? RAWWWRRR!!!! His little toes are cold in the snow of the first summer frost. He desperately seeks to find the one piece.")
# TODO: this doesn't do anything right now
def train_model(speaker):
    """Load the LJSpeech-style training/eval sample split for *speaker*.

    NOTE(review): no trainer is wired up yet — this only builds the dataset
    config, loads the samples via the custom ``formatter``, and prints them.
    """
    print("Training model...\n")
    dataset_path = f"{ROOT_DIR}/lib/assets/training_data/{speaker}"
    # Both splits point at the same metadata file; eval_split_size carves the
    # validation portion (~1/14) out of it.
    dataset_config = BaseDatasetConfig(
        formatter="vctk",
        meta_file_train="metadata.txt",
        language="en-us",
        path=dataset_path,
        meta_file_val="metadata.txt",
    )
    train_samples, eval_samples = load_tts_samples(
        dataset_config,
        eval_split=True,
        formatter=formatter,
        eval_split_size=0.071428571428571,
    )
    print(train_samples, eval_samples)
# get speaker wav files from an ljspeech structured directory
def get_speaker_wavs(speaker, root_path):
    """Collect reference wav paths for *speaker*.

    Recursively walks ``{root_path}/lib/assets/training_data/{speaker}`` and
    returns every file except the ``metadata.txt`` transcript.

    Args:
        speaker: name of the speaker subdirectory.
        root_path: project root containing ``lib/assets/training_data``.

    Returns:
        Sorted list of file paths. Sorting makes the result deterministic —
        ``os.walk`` order is filesystem-dependent, and callers use the first
        entry as the voice-cloning reference.
    """
    training_dir = f"{root_path}/lib/assets/training_data/{speaker}"
    speaker_wavs = [
        f"{dir_path}/{file_name}"
        for dir_path, _dir_names, file_names in os.walk(training_dir)
        for file_name in file_names
        if file_name != "metadata.txt"
    ]
    return sorted(speaker_wavs)
def generate_tts(text, speaker_wavs):
    """Clone a voice from *speaker_wavs* and synthesize *text* to output.wav.

    Uses the stock multilingual XTTS v2 model on GPU; the result is written
    to ``output.wav`` in the current working directory.
    """
    xtts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
    # Voice cloning with default synthesis settings.
    xtts.tts_to_file(
        text=text,
        file_path="output.wav",
        speaker_wav=speaker_wavs,
        language="en",
    )
# custom formatter implementation
def formatter(root_path, manifest_file, **kwargs): # pylint: disable=unused-argument
"""Assumes each line as ```<filename>|<transcription>```
"""
txt_file = os.path.join(root_path, manifest_file)
items = []
speaker_name = speaker
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
cols = line.split("|")
wav_file = os.path.join(root_path, "wavs", cols[-1])
text = cols[0]
items.append({"text":text, "audio_file":wav_file, "speaker_name":speaker_name, "root_path": root_path})
return items
# Script entry point: run the synthesis pipeline when executed directly.
if __name__ == "__main__":
    main()