
Commit 4b3bc31

Author: hertz-pj
Support Japanese TTS
1 parent 34358d8 commit 4b3bc31

File tree: 8 files changed, +367 -4 lines


.gitignore

+2
@@ -42,3 +42,5 @@ dump_ljspeech/
 dump_kss/
 dump_libritts/
 /notebooks/test_saved/
+build/
+dist/
@@ -0,0 +1,81 @@
+# This is the hyperparameter configuration file for FastSpeech2 v2.
+# The difference between v2 and v1 is that v2 applies the Linformer technique.
+# Please make sure this is adjusted for the JSUT dataset. If you want to
+# apply it to another dataset, you might need to carefully change some parameters.
+# This configuration runs 200k iters, but the best checkpoint is around 150k iters.
+
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+hop_size: 300            # Hop size.
+format: "npy"
+
+
+###########################################################
+#              NETWORK ARCHITECTURE SETTING               #
+###########################################################
+model_type: "fastspeech2"
+
+fastspeech2_params:
+  dataset: jsut
+  n_speakers: 1
+  encoder_hidden_size: 256
+  encoder_num_hidden_layers: 3
+  encoder_num_attention_heads: 2
+  encoder_attention_head_size: 16  # in v1, = 384//2
+  encoder_intermediate_size: 1024
+  encoder_intermediate_kernel_size: 3
+  encoder_hidden_act: "mish"
+  decoder_hidden_size: 256
+  decoder_num_hidden_layers: 3
+  decoder_num_attention_heads: 2
+  decoder_attention_head_size: 16  # in v1, = 384//2
+  decoder_intermediate_size: 1024
+  decoder_intermediate_kernel_size: 3
+  decoder_hidden_act: "mish"
+  variant_prediction_num_conv_layers: 2
+  variant_predictor_filter: 256
+  variant_predictor_kernel_size: 3
+  variant_predictor_dropout_rate: 0.5
+  num_mels: 80
+  hidden_dropout_prob: 0.2
+  attention_probs_dropout_prob: 0.1
+  max_position_embeddings: 2048
+  initializer_range: 0.02
+  output_attentions: False
+  output_hidden_states: False
+
+###########################################################
+#                   DATA LOADER SETTING                   #
+###########################################################
+batch_size: 16              # Batch size for each GPU, assuming gradient_accumulation_steps == 1.
+remove_short_samples: true  # Whether to remove samples whose length is less than batch_max_steps.
+allow_cache: true           # Whether to cache the dataset. If true, it requires CPU memory.
+mel_length_threshold: 32    # Remove all targets with mel_length <= 32.
+is_shuffle: true            # Shuffle the dataset after each epoch.
+###########################################################
+#             OPTIMIZER & SCHEDULER SETTING               #
+###########################################################
+optimizer_params:
+  initial_learning_rate: 0.001
+  end_learning_rate: 0.00005
+  decay_steps: 150000       # < train_max_steps is recommended.
+  warmup_proportion: 0.02
+  weight_decay: 0.001
+
+gradient_accumulation_steps: 1
+var_train_expr: null  # Trainable variable expression (e.g. 'embeddings|encoder|decoder'),
+                      # separated by |. If var_train_expr is null,
+                      # all variables are trained.
+###########################################################
+#                     INTERVAL SETTING                    #
+###########################################################
+train_max_steps: 200000     # Number of training steps.
+save_interval_steps: 5000   # Interval steps to save checkpoints.
+eval_interval_steps: 500    # Interval steps to evaluate the network.
+log_interval_steps: 200     # Interval steps to record the training log.
+delay_f0_energy_steps: 3    # 2 steps use LR outputs only, then 1 step uses LR + F0 + Energy.
+###########################################################
+#                       OTHER SETTING                     #
+###########################################################
+num_save_intermediate_results: 1  # Number of batches to be saved as intermediate results.
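For orientation, here is a minimal sketch of how a config like the one above is typically consumed in TensorFlowTTS: the YAML is loaded and the fastspeech2_params block is unpacked into a model configuration. The file path below is a placeholder (the new config's location is not captured in this view), and the exact constructor usage should be checked against the repository's FastSpeech2 training example.

import yaml

from tensorflow_tts.configs import FastSpeech2Config
from tensorflow_tts.models import TFFastSpeech2

# Placeholder path: the actual location of the new JSUT config is not shown here.
with open("path/to/fastspeech2.jsut.v2.yaml") as f:
    config = yaml.load(f, Loader=yaml.SafeLoader)

# Unpack the fastspeech2_params block above into a model configuration.
fastspeech2_config = FastSpeech2Config(**config["fastspeech2_params"])
model = TFFastSpeech2(config=fastspeech2_config)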
@@ -0,0 +1,86 @@
+# This is the hyperparameter configuration file for Tacotron2 v1.
+# Please make sure this is adjusted for the JSUT dataset. If you want to
+# apply it to another dataset, you might need to carefully change some parameters.
+# This configuration runs 200k iters, but 65k iters is enough to get a good model.
+
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+hop_size: 300            # Hop size.
+format: "npy"
+
+
+###########################################################
+#              NETWORK ARCHITECTURE SETTING               #
+###########################################################
+model_type: "tacotron2"
+
+tacotron2_params:
+  dataset: jsut
+  embedding_hidden_size: 512
+  initializer_range: 0.5
+  embedding_dropout_prob: 0.1
+  n_speakers: 1
+  n_conv_encoder: 5
+  encoder_conv_filters: 512
+  encoder_conv_kernel_sizes: 5
+  encoder_conv_activation: 'relu'
+  encoder_conv_dropout_rate: 0.5
+  encoder_lstm_units: 256
+  n_prenet_layers: 2
+  prenet_units: 256
+  prenet_activation: 'relu'
+  prenet_dropout_rate: 0.5
+  n_lstm_decoder: 1
+  reduction_factor: 2
+  decoder_lstm_units: 1024
+  attention_dim: 128
+  attention_filters: 32
+  attention_kernel: 31
+  n_mels: 80
+  n_conv_postnet: 5
+  postnet_conv_filters: 512
+  postnet_conv_kernel_sizes: 5
+  postnet_dropout_rate: 0.1
+  attention_type: "lsa"
+
+###########################################################
+#                   DATA LOADER SETTING                   #
+###########################################################
+batch_size: 32              # Batch size for each GPU, assuming gradient_accumulation_steps == 1.
+remove_short_samples: true  # Whether to remove samples whose length is less than batch_max_steps.
+allow_cache: true           # Whether to cache the dataset. If true, it requires CPU memory.
+mel_length_threshold: 32    # Remove all targets with mel_length <= 32.
+is_shuffle: true            # Shuffle the dataset after each epoch.
+use_fixed_shapes: true      # Use fixed shapes for training (2x speed-up);
+                            # see https://github.com/tensorspeech/TensorflowTTS/issues/34#issuecomment-642309118
+
+###########################################################
+#             OPTIMIZER & SCHEDULER SETTING               #
+###########################################################
+optimizer_params:
+  initial_learning_rate: 0.001
+  end_learning_rate: 0.00001
+  decay_steps: 150000       # < train_max_steps is recommended.
+  warmup_proportion: 0.02
+  weight_decay: 0.001
+
+gradient_accumulation_steps: 1
+var_train_expr: null  # Trainable variable expression (e.g. 'embeddings|decoder_cell'),
+                      # separated by |. If var_train_expr is null,
+                      # all variables are trained.
+###########################################################
+#                     INTERVAL SETTING                    #
+###########################################################
+train_max_steps: 200000                # Number of training steps.
+save_interval_steps: 5000              # Interval steps to save checkpoints.
+eval_interval_steps: 500               # Interval steps to evaluate the network.
+log_interval_steps: 100                # Interval steps to record the training log.
+start_schedule_teacher_forcing: 200001 # No need to apply scheduled teacher forcing (starts after train_max_steps).
+start_ratio_value: 0.5                 # Start ratio of scheduled teacher forcing.
+schedule_decay_steps: 50000            # Decay steps of scheduled teacher forcing.
+end_ratio_value: 0.0                   # End ratio of scheduled teacher forcing.
+###########################################################
+#                       OTHER SETTING                     #
+###########################################################
+num_save_intermediate_results: 1  # Number of results to be saved as intermediate results.
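One detail worth noting: start_schedule_teacher_forcing (200001) is greater than train_max_steps (200000), so scheduled teacher forcing never activates in this run, as the inline comment indicates. Below is a rough, illustrative reading of the schedule parameters as a linear decay; it is an assumption for clarity, not the repository's trainer code.

def teacher_forcing_ratio(step,
                          start_step=200001,   # start_schedule_teacher_forcing
                          start_ratio=0.5,     # start_ratio_value
                          end_ratio=0.0,       # end_ratio_value
                          decay_steps=50000):  # schedule_decay_steps
    """Assumed linear decay of the teacher-forcing ratio (illustrative only)."""
    if step < start_step:
        return 1.0  # before the schedule starts: full teacher forcing (assumed)
    progress = min((step - start_step) / decay_steps, 1.0)
    return start_ratio + (end_ratio - start_ratio) * progress

# With train_max_steps = 200000 < start_schedule_teacher_forcing = 200001,
# every step of this run stays in the pre-schedule branch.
assert teacher_forcing_ratio(200000) == 1.0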

preprocess/jsut_preprocess.yaml

+19
@@ -0,0 +1,19 @@
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+sampling_rate: 24000     # Sampling rate.
+fft_size: 2048           # FFT size.
+hop_size: 300            # Hop size (fixed value, don't change).
+win_length: 1200         # Window length.
+                         # If set to null, it will be the same as fft_size.
+window: "hann"           # Window function.
+num_mels: 80             # Number of mel basis.
+fmin: 80                 # Minimum frequency in mel basis calculation.
+fmax: 7600               # Maximum frequency in mel basis calculation.
+global_gain_scale: 1.0   # Will be multiplied with the entire waveform.
+trim_silence: true       # Whether to trim silence at the start and end.
+trim_threshold_in_db: 60 # Needs careful tuning if the recording quality is poor.
+trim_frame_size: 2048    # Frame size in trimming.
+trim_hop_size: 512       # Hop size in trimming.
+format: "npy"            # Feature file format. Only "npy" is supported.
+
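As a quick sanity check on these numbers: at a 24 kHz sampling rate, a hop of 300 samples yields 80 mel frames per second (a 12.5 ms hop), and the 1200-sample window spans 50 ms.

# Plain arithmetic on the feature-extraction settings above (no repository code).
sampling_rate = 24000  # Hz
hop_size = 300         # samples between successive frames
win_length = 1200      # analysis window length in samples

frames_per_second = sampling_rate / hop_size   # 80.0 mel frames per second
hop_ms = 1000 * hop_size / sampling_rate       # 12.5 ms between frames
window_ms = 1000 * win_length / sampling_rate  # 50.0 ms analysis window
print(frames_per_second, hop_ms, window_ms)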

setup.py

+1 -1
@@ -22,7 +22,7 @@
 # TODO(@dathudeptrai) update requirement if needed.
 requirements = {
     "install": [
-        "tensorflow-gpu==2.6.0",
+        # "tensorflow-gpu==2.6.0",
         "tensorflow-addons>=0.10.0",
         "setuptools>=38.5.1",
         "huggingface_hub==0.0.8",

tensorflow_tts/bin/preprocess.py

+12 -3
@@ -37,13 +37,15 @@
 from tensorflow_tts.processor import ThorstenProcessor
 from tensorflow_tts.processor import LJSpeechUltimateProcessor
 from tensorflow_tts.processor import SynpaflexProcessor
+from tensorflow_tts.processor import JSUTProcessor
 from tensorflow_tts.processor.ljspeech import LJSPEECH_SYMBOLS
 from tensorflow_tts.processor.baker import BAKER_SYMBOLS
 from tensorflow_tts.processor.kss import KSS_SYMBOLS
 from tensorflow_tts.processor.libritts import LIBRITTS_SYMBOLS
 from tensorflow_tts.processor.thorsten import THORSTEN_SYMBOLS
 from tensorflow_tts.processor.ljspeechu import LJSPEECH_U_SYMBOLS
 from tensorflow_tts.processor.synpaflex import SYNPAFLEX_SYMBOLS
+from tensorflow_tts.processor.jsut import JSUT_SYMBOLS
 
 from tensorflow_tts.utils import remove_outlier
 
@@ -74,7 +76,7 @@ def parse_and_config():
         "--dataset",
         type=str,
         default="ljspeech",
-        choices=["ljspeech", "kss", "libritts", "baker", "thorsten", "ljspeechu", "synpaflex"],
+        choices=["ljspeech", "kss", "libritts", "baker", "thorsten", "ljspeechu", "synpaflex", "jsut"],
         help="Dataset to preprocess.",
     )
     parser.add_argument(
@@ -355,8 +357,9 @@ def preprocess():
         "libritts": LibriTTSProcessor,
         "baker": BakerProcessor,
         "thorsten": ThorstenProcessor,
-        "ljspeechu" : LJSpeechUltimateProcessor,
+        "ljspeechu": LJSpeechUltimateProcessor,
         "synpaflex": SynpaflexProcessor,
+        "jsut": JSUTProcessor,
     }
 
     dataset_symbol = {
@@ -367,6 +370,7 @@ def preprocess():
         "thorsten": THORSTEN_SYMBOLS,
         "ljspeechu": LJSPEECH_U_SYMBOLS,
         "synpaflex": SYNPAFLEX_SYMBOLS,
+        "jsut": JSUT_SYMBOLS,
     }
 
     dataset_cleaner = {
@@ -377,6 +381,7 @@ def preprocess():
         "thorsten": "german_cleaners",
         "ljspeechu": "english_cleaners",
         "synpaflex": "basic_cleaners",
+        "jsut": None,
     }
 
     logging.info(f"Selected '{config['dataset']}' processor.")
@@ -576,4 +581,8 @@ def compute_statistics():
     # save statistics to file
     logging.info("Saving computed statistics.")
     scaler_list = [(scaler_mel, ""), (scaler_energy, "_energy"), (scaler_f0, "_f0")]
-    save_statistics_to_file(scaler_list, config)
+    save_statistics_to_file(scaler_list, config)
+
+
+if __name__ == "__main__":
+    preprocess()
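The changes above follow the script's existing per-dataset dispatch: the --dataset flag selects a processor class, a symbol table, and a text cleaner by name, and "jsut" is added to each mapping (with no cleaner registered, so text handling is presumably left to JSUTProcessor itself). The new __main__ guard also allows running the module directly with python in addition to the packaged console entry point. A condensed sketch of that dispatch, using only names that appear in this diff:

# Condensed sketch of the dataset dispatch extended by this commit; it mirrors
# the mappings in tensorflow_tts/bin/preprocess.py, keeping only the new entry.
from tensorflow_tts.processor import JSUTProcessor
from tensorflow_tts.processor.jsut import JSUT_SYMBOLS

dataset_processor = {"jsut": JSUTProcessor}
dataset_symbol = {"jsut": JSUT_SYMBOLS}
dataset_cleaner = {"jsut": None}  # no text cleaner registered for JSUT

def select(dataset_name):
    """Return the (processor class, symbols, cleaner name) for --dataset."""
    return (
        dataset_processor[dataset_name],
        dataset_symbol[dataset_name],
        dataset_cleaner[dataset_name],
    )

processor_cls, symbols, cleaner = select("jsut")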

tensorflow_tts/processor/__init__.py

+1
@@ -7,3 +7,4 @@
 from tensorflow_tts.processor.thorsten import ThorstenProcessor
 from tensorflow_tts.processor.ljspeechu import LJSpeechUltimateProcessor
 from tensorflow_tts.processor.synpaflex import SynpaflexProcessor
+from tensorflow_tts.processor.jsut import JSUTProcessor
