
Commit 4b3bc31

Author: hertz-pj
Support Japanese TTS
1 parent 34358d8 commit 4b3bc31

File tree: 8 files changed, +367 -4 lines


.gitignore

+2
@@ -42,3 +42,5 @@ dump_ljspeech/
 dump_kss/
 dump_libritts/
 /notebooks/test_saved/
+build/
+dist/
@@ -0,0 +1,81 @@
+# This is the hyperparameter configuration file for FastSpeech2 v2.
+# The difference between v2 and v1 is that v2 applies the Linformer technique.
+# Please make sure this is adjusted for the JSUT dataset. If you want to
+# apply it to another dataset, you might need to carefully change some parameters.
+# This configuration runs 200k iters, but the best checkpoint is around 150k iters.
+
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+hop_size: 300            # Hop size.
+format: "npy"
+
+
+###########################################################
+#              NETWORK ARCHITECTURE SETTING               #
+###########################################################
+model_type: "fastspeech2"
+
+fastspeech2_params:
+  dataset: jsut
+  n_speakers: 1
+  encoder_hidden_size: 256
+  encoder_num_hidden_layers: 3
+  encoder_num_attention_heads: 2
+  encoder_attention_head_size: 16  # in v1, = 384//2
+  encoder_intermediate_size: 1024
+  encoder_intermediate_kernel_size: 3
+  encoder_hidden_act: "mish"
+  decoder_hidden_size: 256
+  decoder_num_hidden_layers: 3
+  decoder_num_attention_heads: 2
+  decoder_attention_head_size: 16  # in v1, = 384//2
+  decoder_intermediate_size: 1024
+  decoder_intermediate_kernel_size: 3
+  decoder_hidden_act: "mish"
+  variant_prediction_num_conv_layers: 2
+  variant_predictor_filter: 256
+  variant_predictor_kernel_size: 3
+  variant_predictor_dropout_rate: 0.5
+  num_mels: 80
+  hidden_dropout_prob: 0.2
+  attention_probs_dropout_prob: 0.1
+  max_position_embeddings: 2048
+  initializer_range: 0.02
+  output_attentions: False
+  output_hidden_states: False
+
+###########################################################
+#                   DATA LOADER SETTING                   #
+###########################################################
+batch_size: 16              # Batch size for each GPU, assuming gradient_accumulation_steps == 1.
+remove_short_samples: true  # Whether to remove samples whose length is less than batch_max_steps.
+allow_cache: true           # Whether to cache the dataset. If true, it requires CPU memory.
+mel_length_threshold: 32    # Remove all targets with mel_length <= 32.
+is_shuffle: true            # Shuffle the dataset after each epoch.
+###########################################################
+#             OPTIMIZER & SCHEDULER SETTING               #
+###########################################################
+optimizer_params:
+  initial_learning_rate: 0.001
+  end_learning_rate: 0.00005
+  decay_steps: 150000       # < train_max_steps is recommended.
+  warmup_proportion: 0.02
+  weight_decay: 0.001
+
+gradient_accumulation_steps: 1
+var_train_expr: null  # Trainable variable expression (e.g. 'embeddings|encoder|decoder'),
+                      # separated by |. If var_train_expr is null,
+                      # all variables are trained.
+###########################################################
+#                     INTERVAL SETTING                    #
+###########################################################
+train_max_steps: 200000     # Number of training steps.
+save_interval_steps: 5000   # Interval steps to save checkpoints.
+eval_interval_steps: 500    # Interval steps to evaluate the network.
+log_interval_steps: 200     # Interval steps to record the training log.
+delay_f0_energy_steps: 3    # 2 steps use LR outputs only, then 1 step uses LR + F0 + Energy.
+###########################################################
+#                       OTHER SETTING                     #
+###########################################################
+num_save_intermediate_results: 1  # Number of batches to be saved as intermediate results.
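For orientation, here is a minimal sketch of how a config like the one above is typically consumed in TensorFlowTTS: the YAML is loaded and the fastspeech2_params block is unpacked into a model configuration. The file path below is a placeholder (the new config's location is not captured in this view), and the exact constructor usage should be checked against the repository's FastSpeech2 training example.

import yaml

from tensorflow_tts.configs import FastSpeech2Config
from tensorflow_tts.models import TFFastSpeech2

# Placeholder path: the actual location of the new JSUT config is not shown here.
with open("path/to/fastspeech2.jsut.v2.yaml") as f:
    config = yaml.load(f, Loader=yaml.SafeLoader)

# Unpack the fastspeech2_params block above into a model configuration.
fastspeech2_config = FastSpeech2Config(**config["fastspeech2_params"])
model = TFFastSpeech2(config=fastspeech2_config)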
@@ -0,0 +1,86 @@
+# This is the hyperparameter configuration file for Tacotron2 v1.
+# Please make sure this is adjusted for the JSUT dataset. If you want to
+# apply it to another dataset, you might need to carefully change some parameters.
+# This configuration runs 200k iters, but 65k iters is enough to get a good model.
+
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+hop_size: 300            # Hop size.
+format: "npy"
+
+
+###########################################################
+#              NETWORK ARCHITECTURE SETTING               #
+###########################################################
+model_type: "tacotron2"
+
+tacotron2_params:
+  dataset: jsut
+  embedding_hidden_size: 512
+  initializer_range: 0.5
+  embedding_dropout_prob: 0.1
+  n_speakers: 1
+  n_conv_encoder: 5
+  encoder_conv_filters: 512
+  encoder_conv_kernel_sizes: 5
+  encoder_conv_activation: 'relu'
+  encoder_conv_dropout_rate: 0.5
+  encoder_lstm_units: 256
+  n_prenet_layers: 2
+  prenet_units: 256
+  prenet_activation: 'relu'
+  prenet_dropout_rate: 0.5
+  n_lstm_decoder: 1
+  reduction_factor: 2
+  decoder_lstm_units: 1024
+  attention_dim: 128
+  attention_filters: 32
+  attention_kernel: 31
+  n_mels: 80
+  n_conv_postnet: 5
+  postnet_conv_filters: 512
+  postnet_conv_kernel_sizes: 5
+  postnet_dropout_rate: 0.1
+  attention_type: "lsa"
+
+###########################################################
+#                   DATA LOADER SETTING                   #
+###########################################################
+batch_size: 32              # Batch size for each GPU, assuming gradient_accumulation_steps == 1.
+remove_short_samples: true  # Whether to remove samples whose length is less than batch_max_steps.
+allow_cache: true           # Whether to cache the dataset. If true, it requires CPU memory.
+mel_length_threshold: 32    # Remove all targets with mel_length <= 32.
+is_shuffle: true            # Shuffle the dataset after each epoch.
+use_fixed_shapes: true      # Use fixed shapes for training (2x speed-up);
+                            # see https://github.com/tensorspeech/TensorflowTTS/issues/34#issuecomment-642309118
+
+###########################################################
+#             OPTIMIZER & SCHEDULER SETTING               #
+###########################################################
+optimizer_params:
+  initial_learning_rate: 0.001
+  end_learning_rate: 0.00001
+  decay_steps: 150000       # < train_max_steps is recommended.
+  warmup_proportion: 0.02
+  weight_decay: 0.001
+
+gradient_accumulation_steps: 1
+var_train_expr: null  # Trainable variable expression (e.g. 'embeddings|decoder_cell'),
+                      # separated by |. If var_train_expr is null,
+                      # all variables are trained.
+###########################################################
+#                     INTERVAL SETTING                    #
+###########################################################
+train_max_steps: 200000                # Number of training steps.
+save_interval_steps: 5000              # Interval steps to save checkpoints.
+eval_interval_steps: 500               # Interval steps to evaluate the network.
+log_interval_steps: 100                # Interval steps to record the training log.
+start_schedule_teacher_forcing: 200001 # No need to apply scheduled teacher forcing (starts after train_max_steps).
+start_ratio_value: 0.5                 # Start ratio of scheduled teacher forcing.
+schedule_decay_steps: 50000            # Decay steps of scheduled teacher forcing.
+end_ratio_value: 0.0                   # End ratio of scheduled teacher forcing.
+###########################################################
+#                       OTHER SETTING                     #
+###########################################################
+num_save_intermediate_results: 1  # Number of results to be saved as intermediate results.
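One detail worth noting: start_schedule_teacher_forcing (200001) is greater than train_max_steps (200000), so scheduled teacher forcing never activates in this run, as the inline comment indicates. Below is a rough, illustrative reading of the schedule parameters as a linear decay; it is an assumption for clarity, not the repository's trainer code.

def teacher_forcing_ratio(step,
                          start_step=200001,   # start_schedule_teacher_forcing
                          start_ratio=0.5,     # start_ratio_value
                          end_ratio=0.0,       # end_ratio_value
                          decay_steps=50000):  # schedule_decay_steps
    """Assumed linear decay of the teacher-forcing ratio (illustrative only)."""
    if step < start_step:
        return 1.0  # before the schedule starts: full teacher forcing (assumed)
    progress = min((step - start_step) / decay_steps, 1.0)
    return start_ratio + (end_ratio - start_ratio) * progress

# With train_max_steps = 200000 < start_schedule_teacher_forcing = 200001,
# every step of this run stays in the pre-schedule branch.
assert teacher_forcing_ratio(200000) == 1.0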

preprocess/jsut_preprocess.yaml

+19
@@ -0,0 +1,19 @@
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+sampling_rate: 24000     # Sampling rate.
+fft_size: 2048           # FFT size.
+hop_size: 300            # Hop size (fixed value, don't change).
+win_length: 1200         # Window length.
+                         # If set to null, it will be the same as fft_size.
+window: "hann"           # Window function.
+num_mels: 80             # Number of mel basis.
+fmin: 80                 # Minimum frequency in mel basis calculation.
+fmax: 7600               # Maximum frequency in mel basis calculation.
+global_gain_scale: 1.0   # Will be multiplied with the entire waveform.
+trim_silence: true       # Whether to trim silence at the start and end.
+trim_threshold_in_db: 60 # Needs careful tuning if the recording quality is poor.
+trim_frame_size: 2048    # Frame size in trimming.
+trim_hop_size: 512       # Hop size in trimming.
+format: "npy"            # Feature file format. Only "npy" is supported.
+
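As a quick sanity check on these numbers: at a 24 kHz sampling rate, a hop of 300 samples yields 80 mel frames per second (a 12.5 ms hop), and the 1200-sample window spans 50 ms.

# Plain arithmetic on the feature-extraction settings above (no repository code).
sampling_rate = 24000  # Hz
hop_size = 300         # samples between successive frames
win_length = 1200      # analysis window length in samples

frames_per_second = sampling_rate / hop_size   # 80.0 mel frames per second
hop_ms = 1000 * hop_size / sampling_rate       # 12.5 ms between frames
window_ms = 1000 * win_length / sampling_rate  # 50.0 ms analysis window
print(frames_per_second, hop_ms, window_ms)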

setup.py

+1 -1
@@ -22,7 +22,7 @@
 # TODO(@dathudeptrai) update requirement if needed.
 requirements = {
     "install": [
-        "tensorflow-gpu==2.6.0",
+        # "tensorflow-gpu==2.6.0",
         "tensorflow-addons>=0.10.0",
         "setuptools>=38.5.1",
         "huggingface_hub==0.0.8",

tensorflow_tts/bin/preprocess.py

+12 -3
@@ -37,13 +37,15 @@
 from tensorflow_tts.processor import ThorstenProcessor
 from tensorflow_tts.processor import LJSpeechUltimateProcessor
 from tensorflow_tts.processor import SynpaflexProcessor
+from tensorflow_tts.processor import JSUTProcessor
 from tensorflow_tts.processor.ljspeech import LJSPEECH_SYMBOLS
 from tensorflow_tts.processor.baker import BAKER_SYMBOLS
 from tensorflow_tts.processor.kss import KSS_SYMBOLS
 from tensorflow_tts.processor.libritts import LIBRITTS_SYMBOLS
 from tensorflow_tts.processor.thorsten import THORSTEN_SYMBOLS
 from tensorflow_tts.processor.ljspeechu import LJSPEECH_U_SYMBOLS
 from tensorflow_tts.processor.synpaflex import SYNPAFLEX_SYMBOLS
+from tensorflow_tts.processor.jsut import JSUT_SYMBOLS
 
 from tensorflow_tts.utils import remove_outlier
 
@@ -74,7 +76,7 @@ def parse_and_config():
         "--dataset",
         type=str,
         default="ljspeech",
-        choices=["ljspeech", "kss", "libritts", "baker", "thorsten", "ljspeechu", "synpaflex"],
+        choices=["ljspeech", "kss", "libritts", "baker", "thorsten", "ljspeechu", "synpaflex", "jsut"],
         help="Dataset to preprocess.",
     )
     parser.add_argument(
@@ -355,8 +357,9 @@ def preprocess():
         "libritts": LibriTTSProcessor,
         "baker": BakerProcessor,
         "thorsten": ThorstenProcessor,
-        "ljspeechu" : LJSpeechUltimateProcessor,
+        "ljspeechu": LJSpeechUltimateProcessor,
         "synpaflex": SynpaflexProcessor,
+        "jsut": JSUTProcessor,
     }
 
     dataset_symbol = {
@@ -367,6 +370,7 @@ def preprocess():
         "thorsten": THORSTEN_SYMBOLS,
         "ljspeechu": LJSPEECH_U_SYMBOLS,
         "synpaflex": SYNPAFLEX_SYMBOLS,
+        "jsut": JSUT_SYMBOLS,
     }
 
     dataset_cleaner = {
@@ -377,6 +381,7 @@ def preprocess():
         "thorsten": "german_cleaners",
         "ljspeechu": "english_cleaners",
         "synpaflex": "basic_cleaners",
+        "jsut": None,
     }
 
     logging.info(f"Selected '{config['dataset']}' processor.")
@@ -576,4 +581,8 @@ def compute_statistics():
     # save statistics to file
     logging.info("Saving computed statistics.")
     scaler_list = [(scaler_mel, ""), (scaler_energy, "_energy"), (scaler_f0, "_f0")]
-    save_statistics_to_file(scaler_list, config)
+    save_statistics_to_file(scaler_list, config)
+
+
+if __name__ == "__main__":
+    preprocess()
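The changes above follow the script's existing per-dataset dispatch: the --dataset flag selects a processor class, a symbol table, and a text cleaner by name, and "jsut" is added to each mapping (with no cleaner registered, so text handling is presumably left to JSUTProcessor itself). The new __main__ guard also allows running the module directly with python in addition to the packaged console entry point. A condensed sketch of that dispatch, using only names that appear in this diff:

# Condensed sketch of the dataset dispatch extended by this commit; it mirrors
# the mappings in tensorflow_tts/bin/preprocess.py, keeping only the new entry.
from tensorflow_tts.processor import JSUTProcessor
from tensorflow_tts.processor.jsut import JSUT_SYMBOLS

dataset_processor = {"jsut": JSUTProcessor}
dataset_symbol = {"jsut": JSUT_SYMBOLS}
dataset_cleaner = {"jsut": None}  # no text cleaner registered for JSUT

def select(dataset_name):
    """Return the (processor class, symbols, cleaner name) for --dataset."""
    return (
        dataset_processor[dataset_name],
        dataset_symbol[dataset_name],
        dataset_cleaner[dataset_name],
    )

processor_cls, symbols, cleaner = select("jsut")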

tensorflow_tts/processor/__init__.py

+1
@@ -7,3 +7,4 @@
 from tensorflow_tts.processor.thorsten import ThorstenProcessor
 from tensorflow_tts.processor.ljspeechu import LJSpeechUltimateProcessor
 from tensorflow_tts.processor.synpaflex import SynpaflexProcessor
+from tensorflow_tts.processor.jsut import JSUTProcessor
