TensorSpeech
diff --git a/‎README.md
+1-2 b/‎README.md
+1-2
diff --git a/‎examples/fastspeech2/extractfs_postnets.py
+162 b/‎examples/fastspeech2/extractfs_postnets.py
+162
diff --git a/‎examples/multiband_melgan_hf/README.md
+94 b/‎examples/multiband_melgan_hf/README.md
+94
diff --git a/‎examples/multiband_pwgan/conf/multiband_pwgan.v1.yaml renamed to ‎examples/multiband_melgan_hf/conf/multiband_melgan_hf.lju.v1.yml
+40-20 b/‎examples/multiband_pwgan/conf/multiband_pwgan.v1.yaml renamed to ‎examples/multiband_melgan_hf/conf/multiband_melgan_hf.lju.v1.yml
+40-20
@@ -28,7 +28,6 @@
 - 2020/11/24 Add HiFi-GAN vocoder. See [here](https://github.com/TensorSpeech/TensorFlowTTS/tree/master/examples/hifigan)
 - 2020/11/19 Add Multi-GPU gradient accumulator. See [here](https://github.com/TensorSpeech/TensorFlowTTS/pull/377)
 - 2020/08/23 Add Parallel WaveGAN tensorflow implementation. See [here](https://github.com/TensorSpeech/TensorFlowTTS/tree/master/examples/parallel_wavegan)
-- 2020/08/23 Add MBMelGAN G + ParallelWaveGAN G example. See [here](https://github.com/TensorSpeech/TensorFlowTTS/tree/master/examples/multiband_pwgan)
 - 2020/08/20 Add C++ inference code. Thank [@ZDisket](https://github.com/ZDisket). See [here](https://github.com/TensorSpeech/TensorFlowTTS/tree/master/examples/cppwin)
 - 2020/08/18 Update [new base processor](https://github.com/TensorSpeech/TensorFlowTTS/blob/master/tensorflow_tts/processor/base_processor.py). Add [AutoProcessor](https://github.com/TensorSpeech/TensorFlowTTS/blob/master/tensorflow_tts/inference/auto_processor.py) and [pretrained processor](https://github.com/TensorSpeech/TensorFlowTTS/blob/master/tensorflow_tts/processor/pretrained/) json file
 - 2020/08/14 Support Chinese TTS. Pls see the [colab](https://colab.research.google.com/drive/1YpSHRBRPBI7cnTkQn1UcVTWEQVbsUm1S?usp=sharing). Thank [@azraelkuan](https://github.com/azraelkuan)
@@ -227,7 +226,7 @@ To know how to train model from scratch or fine-tune with other datasets/languag
 - For MelGAN + STFT Loss tutorial, pls see [examples/melgan.stft](https://github.com/tensorspeech/TensorFlowTTS/tree/master/examples/melgan.stft)
 - For Multiband-MelGAN tutorial, pls see [examples/multiband_melgan](https://github.com/tensorspeech/TensorFlowTTS/tree/master/examples/multiband_melgan)
 - For Parallel WaveGAN tutorial, pls see [examples/parallel_wavegan](https://github.com/tensorspeech/TensorFlowTTS/tree/master/examples/parallel_wavegan)
-- For Multiband-MelGAN Generator + Parallel WaveGAN Discriminator tutorial, pls see [examples/multiband_pwgan](https://github.com/tensorspeech/TensorFlowTTS/tree/master/examples/multiband_pwgan)
+- For Multiband-MelGAN Generator + HiFi-GAN tutorial, pls see [examples/multiband_melgan_hf](https://github.com/tensorspeech/TensorFlowTTS/tree/master/examples/multiband_melgan_hf)
 - For HiFi-GAN tutorial, pls see [examples/hifigan](https://github.com/tensorspeech/TensorFlowTTS/tree/master/examples/hifigan)
 # Abstract Class Explanation
 
 
@@ -0,0 +1,162 @@
+# -*- coding: utf-8 -*-
+# Copyright 2020 Minh Nguyen (@dathudeptrai)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Decode trained FastSpeech from folders."""
+
+import argparse
+import logging
+import os
+import sys
+
+sys.path.append(".")
+
+import numpy as np
+import tensorflow as tf
+import yaml
+from tqdm import tqdm
+
+from examples.fastspeech2.fastspeech2_dataset import CharactorDurationF0EnergyMelDataset
+from tensorflow_tts.configs import FastSpeech2Config
+from tensorflow_tts.models import TFFastSpeech2
+
+
+def main():
+    """Extract FastSpeech2 post-net mel-spectrograms for a folder of utterances.
+
+    Parses the command-line arguments, loads the YAML config and the model
+    checkpoint, runs the model over every sample found under ``--rootdir``
+    and saves one float32 ``<utt_id>-postnet.npy`` file per utterance into
+    ``<outdir>/postnets``.
+
+    Raises:
+        ValueError: If the ``format`` entry of the config is not ``"npy"``.
+    """
+    parser = argparse.ArgumentParser(
+        description="Decode soft-mel features from charactor with trained FastSpeech "
+        "(See detail in examples/fastspeech2/decode_fastspeech2.py)."
+    )
+    parser.add_argument(
+        "--rootdir",
+        default=None,
+        type=str,
+        required=True,
+        help="directory including ids/durations files.",
+    )
+    parser.add_argument(
+        "--outdir", type=str, required=True, help="directory to save generated speech."
+    )
+    parser.add_argument(
+        "--checkpoint", type=str, required=True, help="checkpoint file to be loaded."
+    )
+    parser.add_argument(
+        "--config",
+        default=None,
+        type=str,
+        required=True,
+        # The option is required, so there is no fallback search to advertise.
+        help="yaml format configuration file.",
+    )
+    parser.add_argument(
+        "--batch-size",
+        default=8,
+        type=int,
+        required=False,
+        # Kept for command-line compatibility; the dataset below is always
+        # created with batch_size=1.
+        help="Batch size for inference (currently ignored: extraction always "
+        "runs with a batch size of 1).",
+    )
+    parser.add_argument(
+        "--verbose",
+        type=int,
+        default=1,
+        help="logging level. higher is more logging. (default=1)",
+    )
+    args = parser.parse_args()
+
+    # set logger: map the verbosity to a level once, then configure once
+    if args.verbose > 1:
+        log_level = logging.DEBUG
+    elif args.verbose > 0:
+        log_level = logging.INFO
+    else:
+        log_level = logging.WARN
+    logging.basicConfig(
+        level=log_level,
+        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+    )
+    if log_level == logging.WARN:
+        logging.warning("Skip DEBUG/INFO messages")
+
+    # make sure the output directories exist; creating the nested "postnets"
+    # folder also creates args.outdir, and exist_ok avoids the check-then-create race
+    outdpost = os.path.join(args.outdir, "postnets")
+    os.makedirs(outdpost, exist_ok=True)
+
+    # load config
+    # NOTE(review): yaml.Loader can construct arbitrary Python objects, so only
+    # trusted config files should be passed here (yaml.safe_load is the safe
+    # alternative if the configs only contain plain YAML types).
+    with open(args.config) as f:
+        config = yaml.load(f, Loader=yaml.Loader)
+    # command-line values override entries of the same name in the config
+    config.update(vars(args))
+
+    if config["format"] == "npy":
+        char_query = "*-ids.npy"
+        char_load_fn = np.load
+    else:
+        raise ValueError("Only npy is supported.")
+
+    # define data-loader
+    dataset = CharactorDurationF0EnergyMelDataset(
+        root_dir=args.rootdir,
+        charactor_query=char_query,
+        charactor_load_fn=char_load_fn,
+    )
+    dataset = dataset.create(
+        batch_size=1
+    )  # force batch size to 1 otherwise it may miss certain files
+
+    # define model and load checkpoint
+    fastspeech2 = TFFastSpeech2(
+        config=FastSpeech2Config(**config["fastspeech2_params"]), name="fastspeech2"
+    )
+    fastspeech2._build()
+    fastspeech2.load_weights(args.checkpoint)
+    fastspeech2 = tf.function(fastspeech2, experimental_relax_shapes=True)
+
+    for data in tqdm(dataset, desc="Decoding"):
+        utt_ids = data["utt_ids"]
+        mel_lens = data["mel_lengths"]
+
+        # Only the second output (mel after the post-net) is saved.
+        # NOTE(review): training=True presumably keeps dropout active while
+        # extracting - confirm this is intended.
+        _, masked_mel_after, _, _, _ = fastspeech2(**data, training=True)
+
+        # convert to numpy
+        masked_mel_afters = masked_mel_after.numpy()
+
+        for utt_id, mel_after, mel_len in zip(utt_ids, masked_mel_afters, mel_lens):
+            utt_id = utt_id.numpy().decode("utf-8")
+
+            # trim the padded frames before saving
+            np.save(
+                os.path.join(outdpost, f"{utt_id}-postnet.npy"),
+                mel_after[:mel_len, :].astype(np.float32),
+                allow_pickle=False,
+            )
+
+
+# Run the extraction only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,94 @@
+
+# Multi-band MelGAN: Faster Waveform Generation for High-Quality Text-to-Speech
+Based on the script [`train_multiband_melgan_hf.py`](https://github.com/dathudeptrai/TensorflowTTS/tree/master/examples/multiband_melgan_hf/train_multiband_melgan_hf.py).
+
+## Training Multi-band MelGAN from scratch with LJSpeech dataset.
+This example code shows you how to train Multi-band MelGAN from scratch with TensorFlow 2, based on a custom training loop and tf.function. The data used for this example is LJSpeech Ultimate; you can download the dataset at [link](https://machineexperiments.tumblr.com/post/662408083204685824/ljspeech-ultimate).
+
+### Step 1: Create Tensorflow based Dataloader (tf.dataset)
+Please see detail at [examples/melgan/](https://github.com/dathudeptrai/TensorflowTTS/tree/master/examples/melgan#step-1-create-tensorflow-based-dataloader-tfdataset)
+
+### Step 2: Training from scratch
+After you redefine your dataloader, please modify the input arguments, train_dataset and valid_dataset in [`train_multiband_melgan_hf.py`](https://github.com/dathudeptrai/TensorflowTTS/tree/master/examples/multiband_melgan_hf/train_multiband_melgan_hf.py). Here is an example command line to train the model from scratch:
+
+First, you need to train the generator with only the STFT loss:
+
+```bash
+CUDA_VISIBLE_DEVICES=0 python examples/multiband_melgan_hf/train_multiband_melgan_hf.py \
+  --train-dir ./dump/train/ \
+  --dev-dir ./dump/valid/ \
+  --outdir ./examples/multiband_melgan_hf/exp/train.multiband_melgan_hf.v1/ \
+  --config ./examples/multiband_melgan_hf/conf/multiband_melgan_hf.lju.v1.yml \
+  --use-norm 1 \
+  --generator_mixed_precision 1 \
+  --resume ""
+```
+
+Then resume and start training generator + discriminator:
+
+```bash
+CUDA_VISIBLE_DEVICES=0 python examples/multiband_melgan_hf/train_multiband_melgan_hf.py \
+  --train-dir ./dump/train/ \
+  --dev-dir ./dump/valid/ \
+  --outdir ./examples/multiband_melgan_hf/exp/train.multiband_melgan_hf.v1/ \
+  --config ./examples/multiband_melgan_hf/conf/multiband_melgan_hf.lju.v1.yml \
+  --use-norm 1 \
+  --resume ./examples/multiband_melgan_hf/exp/train.multiband_melgan_hf.v1/checkpoints/ckpt-200000
+```
+
+If you want to use multiple GPUs for training, you can replace `CUDA_VISIBLE_DEVICES=0` with `CUDA_VISIBLE_DEVICES=0,1,2,3`, for example. You also need to tune the `batch_size` for each GPU (in the config file) yourself to maximize performance. Note that multi-GPU is currently supported for training but not yet for decoding.
+
+In case you want to resume the training progress, please follow the example command line below:
+
+```bash
+--resume ./examples/multiband_melgan_hf/exp/train.multiband_melgan_hf.v1/checkpoints/ckpt-100000
+```
+
+If you want to finetune a model, use `--pretrained` like this, with the filenames of the generator and discriminator separated by a comma.
+```bash
+--pretrained ptgenerator.h5,ptdiscriminator.h5
+```
+It is recommended that you first train the text2mel model and then extract its postnet outputs, so that the vocoder learns to compensate for the text2mel model's flaws. If you do so, append `--postnets 1` to the arguments.
+
+
+
+**IMPORTANT NOTES**:
+
+- If your dataset is 16K, upsample_scales = [2, 4, 8] works.
+- If your dataset is > 16K (22K, 24K, ...), upsample_scales = [2, 4, 8] does not work; use [8, 4, 2] instead.
+- Mixed precision makes Group Convolution training slower on the discriminator; both PyTorch (apex) and TensorFlow have this problem. So, **DO NOT USE** mixed precision when the discriminator is enabled.
+
+### Step 3: Decode audio from folder mel-spectrogram
+To run inference on a folder of mel-spectrograms (e.g. the valid folder), run the command line below:
+
+```bash
+CUDA_VISIBLE_DEVICES=0 python examples/multiband_melgan_hf/decode_mb_melgan.py \
+  --rootdir ./dump/valid/ \
+  --outdir ./prediction/multiband_melgan_hf.v1/ \
+  --checkpoint ./examples/multiband_melgan_hf/exp/train.multiband_melgan_hf.v1/checkpoints/generator-920000.h5 \
+  --config ./examples/multiband_melgan_hf/conf/multiband_melgan_hf.lju.v1.yml \
+  --batch-size 32 \
+  --use-norm 1
+```
+
+## Finetune MelGAN STFT with ljspeech pretrained on other languages
+Just load the pretrained model and start training with the other language's data. **DO NOT FORGET** to re-preprocess your dataset if needed. The hop_size should be 512 if you want to use our pretrained model.
+
+## Learning Curves
+Here are the learning curves of Multi-band MelGAN based on this config [`multiband_melgan_hf.lju.v1.yml`](https://github.com/dathudeptrai/TensorflowTTS/tree/master/examples/multiband_melgan_hf/conf/multiband_melgan_hf.lju.v1.yml)
+
+<img src="fig/eval.png" height="300" width="850">
+
+<img src="fig/train.png" height="300" width="850">
+
+## Pretrained Models and Audio samples
+| Model                                                                                                          | Conf                                                                                                                        | Lang  | Fs [Hz] | Mel range [Hz] | FFT / Hop / Win [pt] | # iters | Notes |
+| :------                                                                                                        | :---:                                                                                                                       | :---: | :----:  | :--------:     | :---------------:    | :-----:   | :-----: |
+| [multiband_melgan_hf.lju.v1](https://drive.google.com/drive/folders/1tOMzik_Nr4eY63gooKYSmNTJyXC6Pp55?usp=sharing)             | [link](https://github.com/tensorspeech/TensorFlowTTS/tree/master/examples/multiband_melgan_hf/conf/multiband_melgan_hf.lju.v1.yml)          | EN    | 44.1k  | 20-11025        | 2048 / 512 / 2048     | 920K    |  -|
+
+
+## Reference
+
+1. https://github.com/kan-bayashi/ParallelWaveGAN
+2. [Parallel WaveGAN: A fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram](https://arxiv.org/abs/1910.11480)
+3. [Multi-band MelGAN: Faster Waveform Generation for High-Quality Text-to-Speech](https://arxiv.org/abs/2005.05106)
@@ -1,14 +1,14 @@
 
-# This is the hyperparameter configuration file for Multi-Band MelGAN with PWGAN discriminator.
+# This is the hyperparameter configuration file for Multi-Band MelGAN + MPD
 # Please make sure this is adjusted for the LJSpeech dataset. If you want to
 # apply to the other dataset, you might need to carefully change some parameters.
 # This configuration performs 1000k iters.
 
 ###########################################################
 #                FEATURE EXTRACTION SETTING               #
 ###########################################################
-sampling_rate: 22050
-hop_size: 256            # Hop size.
+sampling_rate: 44100
+hop_size: 512            # Hop size.
 format: "npy"
 
 
@@ -21,23 +21,41 @@ multiband_melgan_generator_params:
     out_channels: 4               # Number of output channels (number of subbands).
     kernel_size: 7                # Kernel size of initial and final conv layers.
     filters: 384                  # Initial number of channels for conv layers.
-    upsample_scales: [8, 4, 2]    # List of Upsampling scales.
+    upsample_scales: [4, 4, 8]    # List of Upsampling scales.
     stack_kernel_size: 3          # Kernel size of dilated conv layers in residual stack.
     stacks: 4                     # Number of stacks in a single residual stack module.
     is_weight_norm: false         # Use weight-norm or not.
 
 ###########################################################
 #       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
 ###########################################################
-parallel_wavegan_discriminator_params:
-    out_channels: 1       # Number of output channels.
-    kernel_size: 3        # Number of output channels.
-    n_layers: 10            # Number of conv layers.
-    conv_channels: 64     # Number of chnn layers.
-    use_bias: true            # Whether to use bias parameter in conv.
-    nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
-    nonlinear_activation_params:      # Nonlinear function parameters
-        alpha: 0.2           # Alpha in LeakyReLU.
+multiband_melgan_discriminator_params:
+    out_channels: 1                   # Number of output channels.
+    scales: 3                         # Number of multi-scales.
+    downsample_pooling: "AveragePooling1D"   # Pooling type for the input downsampling.
+    downsample_pooling_params:        # Parameters of the above pooling function.
+        pool_size: 4
+        strides: 2
+    kernel_sizes: [5, 3]              # List of kernel size.
+    filters: 16                       # Number of channels of the initial conv layer.
+    max_downsample_filters: 512       # Maximum number of channels of downsampling layers.
+    downsample_scales: [4, 4, 4]      # List of downsampling scales.
+    nonlinear_activation: "LeakyReLU" # Nonlinear activation function.
+    nonlinear_activation_params:      # Parameters of nonlinear activation function.
+        alpha: 0.2
+    is_weight_norm: false             # Use weight-norm or not.
+
+hifigan_discriminator_params:
+    out_channels: 1                     # Number of output channels (number of subbands).
+    period_scales: [3, 5, 7, 11, 17, 23, 37]     # List of period scales.
+    n_layers: 5                         # Number of layer of each period discriminator.
+    kernel_size: 5                      # Kernel size.
+    strides: 3                          # Strides
+    filters: 8                          # In Conv filters of each period discriminator
+    filter_scales: 4                    # Filter scales.
+    max_filters: 512                   # maximum filters of period discriminator's conv.
+    is_weight_norm: false               # Use weight-norm or not.
+
 
 ###########################################################
 #                   STFT LOSS SETTING                     #
@@ -61,7 +79,7 @@ lambda_adv: 2.5              # Loss balancing coefficient for adversarial loss.
 ###########################################################
 #                  DATA LOADER SETTING                    #
 ###########################################################
-batch_size: 64                 # Batch size for each GPU with assuming that gradient_accumulation_steps == 1.
+batch_size: 64                 # Batch size.
 batch_max_steps: 8192          # Length of each audio in batch for training. Make sure it is divisible by hop_size.
 batch_max_steps_valid: 81920   # Length of each audio for validation. Make sure it is divisible by hop_size.
 remove_short_samples: true     # Whether to remove samples the length of which are less than batch_max_steps.
@@ -79,22 +97,24 @@ generator_optimizer_params:
     amsgrad: false
 
 discriminator_optimizer_params:
-    lr_fn: "ExponentialDecay"
+    lr_fn: "PiecewiseConstantDecay"
     lr_params: 
-        initial_learning_rate: 0.0005
-        decay_steps: 200000
-        decay_rate: 0.5
+        boundaries: [100000, 200000, 300000, 400000, 500000]
+        values: [0.00025, 0.000125, 0.0000625, 0.00003125, 0.000015625, 0.000001]
+
+    amsgrad: false
 
-gradient_accumulation_steps: 1
 ###########################################################
 #                    INTERVAL SETTING                     #
 ###########################################################
 discriminator_train_start_steps: 200000  # steps begin training discriminator
-train_max_steps: 4000000                 # Number of training steps.
+train_max_steps: 1500000                 # Number of training steps.
 save_interval_steps: 20000               # Interval steps to save checkpoint.
 eval_interval_steps: 5000                # Interval steps to evaluate the network.
 log_interval_steps: 200                  # Interval steps to record the training log.
 
+
+gradient_accumulation_steps: 1
 ###########################################################
 #                     OTHER SETTING                       #
 ###########################################################