Skip to content

Commit 34358d8

Browse files
committed
2 parents 8786f59 + cd3a5e1 commit 34358d8

23 files changed

+1265
-205
lines changed

README.md

+1-2
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
- 2020/11/24 Add HiFi-GAN vocoder. See [here](https://github.com/TensorSpeech/TensorFlowTTS/tree/master/examples/hifigan)
2929
- 2020/11/19 Add Multi-GPU gradient accumulator. See [here](https://github.com/TensorSpeech/TensorFlowTTS/pull/377)
3030
- 2020/08/23 Add Parallel WaveGAN tensorflow implementation. See [here](https://github.com/TensorSpeech/TensorFlowTTS/tree/master/examples/parallel_wavegan)
31-
- 2020/08/23 Add MBMelGAN G + ParallelWaveGAN G example. See [here](https://github.com/TensorSpeech/TensorFlowTTS/tree/master/examples/multiband_pwgan)
3231
- 2020/08/20 Add C++ inference code. Thank [@ZDisket](https://github.com/ZDisket). See [here](https://github.com/TensorSpeech/TensorFlowTTS/tree/master/examples/cppwin)
3332
- 2020/08/18 Update [new base processor](https://github.com/TensorSpeech/TensorFlowTTS/blob/master/tensorflow_tts/processor/base_processor.py). Add [AutoProcessor](https://github.com/TensorSpeech/TensorFlowTTS/blob/master/tensorflow_tts/inference/auto_processor.py) and [pretrained processor](https://github.com/TensorSpeech/TensorFlowTTS/blob/master/tensorflow_tts/processor/pretrained/) json file
3433
- 2020/08/14 Support Chinese TTS. Pls see the [colab](https://colab.research.google.com/drive/1YpSHRBRPBI7cnTkQn1UcVTWEQVbsUm1S?usp=sharing). Thank [@azraelkuan](https://github.com/azraelkuan)
@@ -227,7 +226,7 @@ To know how to train model from scratch or fine-tune with other datasets/languag
227226
- For MelGAN + STFT Loss tutorial, pls see [examples/melgan.stft](https://github.com/tensorspeech/TensorFlowTTS/tree/master/examples/melgan.stft)
228227
- For Multiband-MelGAN tutorial, pls see [examples/multiband_melgan](https://github.com/tensorspeech/TensorFlowTTS/tree/master/examples/multiband_melgan)
229228
- For Parallel WaveGAN tutorial, pls see [examples/parallel_wavegan](https://github.com/tensorspeech/TensorFlowTTS/tree/master/examples/parallel_wavegan)
230-
- For Multiband-MelGAN Generator + Parallel WaveGAN Discriminator tutorial, pls see [examples/multiband_pwgan](https://github.com/tensorspeech/TensorFlowTTS/tree/master/examples/multiband_pwgan)
229+
- For Multiband-MelGAN Generator + HiFi-GAN tutorial, pls see [examples/multiband_melgan_hf](https://github.com/tensorspeech/TensorFlowTTS/tree/master/examples/multiband_melgan_hf)
231230
- For HiFi-GAN tutorial, pls see [examples/hifigan](https://github.com/tensorspeech/TensorFlowTTS/tree/master/examples/hifigan)
232231
# Abstract Class Explanation
233232

+162
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
# -*- coding: utf-8 -*-
2+
# Copyright 2020 Minh Nguyen (@dathudeptrai)
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""Decode trained FastSpeech from folders."""
16+
17+
import argparse
18+
import logging
19+
import os
20+
import sys
21+
22+
sys.path.append(".")
23+
24+
import numpy as np
25+
import tensorflow as tf
26+
import yaml
27+
from tqdm import tqdm
28+
29+
from examples.fastspeech2.fastspeech2_dataset import CharactorDurationF0EnergyMelDataset
30+
from tensorflow_tts.configs import FastSpeech2Config
31+
from tensorflow_tts.models import TFFastSpeech2
32+
33+
34+
def main():
    """Extract FastSpeech2 postnet mel-spectrograms for a folder of utterances.

    Parses the command-line arguments, loads the YAML config and the model
    checkpoint, runs the model on every item of the dataset found in
    ``--rootdir`` and saves one ``<utt_id>-postnet.npy`` file per utterance
    under ``<outdir>/postnets``.

    Raises:
        ValueError: If the ``format`` entry of the config is not ``"npy"``.
    """
    parser = argparse.ArgumentParser(
        description="Decode soft-mel features from charactor with trained FastSpeech "
        "(See detail in examples/fastspeech2/decode_fastspeech2.py)."
    )
    parser.add_argument(
        "--rootdir",
        default=None,
        type=str,
        required=True,
        help="directory including ids/durations files.",
    )
    parser.add_argument(
        "--outdir", type=str, required=True, help="directory to save generated speech."
    )
    parser.add_argument(
        "--checkpoint", type=str, required=True, help="checkpoint file to be loaded."
    )
    parser.add_argument(
        "--config",
        default=None,
        type=str,
        required=True,
        # The argument is required, so there is no fallback search to advertise.
        help="yaml format configuration file.",
    )
    parser.add_argument(
        "--batch-size",
        default=8,
        type=int,
        required=False,
        # Kept for command-line compatibility; the dataset below is always
        # created with batch_size=1, so this value has no effect.
        help="Batch size for inference (currently ignored: batch size is forced to 1).",
    )
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)",
    )
    args = parser.parse_args()

    # set logger: only the level depends on --verbose, the format is shared
    if args.verbose > 1:
        log_level = logging.DEBUG
    elif args.verbose > 0:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )
    if log_level == logging.WARNING:
        logging.warning("Skip DEBUG/INFO messages")

    # create the output directories; makedirs also creates args.outdir itself
    # and exist_ok avoids the check-then-create race
    outdpost = os.path.join(args.outdir, "postnets")
    os.makedirs(outdpost, exist_ok=True)

    # load config
    # NOTE(review): yaml.Loader can construct arbitrary Python objects; only
    # pass trusted config files, or switch to yaml.safe_load if the configs
    # never rely on Python-specific tags.
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))

    if config["format"] == "npy":
        char_query = "*-ids.npy"
        char_load_fn = np.load
    else:
        raise ValueError("Only npy is supported.")

    # define data-loader
    dataset = CharactorDurationF0EnergyMelDataset(
        root_dir=args.rootdir,
        charactor_query=char_query,
        charactor_load_fn=char_load_fn,
    )
    dataset = dataset.create(
        batch_size=1
    )  # force batch size to 1 otherwise it may miss certain files

    # define model and load checkpoint
    fastspeech2 = TFFastSpeech2(
        config=FastSpeech2Config(**config["fastspeech2_params"]), name="fastspeech2"
    )
    fastspeech2._build()
    fastspeech2.load_weights(args.checkpoint)
    fastspeech2 = tf.function(fastspeech2, experimental_relax_shapes=True)

    for data in tqdm(dataset, desc="Decoding"):
        utt_ids = data["utt_ids"]
        mel_lens = data["mel_lengths"]

        # fastspeech inference.
        # NOTE(review): the model is called with training=True, which
        # presumably keeps training-only layers (e.g. dropout) active while
        # extracting features - confirm this is intended.
        _, masked_mel_after, _, _, _ = fastspeech2(**data, training=True)

        # convert to numpy; only the postnet output is saved
        masked_mel_afters = masked_mel_after.numpy()

        for utt_id, mel_after, mel_len in zip(utt_ids, masked_mel_afters, mel_lens):
            utt_id = utt_id.numpy().decode("utf-8")

            # trim to the length given by the dataset before saving
            np.save(
                os.path.join(outdpost, f"{utt_id}-postnet.npy"),
                mel_after[:mel_len, :].astype(np.float32),
                allow_pickle=False,
            )
159+
160+
161+
if __name__ == "__main__":
162+
main()
+94
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
2+
# Multi-band MelGAN: Faster Waveform Generation for High-Quality Text-to-Speech
3+
Based on the script [`train_multiband_melgan_hf.py`](https://github.com/dathudeptrai/TensorflowTTS/tree/master/examples/multiband_melgan_hf/train_multiband_melgan_hf.py).
4+
5+
## Training Multi-band MelGAN from scratch with LJSpeech dataset.
6+
This example shows you how to train Multi-band MelGAN from scratch with TensorFlow 2, based on a custom training loop and tf.function. The data used for this example is LJSpeech Ultimate; you can download the dataset at [link](https://machineexperiments.tumblr.com/post/662408083204685824/ljspeech-ultimate).
7+
8+
### Step 1: Create Tensorflow based Dataloader (tf.dataset)
9+
Please see detail at [examples/melgan/](https://github.com/dathudeptrai/TensorflowTTS/tree/master/examples/melgan#step-1-create-tensorflow-based-dataloader-tfdataset)
10+
11+
### Step 2: Training from scratch
12+
After you redefine your dataloader, please modify the input arguments, train_dataset and valid_dataset in [`train_multiband_melgan_hf.py`](https://github.com/dathudeptrai/TensorflowTTS/tree/master/examples/multiband_melgan_hf/train_multiband_melgan_hf.py). Here is an example command line to train the model from scratch:
13+
14+
First, you need to train the generator with only the STFT loss:
15+
16+
```bash
17+
CUDA_VISIBLE_DEVICES=0 python examples/multiband_melgan_hf/train_multiband_melgan_hf.py \
18+
--train-dir ./dump/train/ \
19+
--dev-dir ./dump/valid/ \
20+
--outdir ./examples/multiband_melgan_hf/exp/train.multiband_melgan_hf.v1/ \
21+
--config ./examples/multiband_melgan_hf/conf/multiband_melgan_hf.lju.v1.yml \
22+
--use-norm 1 \
23+
--generator_mixed_precision 1 \
24+
--resume ""
25+
```
26+
27+
Then resume and start training generator + discriminator:
28+
29+
```bash
30+
CUDA_VISIBLE_DEVICES=0 python examples/multiband_melgan_hf/train_multiband_melgan_hf.py \
31+
--train-dir ./dump/train/ \
32+
--dev-dir ./dump/valid/ \
33+
--outdir ./examples/multiband_melgan_hf/exp/train.multiband_melgan_hf.v1/ \
34+
--config ./examples/multiband_melgan_hf/conf/multiband_melgan_hf.lju.v1.yml \
35+
--use-norm 1 \
36+
--resume ./examples/multiband_melgan_hf/exp/train.multiband_melgan_hf.v1/checkpoints/ckpt-200000
37+
```
38+
39+
If you want to use multiple GPUs for training, you can replace `CUDA_VISIBLE_DEVICES=0` with `CUDA_VISIBLE_DEVICES=0,1,2,3`, for example. You also need to tune the `batch_size` for each GPU (in the config file) yourself to maximize performance. Note that multi-GPU is currently supported for training but not yet for decoding.
40+
41+
In case you want to resume training, please follow the example command line below:
42+
43+
```bash
44+
--resume ./examples/multiband_melgan_hf/exp/train.multiband_melgan_hf.v1/checkpoints/ckpt-100000
45+
```
46+
47+
If you want to finetune a model, use `--pretrained` like this with the filename of the generator and discriminator, separated by comma.
48+
```bash
49+
--pretrained ptgenerator.h5,ptdiscriminator.h5
50+
```
51+
It is recommended that you first train a text2mel model and then extract its postnet outputs, so that the vocoder learns to compensate for the text2mel model's flaws. If you do so, append `--postnets 1` to the arguments.
52+
53+
54+
55+
**IMPORTANT NOTES**:
56+
57+
- If Your Dataset is 16K, upsample_scales = [2, 4, 8] worked.
58+
- If your dataset is > 16K (22K, 24K, ...), upsample_scales = [2, 4, 8] didn't work; use [8, 4, 2] instead.
59+
- Mixed precision makes Group Convolution training slower on the Discriminator; both PyTorch (apex) and TensorFlow have this problem. So, **DO NOT USE** mixed precision when the discriminator is enabled.
60+
61+
### Step 3: Decode audio from folder mel-spectrogram
62+
To run inference on a folder of mel-spectrograms (e.g. the valid folder), run the command line below:
63+
64+
```bash
65+
CUDA_VISIBLE_DEVICES=0 python examples/multiband_melgan_hf/decode_mb_melgan.py \
66+
--rootdir ./dump/valid/ \
67+
--outdir ./prediction/multiband_melgan_hf.v1/ \
68+
--checkpoint ./examples/multiband_melgan_hf/exp/train.multiband_melgan_hf.v1/checkpoints/generator-920000.h5 \
69+
--config ./examples/multiband_melgan_hf/conf/multiband_melgan_hf.lju.v1.yml \
70+
--batch-size 32 \
71+
--use-norm 1
72+
```
73+
74+
## Finetune MelGAN STFT with ljspeech pretrained on other languages
75+
Just load the pretrained model and start training on the other language's dataset. **DO NOT FORGET** to re-preprocess your dataset if needed. The hop_size should be 512 if you want to use our pretrained model.
76+
77+
## Learning Curves
78+
Here are the learning curves of melgan based on this config [`multiband_melgan_hf.v1.yaml`](https://github.com/dathudeptrai/TensorflowTTS/tree/master/examples/multiband_melgan_hf/conf/multiband_melgan_hf.v1.yaml)
79+
80+
<img src="fig/eval.png" height="300" width="850">
81+
82+
<img src="fig/train.png" height="300" width="850">
83+
84+
## Pretrained Models and Audio samples
85+
| Model | Conf | Lang | Fs [Hz] | Mel range [Hz] | FFT / Hop / Win [pt] | # iters | Notes |
86+
| :------ | :---: | :---: | :----: | :--------: | :---------------: | :-----: | :-----: |
87+
| [multiband_melgan_hf.lju.v1](https://drive.google.com/drive/folders/1tOMzik_Nr4eY63gooKYSmNTJyXC6Pp55?usp=sharing) | [link](https://github.com/tensorspeech/TensorFlowTTS/tree/master/examples/multiband_melgan_hf/conf/multiband_melgan_hf.lju.v1.yml) | EN | 44.1k | 20-11025 | 2048 / 512 / 2048 | 920K | -|
88+
89+
90+
## Reference
91+
92+
1. https://github.com/kan-bayashi/ParallelWaveGAN
93+
2. [Parallel WaveGAN: A fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram](https://arxiv.org/abs/1910.11480)
94+
3. [Multi-band MelGAN: Faster Waveform Generation for High-Quality Text-to-Speech](https://arxiv.org/abs/2005.05106)

examples/multiband_pwgan/conf/multiband_pwgan.v1.yaml renamed to examples/multiband_melgan_hf/conf/multiband_melgan_hf.lju.v1.yml

+40-20
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11

2-
# This is the hyperparameter configuration file for Multi-Band MelGAN with PWGAN discriminator.
2+
# This is the hyperparameter configuration file for Multi-Band MelGAN + MPD
33
# Please make sure this is adjusted for the LJSpeech dataset. If you want to
44
# apply to the other dataset, you might need to carefully change some parameters.
55
# This configuration performs 1000k iters.
66

77
###########################################################
88
# FEATURE EXTRACTION SETTING #
99
###########################################################
10-
sampling_rate: 22050
11-
hop_size: 256 # Hop size.
10+
sampling_rate: 44100
11+
hop_size: 512 # Hop size.
1212
format: "npy"
1313

1414

@@ -21,23 +21,41 @@ multiband_melgan_generator_params:
2121
out_channels: 4 # Number of output channels (number of subbands).
2222
kernel_size: 7 # Kernel size of initial and final conv layers.
2323
filters: 384 # Initial number of channels for conv layers.
24-
upsample_scales: [8, 4, 2] # List of Upsampling scales.
24+
upsample_scales: [4, 4, 8] # List of Upsampling scales.
2525
stack_kernel_size: 3 # Kernel size of dilated conv layers in residual stack.
2626
stacks: 4 # Number of stacks in a single residual stack module.
2727
is_weight_norm: false # Use weight-norm or not.
2828

2929
###########################################################
3030
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
3131
###########################################################
32-
parallel_wavegan_discriminator_params:
33-
out_channels: 1 # Number of output channels.
34-
kernel_size: 3 # Number of output channels.
35-
n_layers: 10 # Number of conv layers.
36-
conv_channels: 64 # Number of chnn layers.
37-
use_bias: true # Whether to use bias parameter in conv.
38-
nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
39-
nonlinear_activation_params: # Nonlinear function parameters
40-
alpha: 0.2 # Alpha in LeakyReLU.
32+
multiband_melgan_discriminator_params:
33+
out_channels: 1 # Number of output channels.
34+
scales: 3 # Number of multi-scales.
35+
downsample_pooling: "AveragePooling1D" # Pooling type for the input downsampling.
36+
downsample_pooling_params: # Parameters of the above pooling function.
37+
pool_size: 4
38+
strides: 2
39+
kernel_sizes: [5, 3] # List of kernel size.
40+
filters: 16 # Number of channels of the initial conv layer.
41+
max_downsample_filters: 512 # Maximum number of channels of downsampling layers.
42+
downsample_scales: [4, 4, 4] # List of downsampling scales.
43+
nonlinear_activation: "LeakyReLU" # Nonlinear activation function.
44+
nonlinear_activation_params: # Parameters of nonlinear activation function.
45+
alpha: 0.2
46+
is_weight_norm: false # Use weight-norm or not.
47+
48+
hifigan_discriminator_params:
49+
out_channels: 1 # Number of output channels (number of subbands).
50+
period_scales: [3, 5, 7, 11, 17, 23, 37] # List of period scales.
51+
n_layers: 5 # Number of layer of each period discriminator.
52+
kernel_size: 5 # Kernel size.
53+
strides: 3 # Strides
54+
filters: 8 # In Conv filters of each period discriminator
55+
filter_scales: 4 # Filter scales.
56+
max_filters: 512 # maximum filters of period discriminator's conv.
57+
is_weight_norm: false # Use weight-norm or not.
58+
4159

4260
###########################################################
4361
# STFT LOSS SETTING #
@@ -61,7 +79,7 @@ lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss.
6179
###########################################################
6280
# DATA LOADER SETTING #
6381
###########################################################
64-
batch_size: 64 # Batch size for each GPU with assuming that gradient_accumulation_steps == 1.
82+
batch_size: 64 # Batch size.
6583
batch_max_steps: 8192 # Length of each audio in batch for training. Make sure it is divisible by hop_size.
6684
batch_max_steps_valid: 81920 # Length of each audio for validation. Make sure it is divisible by hop_size.
6785
remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
@@ -79,22 +97,24 @@ generator_optimizer_params:
7997
amsgrad: false
8098

8199
discriminator_optimizer_params:
82-
lr_fn: "ExponentialDecay"
100+
lr_fn: "PiecewiseConstantDecay"
83101
lr_params:
84-
initial_learning_rate: 0.0005
85-
decay_steps: 200000
86-
decay_rate: 0.5
102+
boundaries: [100000, 200000, 300000, 400000, 500000]
103+
values: [0.00025, 0.000125, 0.0000625, 0.00003125, 0.000015625, 0.000001]
104+
105+
amsgrad: false
87106

88-
gradient_accumulation_steps: 1
89107
###########################################################
90108
# INTERVAL SETTING #
91109
###########################################################
92110
discriminator_train_start_steps: 200000 # steps begin training discriminator
93-
train_max_steps: 4000000 # Number of training steps.
111+
train_max_steps: 1500000 # Number of training steps.
94112
save_interval_steps: 20000 # Interval steps to save checkpoint.
95113
eval_interval_steps: 5000 # Interval steps to evaluate the network.
96114
log_interval_steps: 200 # Interval steps to record the training log.
97115

116+
117+
gradient_accumulation_steps: 1
98118
###########################################################
99119
# OTHER SETTING #
100120
###########################################################

0 commit comments

Comments
 (0)