@@ -49,7 +49,13 @@ def preprocess(model, in_dir, out_dir, text, audio_filename, mel_filename,
49
49
model .make_generation_fast_ ()
50
50
51
51
mel_org = np .load (join (in_dir , mel_filename ))
52
- mel = Variable (torch .from_numpy (mel_org )).unsqueeze (0 ).contiguous ()
52
+ # zero padding
53
+ b_pad = r # imitates initial state
54
+ e_pad = r - len (mel_org ) % r if len (mel_org ) % r > 0 else 0
55
+ mel = np .pad (mel_org , [(b_pad , e_pad ), (0 , 0 )],
56
+ mode = "constant" , constant_values = 0 )
57
+
58
+ mel = Variable (torch .from_numpy (mel )).unsqueeze (0 ).contiguous ()
53
59
54
60
# Downsample mel spectrogram
55
61
if downsample_step > 1 :
@@ -78,10 +84,10 @@ def preprocess(model, in_dir, out_dir, text, audio_filename, mel_filename,
78
84
frame_positions = frame_positions , speaker_ids = speaker_ids )
79
85
80
86
mel_output = mel_outputs [0 ].data .cpu ().numpy ()
81
-
82
87
# **Time resolution adjustment**
83
- # remove begenning audio used for first mel prediction
84
- wav = np .load (join (in_dir , audio_filename ))[hparams .hop_size * downsample_step :]
88
+ mel_output = mel_output [:- (b_pad + e_pad )]
89
+
90
+ wav = np .load (join (in_dir , audio_filename ))
85
91
assert len (wav ) % hparams .hop_size == 0
86
92
87
93
# Coarse upsample just for convenience
@@ -92,18 +98,13 @@ def preprocess(model, in_dir, out_dir, text, audio_filename, mel_filename,
92
98
# the original mel length
93
99
assert mel_output .shape [0 ] >= mel_org .shape [0 ]
94
100
95
- # Trim mel output
96
- expected_frames = len (wav ) // hparams .hop_size
97
- mel_output = mel_output [:expected_frames ]
98
-
99
101
# Make sure we have correct lengths
100
102
assert mel_output .shape [0 ] * hparams .hop_size == len (wav )
101
103
102
104
timesteps = len (wav )
103
105
104
106
# save
105
- np .save (join (out_dir , audio_filename ), wav .astype (np .int16 ),
106
- allow_pickle = False )
107
+ np .save (join (out_dir , audio_filename ), wav , allow_pickle = False )
107
108
np .save (join (out_dir , mel_filename ), mel_output .astype (np .float32 ),
108
109
allow_pickle = False )
109
110
0 commit comments