@@ -1,17 +1,20 @@
 import numpy as np
 import onnxruntime as ort
 
+
 def convert_pad_shape(pad_shape):
     layer = pad_shape[::-1]
     pad_shape = [item for sublist in layer for item in sublist]
     return pad_shape
 
+
 def sequence_mask(length, max_length=None):
     if max_length is None:
         max_length = length.max()
     x = np.arange(max_length, dtype=length.dtype)
     return np.expand_dims(x, 0) < np.expand_dims(length, 1)
 
+
 def generate_path(duration, mask):
     """
     duration: [b, 1, t_x]
@@ -28,8 +31,9 @@ def generate_path(duration, mask):
     path = np.expand_dims(path, 1).transpose(0, 1, 3, 2)
     return path
 
-class OnnxInferenceSession():
-    def __init__(self, path, Providers = ["CPUExecutionProvider"]):
+
+class OnnxInferenceSession:
+    def __init__(self, path, Providers=["CPUExecutionProvider"]):
         self.enc = ort.InferenceSession(path["enc"], providers=Providers)
         self.emb_g = ort.InferenceSession(path["emb_g"], providers=Providers)
         self.dp = ort.InferenceSession(path["dp"], providers=Providers)
@@ -38,43 +42,56 @@ def __init__(self, path, Providers = ["CPUExecutionProvider"]):
         self.dec = ort.InferenceSession(path["dec"], providers=Providers)
 
     def __call__(
-        self,
-        seq,
-        tone,
-        language,
-        bert_zh,
-        bert_jp,
-        bert_en,
-        emo,
-        sid,
-        seed = 114514,
-        seq_noise_scale = 0.8,
-        sdp_noise_scale = 0.6,
-        length_scale = 1.,
-        sdp_ratio = 0.
-    ):
-        g = self.emb_g.run(None, {'sid': sid.astype(np.int64),})[0]
+        self,
+        seq,
+        tone,
+        language,
+        bert_zh,
+        bert_jp,
+        bert_en,
+        emo,
+        sid,
+        seed=114514,
+        seq_noise_scale=0.8,
+        sdp_noise_scale=0.6,
+        length_scale=1.0,
+        sdp_ratio=0.0,
+    ):
+        g = self.emb_g.run(
+            None,
+            {
+                "sid": sid.astype(np.int64),
+            },
+        )[0]
         g = np.expand_dims(g, -1)
         enc_rtn = self.enc.run(
             None,
             {
-                "x": seq.astype(np.int64),
-                "t": tone.astype(np.int64),
-                "language": language.astype(np.int64),
-                "bert_0": bert_zh.astype(np.float32),
-                "bert_1": bert_jp.astype(np.float32),
-                "bert_2": bert_en.astype(np.float32),
-                "emo": emo.astype(np.float32),
-                "g": g.astype(np.float32)
-            })
+                "x": seq.astype(np.int64),
+                "t": tone.astype(np.int64),
+                "language": language.astype(np.int64),
+                "bert_0": bert_zh.astype(np.float32),
+                "bert_1": bert_jp.astype(np.float32),
+                "bert_2": bert_en.astype(np.float32),
+                "emo": emo.astype(np.float32),
+                "g": g.astype(np.float32),
+            },
+        )
         x, m_p, logs_p, x_mask = enc_rtn[0], enc_rtn[1], enc_rtn[2], enc_rtn[3]
         np.random.seed(seed)
         zinput = np.random.randn(x.shape[0], 2, x.shape[2]) * sdp_noise_scale
-        logw = self.sdp.run(None, {"x": x, "x_mask": x_mask, "zin": zinput.astype(np.float32), "g": g})[0] * (sdp_ratio) + \
-            self.dp.run(None, {"x": x, "x_mask": x_mask, "g": g})[0] * (1 - sdp_ratio)
+        logw = self.sdp.run(
+            None, {"x": x, "x_mask": x_mask, "zin": zinput.astype(np.float32), "g": g}
+        )[0] * (sdp_ratio) + self.dp.run(None, {"x": x, "x_mask": x_mask, "g": g})[
+            0
+        ] * (
+            1 - sdp_ratio
+        )
         w = np.exp(logw) * x_mask * length_scale
         w_ceil = np.ceil(w)
-        y_lengths = np.clip(np.sum(w_ceil, (1, 2)), a_min=1., a_max=100000).astype(np.int64)
+        y_lengths = np.clip(np.sum(w_ceil, (1, 2)), a_min=1.0, a_max=100000).astype(
+            np.int64
+        )
         y_mask = np.expand_dims(sequence_mask(y_lengths, None), 1)
         attn_mask = np.expand_dims(x_mask, 2) * np.expand_dims(y_mask, -1)
         attn = generate_path(w_ceil, attn_mask)
@@ -84,9 +101,21 @@ def __call__(
         logs_p = np.matmul(attn.squeeze(1), logs_p.transpose(0, 2, 1)).transpose(
             0, 2, 1
         )  # [b, t', t], [b, t, d] -> [b, d, t']
-
-        z_p = m_p + np.random.randn(m_p.shape[0], m_p.shape[1], m_p.shape[2]) * np.exp(logs_p) * seq_noise_scale
 
-        z = self.flow.run(None, {"z_p": z_p.astype(np.float32), "y_mask": y_mask.astype(np.float32), "g": g})[0]
+        z_p = (
+            m_p
+            + np.random.randn(m_p.shape[0], m_p.shape[1], m_p.shape[2])
+            * np.exp(logs_p)
+            * seq_noise_scale
+        )
+
+        z = self.flow.run(
+            None,
+            {
+                "z_p": z_p.astype(np.float32),
+                "y_mask": y_mask.astype(np.float32),
+                "g": g,
+            },
+        )[0]
 
-        return self.dec.run(None, {"z_in": z.astype(np.float32), "g": g})[0]
+        return self.dec.run(None, {"z_in": z.astype(np.float32), "g": g})[0]
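A minimal usage sketch of the class in this diff, with OnnxInferenceSession already in scope. Everything not visible above is an assumption: the path dict is assumed to also need "sdp" and "flow" entries (matching the self.sdp and self.flow calls in __call__), the file paths are hypothetical, and all input shapes are illustrative placeholders that must match the exported ONNX graphs.

    import numpy as np

    # Hypothetical model paths; the "sdp" and "flow" keys are assumed from self.sdp / self.flow.
    session = OnnxInferenceSession(
        {
            "enc": "onnx/enc.onnx",
            "emb_g": "onnx/emb_g.onnx",
            "dp": "onnx/dp.onnx",
            "sdp": "onnx/sdp.onnx",
            "flow": "onnx/flow.onnx",
            "dec": "onnx/dec.onnx",
        },
        Providers=["CPUExecutionProvider"],
    )

    t_x = 32  # illustrative phoneme-sequence length
    seq = np.zeros((1, t_x), dtype=np.int64)       # phoneme ids
    tone = np.zeros((1, t_x), dtype=np.int64)      # tone ids
    language = np.zeros((1, t_x), dtype=np.int64)  # language ids
    # Illustrative feature shapes only; the real shapes come from the exported models.
    bert_zh = np.zeros((1, 1024, t_x), dtype=np.float32)
    bert_jp = np.zeros((1, 1024, t_x), dtype=np.float32)
    bert_en = np.zeros((1, 1024, t_x), dtype=np.float32)
    emo = np.zeros((512, 1), dtype=np.float32)
    sid = np.array([0], dtype=np.int64)            # speaker id

    audio = session(
        seq, tone, language, bert_zh, bert_jp, bert_en, emo, sid,
        sdp_ratio=0.4, length_scale=1.0,
    )

In __call__, sdp_ratio blends the log-durations from the stochastic (sdp) and regular (dp) duration predictors, and seed fixes the noise used for both the duration predictor input and the prior sample, so the same inputs and seed reproduce the same audio.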