|
| 1 | +import copy |
| 2 | +import math |
| 3 | +import numpy as np |
| 4 | +import torch |
| 5 | +from torch import nn |
| 6 | +from torch.nn import functional as F |
| 7 | + |
| 8 | +import modules.commons as commons |
| 9 | +import modules.modules as modules |
| 10 | +from modules.modules import LayerNorm |
| 11 | + |
| 12 | + |
class FFT(nn.Module):
    """Stack of self-attention + feed-forward layers with post-layer-norm residuals.

    Each of the ``n_layers`` layers computes
    ``x = norm(x + drop(self_attn(x)))`` followed by
    ``x = norm(x + drop(ffn(x)))``. The FFN sub-layers are built with
    ``causal=True``.
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers=1, kernel_size=1, p_dropout=0.,
                 proximal_bias=False, proximal_init=True, **kwargs):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        # Sub-modules are created layer by layer (attention, norm, FFN, norm)
        # so parameter initialisation consumes the RNG in a fixed order.
        for _ in range(self.n_layers):
            attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                proximal_bias=proximal_bias,
                proximal_init=proximal_init,
            )
            self.self_attn_layers.append(attention)
            self.norm_layers_0.append(LayerNorm(hidden_channels))

            feed_forward = FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
                causal=True,
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_1.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        """
        x: input sequence
        x_mask: mask multiplied onto x; its size along dim 2 is the sequence length
        """
        seq_len = x_mask.size(2)
        # NOTE(review): subsequent_mask presumably yields a lower-triangular
        # (causal) mask -- confirm against modules.commons.
        self_attn_mask = commons.subsequent_mask(seq_len).to(device=x.device, dtype=x.dtype)

        x = x * x_mask
        for layer_idx in range(self.n_layers):
            attn_out = self.self_attn_layers[layer_idx](x, x, self_attn_mask)
            x = self.norm_layers_0[layer_idx](x + self.drop(attn_out))

            ffn_out = self.ffn_layers[layer_idx](x, x_mask)
            x = self.norm_layers_1[layer_idx](x + self.drop(ffn_out))
        return x * x_mask
| 57 | + |
| 58 | + |
class Encoder(nn.Module):
    """Stack of windowed self-attention + feed-forward layers.

    Every layer applies ``x = norm(x + drop(attn(x)))`` and then
    ``x = norm(x + drop(ffn(x)))``. The attention layers receive
    ``window_size`` and therefore use relative position embeddings.
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0.,
                 window_size=4, **kwargs):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        # Build one (attention, norm, FFN, norm) group per layer, in that order.
        for _ in range(self.n_layers):
            attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                window_size=window_size,
            )
            self.attn_layers.append(attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))

            feed_forward = FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        """
        x: input sequence
        x_mask: mask multiplied onto x and used to build the attention mask
        """
        # Outer product of the mask with itself: a query/key pair is kept
        # only when both positions are unmasked.
        key_side = x_mask.unsqueeze(2)
        query_side = x_mask.unsqueeze(-1)
        attn_mask = key_side * query_side

        x = x * x_mask
        for layer_idx in range(self.n_layers):
            attn_out = self.attn_layers[layer_idx](x, x, attn_mask)
            x = self.norm_layers_1[layer_idx](x + self.drop(attn_out))

            ffn_out = self.ffn_layers[layer_idx](x, x_mask)
            x = self.norm_layers_2[layer_idx](x + self.drop(ffn_out))
        return x * x_mask
| 94 | + |
| 95 | + |
class Decoder(nn.Module):
    """Stack of self-attention, encoder-decoder attention and causal FFN layers.

    Each layer runs three residual sub-layers, every one followed by dropout
    and a layer norm: self-attention over ``x``, attention from ``x`` to the
    encoder output ``h``, and a feed-forward network built with
    ``causal=True``.
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0.,
                 proximal_bias=False, proximal_init=True, **kwargs):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.encdec_attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        # Per layer: self-attention, norm, cross-attention, norm, FFN, norm.
        for _ in range(self.n_layers):
            self_attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                proximal_bias=proximal_bias,
                proximal_init=proximal_init,
            )
            self.self_attn_layers.append(self_attention)
            self.norm_layers_0.append(LayerNorm(hidden_channels))

            cross_attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
            )
            self.encdec_attn_layers.append(cross_attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))

            feed_forward = FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
                causal=True,
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, h, h_mask):
        """
        x: decoder input
        x_mask: mask for x
        h: encoder output
        h_mask: mask for h
        """
        target_len = x_mask.size(2)
        # NOTE(review): subsequent_mask presumably yields a lower-triangular
        # (causal) mask -- confirm against modules.commons.
        self_attn_mask = commons.subsequent_mask(target_len).to(device=x.device, dtype=x.dtype)
        # A (decoder position, encoder position) pair is kept only when both
        # positions are unmasked.
        encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)

        x = x * x_mask
        for layer_idx in range(self.n_layers):
            self_out = self.self_attn_layers[layer_idx](x, x, self_attn_mask)
            x = self.norm_layers_0[layer_idx](x + self.drop(self_out))

            cross_out = self.encdec_attn_layers[layer_idx](x, h, encdec_attn_mask)
            x = self.norm_layers_1[layer_idx](x + self.drop(cross_out))

            ffn_out = self.ffn_layers[layer_idx](x, x_mask)
            x = self.norm_layers_2[layer_idx](x + self.drop(ffn_out))
        return x * x_mask
| 145 | + |
| 146 | + |
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention on ``[b, d, t]`` tensors.

    Queries, keys, values and the output are projected with 1x1 ``Conv1d``
    layers. Optional extras, all restricted to self-attention (equal query
    and key lengths):

    * ``window_size``: learned relative-position embeddings for keys and
      values covering offsets ``-window_size .. +window_size``.
    * ``proximal_bias``: adds ``-log(1 + |i - j|)`` to the logits.
    * ``block_length``: masks logits further than ``block_length`` from the
      diagonal.

    With ``proximal_init`` the key projection starts as a copy of the query
    projection. The attention weights of the latest call are kept in
    ``self.attn``.
    """

    def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True,
                 block_length=None, proximal_bias=False, proximal_init=False):
        super().__init__()
        assert channels % n_heads == 0, "channels must be divisible by n_heads"

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        # Attention probabilities from the most recent forward pass.
        self.attn = None

        # Per-head channel count.
        self.k_channels = channels // n_heads
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            # One embedding table shared by all heads, or one table per head.
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels**-0.5
            self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
            self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)

        nn.init.xavier_uniform_(self.conv_q.weight)
        nn.init.xavier_uniform_(self.conv_k.weight)
        nn.init.xavier_uniform_(self.conv_v.weight)
        if proximal_init:
            # Start with identical query and key projections.
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        """Attend from ``x`` (queries) to ``c`` (keys and values).

        x: [b, channels, t_t]
        c: [b, channels, t_s]
        attn_mask: optional mask; positions where it equals 0 are suppressed
        ret: [b, out_channels, t_t]
        """
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)

        x, self.attn = self.attention(q, k, v, mask=attn_mask)

        x = self.conv_o(x)
        return x

    def attention(self, query, key, value, mask=None):
        """Compute attention on projected tensors.

        query: [b, d, t_t]
        key, value: [b, d, t_s]
        ret: (output [b, d, t_t], attention weights [b, n_h, t_t, t_s])
        """
        # reshape [b, d, t] -> [b, n_h, t, d_k]
        b, d, t_s = key.size()
        t_t = query.size(2)
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        # Scale once; the same scaled query feeds the content and the
        # relative-position logits.
        scaled_query = query / math.sqrt(self.k_channels)
        scores = torch.matmul(scaled_query, key.transpose(-2, -1))
        if self.window_size is not None:
            assert t_s == t_t, "Relative attention is only available for self-attention."
            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(scaled_query, key_relative_embeddings)
            scores_local = self._relative_position_to_absolute_position(rel_logits)
            scores = scores + scores_local
        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            # Build the bias directly on the scores' device to avoid a
            # host-to-device copy on every call.
            proximal = self._attention_bias_proximal(t_s, device=scores.device)
            scores = scores + proximal.to(dtype=scores.dtype)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert t_s == t_t, "Local attention is only available for self-attention."
                # Keep only a band of width block_length around the diagonal.
                block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
                scores = scores.masked_fill(block_mask == 0, -1e4)
        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)
        if self.window_size is not None:
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
            output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
        output = output.transpose(2, 3).contiguous().view(b, d, t_t)  # [b, n_h, t_t, d_k] -> [b, d, t_t]
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        ret = torch.matmul(x, y.unsqueeze(0))
        return ret

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
        return ret

    def _get_relative_embeddings(self, relative_embeddings, length):
        """Return the ``2 * length - 1`` relative embeddings for a sequence.

        The table holds ``2 * window_size + 1`` entries. For sequences longer
        than ``window_size + 1`` it is padded on both sides; for shorter ones
        the central part is sliced out.
        """
        # Pad first before slice to avoid using cond ops.
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        if pad_length > 0:
            padded_relative_embeddings = F.pad(
                relative_embeddings,
                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
        else:
            padded_relative_embeddings = relative_embeddings
        used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
        return used_relative_embeddings

    def _relative_position_to_absolute_position(self, x):
        """
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()
        # Concat columns of pad to shift from relative to absolute indexing.
        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))

        # Concat extra elements so to add up to shape (len+1, 2*len-1).
        x_flat = x.view([batch, heads, length * 2 * length])
        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))

        # Reshape and slice out the padded elements.
        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:]
        return x_final

    def _absolute_position_to_relative_position(self, x):
        """
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        """
        batch, heads, length, _ = x.size()
        # Pad along the column dimension.
        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
        x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
        # Add 0's in the beginning that will skew the elements after reshape.
        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
        return x_final

    def _attention_bias_proximal(self, length, device=None):
        """Bias for self-attention to encourage attention to close positions.
        Args:
          length: an integer scalar.
          device: optional device on which to build the bias (default: the
            current default device, as before).
        Returns:
          a float32 Tensor with shape [1, 1, length, length]
        """
        r = torch.arange(length, dtype=torch.float32, device=device)
        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
| 301 | + |
| 302 | + |
class FFN(nn.Module):
    """Two-layer convolutional feed-forward network with masking.

    The input is masked, padded and passed through ``conv_1``, an activation
    (a sigmoid-based GELU approximation when ``activation == "gelu"``,
    otherwise ReLU), dropout, then masked and padded again before ``conv_2``.
    ``causal`` selects which padding helper is applied ahead of each
    convolution.
    """

    def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None,
                 causal=False):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        # Padding function used in front of both convolutions.
        self.padding = self._causal_padding if causal else self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        """Apply the feed-forward network to ``x``, masking with ``x_mask``."""
        hidden = self.conv_1(self.padding(x * x_mask))
        if self.activation == "gelu":
            # x * sigmoid(1.702 * x)
            hidden = hidden * torch.sigmoid(1.702 * hidden)
        else:
            hidden = torch.relu(hidden)
        hidden = self.drop(hidden)
        out = self.conv_2(self.padding(hidden * x_mask))
        return out * x_mask

    def _causal_padding(self, x):
        """Pad ``kernel_size - 1`` on the left only; no-op for kernel size 1."""
        if self.kernel_size == 1:
            return x
        return self._pad(x, self.kernel_size - 1, 0)

    def _same_padding(self, x):
        """Pad ``kernel_size - 1`` in total, split left/right; no-op for kernel size 1."""
        if self.kernel_size == 1:
            return x
        return self._pad(x, (self.kernel_size - 1) // 2, self.kernel_size // 2)

    def _pad(self, x, pad_l, pad_r):
        """Pad the third dimension of ``x`` by ``pad_l`` / ``pad_r``."""
        # NOTE(review): convert_pad_shape presumably turns the per-dimension
        # list into F.pad's flat format -- confirm against modules.commons.
        pad_spec = [[0, 0], [0, 0], [pad_l, pad_r]]
        return F.pad(x, commons.convert_pad_shape(pad_spec))
0 commit comments