Skip to content

Commit a8acfa0

Browse files
authored
Merge branch '4.1-Stable' into 4.1-Latest
2 parents 57c079f + 7937294 commit a8acfa0

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

80 files changed

+420
-407
lines changed

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ filelists/test.txt
156156
filelists/train.txt
157157
filelists/val.txt
158158
.idea/
159+
.vscode/
159160
.idea/modules.xml
160161
.idea/so-vits-svc.iml
161162
.idea/vcs.xml
@@ -168,3 +169,4 @@ pretrain/vec-256-layer-9.onnx
168169
pretrain/vec-256-layer-12.onnx
169170
pretrain/vec-768-layer-9.onnx
170171
.vscode/launch.json
172+
.ruff.toml

README.md

+24-24
Large diffs are not rendered by default.

README_zh_CN.md

+4-4
Original file line numberDiff line numberDiff line change
@@ -340,7 +340,7 @@ python inference_main.py -m "logs/44k/G_30400.pth" -c "configs/config.json" -n "
340340
+ `-lg` | `--linear_gradient`:两段音频切片的交叉淡入长度,如果强制切片后出现人声不连贯可调整该数值,如果连贯建议采用默认值0,单位为秒
341341
+ `-f0p` | `--f0_predictor`:选择F0预测器,可选择crepe,pm,dio,harvest,默认为pm(注意:crepe为原F0使用均值滤波器)
342342
+ `-a` | `--auto_predict_f0`:语音转换自动预测音高,转换歌声时不要打开这个会严重跑调
343-
+ `-cm` | `--cluster_model_path`:聚类模型或特征检索索引路径,如果没有训练聚类或特征检索则随便填
343+
+ `-cm` | `--cluster_model_path`:聚类模型或特征检索索引路径,留空则自动设为各方案模型的默认路径,如果没有训练聚类或特征检索则随便填
344344
+ `-cr` | `--cluster_infer_ratio`:聚类方案或特征检索占比,范围0-1,若没有训练聚类模型或特征检索则默认0即可
345345
+ `-eh` | `--enhance`:是否使用NSF_HIFIGAN增强器,该选项对部分训练集少的模型有一定的音质增强效果,但是对训练好的模型有反面效果,默认关闭
346346
+ `-shd` | `--shallow_diffusion`:是否使用浅层扩散,使用后可解决一部分电音问题,默认关闭,该选项打开时,NSF_HIFIGAN增强器将会被禁止
@@ -379,7 +379,7 @@ python inference_main.py -m "logs/44k/G_30400.pth" -c "configs/config.json" -n "
379379
+ 执行`python cluster/train_cluster.py`,模型的输出会在`logs/44k/kmeans_10000.pt`
380380
+ 聚类模型目前可以使用gpu进行训练,执行`python cluster/train_cluster.py --gpu`
381381
+ 推理过程:
382-
+ `inference_main.py`中指定`cluster_model_path`
382+
+ `inference_main.py`中指定`cluster_model_path` 为模型输出文件, 留空则默认为`logs/44k/kmeans_10000.pt`
383383
+ `inference_main.py`中指定`cluster_infer_ratio``0`为完全不使用聚类,`1`为只使用聚类,通常设置`0.5`即可
384384

385385
### 特征检索
@@ -396,8 +396,8 @@ python train_index.py -c configs/config.json
396396
模型的输出会在`logs/44k/feature_and_index.pkl`
397397

398398
+ 推理过程:
399-
+ 需要首先制定`--feature_retrieval`,此时聚类方案会自动切换到特征检索方案
400-
+ `inference_main.py`中指定`cluster_model_path` 为模型输出文件
399+
+ 需要首先指定`--feature_retrieval`,此时聚类方案会自动切换到特征检索方案
400+
+ `inference_main.py`中指定`cluster_model_path` 为模型输出文件, 留空则默认为`logs/44k/feature_and_index.pkl`
401401
+ `inference_main.py`中指定`cluster_infer_ratio``0`为完全不使用特征检索,`1`为只使用特征检索,通常设置`0.5`即可
402402

403403
### [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/svc-develop-team/so-vits-svc/blob/4.1-Stable/sovits4_for_colab.ipynb) [sovits4_for_colab.ipynb](https://colab.research.google.com/github/svc-develop-team/so-vits-svc/blob/4.1-Stable/sovits4_for_colab.ipynb)

cluster/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
import numpy as np
21
import torch
32
from sklearn.cluster import KMeans
43

4+
55
def get_cluster_model(ckpt_path):
66
checkpoint = torch.load(ckpt_path)
77
kmeans_dict = {}

cluster/kmeans.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
1-
import math,pdb
2-
import torch,pynvml
3-
from torch.nn.functional import normalize
41
from time import time
2+
53
import numpy as np
4+
import pynvml
5+
import torch
6+
from torch.nn.functional import normalize
7+
8+
69
# device=torch.device("cuda:0")
710
def _kpp(data: torch.Tensor, k: int, sample_size: int = -1):
811
""" Picks k points in the data based on the kmeans++ method.

cluster/train_cluster.py

+13-15
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,17 @@
1-
import time,pdb
2-
import tqdm
3-
from time import time as ttime
1+
import argparse
2+
import logging
43
import os
4+
import time
55
from pathlib import Path
6-
import logging
7-
import argparse
8-
from kmeans import KMeansGPU
9-
import torch
6+
107
import numpy as np
11-
from sklearn.cluster import KMeans,MiniBatchKMeans
8+
import torch
9+
import tqdm
10+
from kmeans import KMeansGPU
11+
from sklearn.cluster import KMeans, MiniBatchKMeans
1212

1313
logging.basicConfig(level=logging.INFO)
1414
logger = logging.getLogger(__name__)
15-
from time import time as ttime
16-
import pynvml,torch
1715

1816
def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False,use_gpu=False):#gpu_minibatch真拉,虽然库支持但是也不考虑
1917
logger.info(f"Loading features from {in_dir}")
@@ -29,22 +27,22 @@ def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False,use_gpu=
2927
features = features.astype(np.float32)
3028
logger.info(f"Clustering features of shape: {features.shape}")
3129
t = time.time()
32-
if(use_gpu==False):
30+
if(use_gpu is False):
3331
if use_minibatch:
3432
kmeans = MiniBatchKMeans(n_clusters=n_clusters,verbose=verbose, batch_size=4096, max_iter=80).fit(features)
3533
else:
3634
kmeans = KMeans(n_clusters=n_clusters,verbose=verbose).fit(features)
3735
else:
3836
kmeans = KMeansGPU(n_clusters=n_clusters, mode='euclidean', verbose=2 if verbose else 0,max_iter=500,tol=1e-2)#
3937
features=torch.from_numpy(features)#.to(device)
40-
labels = kmeans.fit_predict(features)#
38+
kmeans.fit_predict(features)#
4139

4240
print(time.time()-t, "s")
4341

4442
x = {
45-
"n_features_in_": kmeans.n_features_in_ if use_gpu==False else features.shape[1],
46-
"_n_threads": kmeans._n_threads if use_gpu==False else 4,
47-
"cluster_centers_": kmeans.cluster_centers_ if use_gpu==False else kmeans.centroids.cpu().numpy(),
43+
"n_features_in_": kmeans.n_features_in_ if use_gpu is False else features.shape[1],
44+
"_n_threads": kmeans._n_threads if use_gpu is False else 4,
45+
"cluster_centers_": kmeans.cluster_centers_ if use_gpu is False else kmeans.centroids.cpu().numpy(),
4846
}
4947
print("end")
5048

data_utils.py

+7-8
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
1-
import time
21
import os
32
import random
3+
44
import numpy as np
55
import torch
66
import torch.utils.data
77

8-
import modules.commons as commons
98
import utils
10-
from modules.mel_processing import spectrogram_torch, spec_to_mel_torch, spectrogram_torch
11-
from utils import load_wav_to_torch, load_filepaths_and_text
9+
from modules.mel_processing import spectrogram_torch
10+
from utils import load_filepaths_and_text, load_wav_to_torch
1211

1312
# import h5py
1413

@@ -87,7 +86,7 @@ def get_audio(self, filename):
8786
assert abs(audio_norm.shape[1]-lmin * self.hop_length) < 3 * self.hop_length
8887
spec, c, f0, uv = spec[:, :lmin], c[:, :lmin], f0[:lmin], uv[:lmin]
8988
audio_norm = audio_norm[:, :lmin * self.hop_length]
90-
if volume!= None:
89+
if volume is not None:
9190
volume = volume[:lmin]
9291
return c, f0, spec, audio_norm, spk, uv, volume
9392

@@ -96,7 +95,7 @@ def random_slice(self, c, f0, spec, audio_norm, spk, uv, volume):
9695
# print("skip too short audio:", filename)
9796
# return None
9897

99-
if random.choice([True, False]) and self.vol_aug and volume!=None:
98+
if random.choice([True, False]) and self.vol_aug and volume is not None:
10099
max_amp = float(torch.max(torch.abs(audio_norm))) + 1e-5
101100
max_shift = min(1, np.log10(1/max_amp))
102101
log10_vol_shift = random.uniform(-1, max_shift)
@@ -114,7 +113,7 @@ def random_slice(self, c, f0, spec, audio_norm, spk, uv, volume):
114113
end = start + 790
115114
spec, c, f0, uv = spec[:, start:end], c[:, start:end], f0[start:end], uv[start:end]
116115
audio_norm = audio_norm[:, start * self.hop_length : end * self.hop_length]
117-
if volume !=None:
116+
if volume is not None:
118117
volume = volume[start:end]
119118
return c, f0, spec, audio_norm, spk, uv,volume
120119

@@ -178,7 +177,7 @@ def __call__(self, batch):
178177
uv = row[5]
179178
uv_padded[i, :uv.size(0)] = uv
180179
volume = row[6]
181-
if volume != None:
180+
if volume is not None:
182181
volume_padded[i, :volume.size(0)] = volume
183182
else :
184183
volume_padded = None

diffusion/data_loaders.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
import os
22
import random
3-
import re
4-
import numpy as np
3+
54
import librosa
5+
import numpy as np
66
import torch
7-
import random
8-
from utils import repeat_expand_2d
9-
from tqdm import tqdm
107
from torch.utils.data import Dataset
8+
from tqdm import tqdm
9+
10+
from utils import repeat_expand_2d
11+
1112

1213
def traverse_dir(
1314
root_dir,
@@ -130,7 +131,6 @@ def __init__(
130131
with open(filelists,"r") as f:
131132
self.paths = f.read().splitlines()
132133
for name_ext in tqdm(self.paths, total=len(self.paths)):
133-
name = os.path.splitext(name_ext)[0]
134134
path_audio = name_ext
135135
duration = librosa.get_duration(filename = path_audio, sr = self.sample_rate)
136136

diffusion/diffusion.py

+12-6
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
from collections import deque
22
from functools import partial
33
from inspect import isfunction
4-
import torch.nn.functional as F
5-
import librosa.sequence
4+
65
import numpy as np
76
import torch
7+
import torch.nn.functional as F
88
from torch import nn
99
from tqdm import tqdm
1010

@@ -26,8 +26,10 @@ def extract(a, t, x_shape):
2626

2727

2828
def noise_like(shape, device, repeat=False):
29-
repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
30-
noise = lambda: torch.randn(shape, device=device)
29+
def repeat_noise():
30+
return torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
31+
def noise():
32+
return torch.randn(shape, device=device)
3133
return repeat_noise() if repeat else noise()
3234

3335

@@ -253,7 +255,11 @@ def forward(self,
253255

254256
if method is not None and infer_speedup > 1:
255257
if method == 'dpm-solver' or method == 'dpm-solver++':
256-
from .dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
258+
from .dpm_solver_pytorch import (
259+
DPM_Solver,
260+
NoiseScheduleVP,
261+
model_wrapper,
262+
)
257263
# 1. Define the noise schedule.
258264
noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t])
259265

@@ -331,7 +337,7 @@ def wrapped(x, t, **kwargs):
331337
infer_speedup, cond=cond
332338
)
333339
elif method == 'unipc':
334-
from .uni_pc import NoiseScheduleVP, model_wrapper, UniPC
340+
from .uni_pc import NoiseScheduleVP, UniPC, model_wrapper
335341
# 1. Define the noise schedule.
336342
noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t])
337343

diffusion/diffusion_onnx.py

+13-11
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,14 @@
1+
import math
12
from collections import deque
23
from functools import partial
34
from inspect import isfunction
4-
import torch.nn.functional as F
5-
import librosa.sequence
5+
66
import numpy as np
7-
from torch.nn import Conv1d
8-
from torch.nn import Mish
97
import torch
8+
import torch.nn.functional as F
109
from torch import nn
10+
from torch.nn import Conv1d, Mish
1111
from tqdm import tqdm
12-
import math
1312

1413

1514
def exists(x):
@@ -27,8 +26,10 @@ def extract(a, t):
2726

2827

2928
def noise_like(shape, device, repeat=False):
30-
repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
31-
noise = lambda: torch.randn(shape, device=device)
29+
def repeat_noise():
30+
return torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
31+
def noise():
32+
return torch.randn(shape, device=device)
3233
return repeat_noise() if repeat else noise()
3334

3435

@@ -389,7 +390,11 @@ def org_forward(self,
389390

390391
if method is not None and infer_speedup > 1:
391392
if method == 'dpm-solver':
392-
from .dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
393+
from .dpm_solver_pytorch import (
394+
DPM_Solver,
395+
NoiseScheduleVP,
396+
model_wrapper,
397+
)
393398
# 1. Define the noise schedule.
394399
noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t])
395400

@@ -576,9 +581,6 @@ def forward(self, condition=None, init_noise=None, pndms=None, k_step=None):
576581
plms_noise_stage = torch.tensor(0, dtype=torch.long, device=device)
577582
noise_list = torch.zeros((0, 1, 1, self.mel_bins, n_frames), device=device)
578583

579-
ot = step_range[0]
580-
ot_1 = torch.full((1,), ot, device=device, dtype=torch.long)
581-
582584
for t in step_range:
583585
t_1 = torch.full((1,), t, device=device, dtype=torch.long)
584586
noise_pred = self.denoise_fn(x, t_1, cond)

diffusion/dpm_solver_pytorch.py

+10-8
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
import torch
2-
import torch.nn.functional as F
3-
import math
42

53

64
class NoiseScheduleVP:
@@ -559,7 +557,6 @@ def dpm_solver_first_update(self, x, s, t, model_s=None, return_intermediate=Fal
559557
x_t: A pytorch tensor. The approximated solution at time `t`.
560558
"""
561559
ns = self.noise_schedule
562-
dims = x.dim()
563560
lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
564561
h = lambda_t - lambda_s
565562
log_alpha_s, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(t)
@@ -984,20 +981,25 @@ def dpm_solver_adaptive(self, x, order, t_T, t_0, h_init=0.05, atol=0.0078, rtol
984981
nfe = 0
985982
if order == 2:
986983
r1 = 0.5
987-
lower_update = lambda x, s, t: self.dpm_solver_first_update(x, s, t, return_intermediate=True)
988-
higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, solver_type=solver_type, **kwargs)
984+
def lower_update(x, s, t):
985+
return self.dpm_solver_first_update(x, s, t, return_intermediate=True)
986+
def higher_update(x, s, t, **kwargs):
987+
return self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, solver_type=solver_type, **kwargs)
989988
elif order == 3:
990989
r1, r2 = 1. / 3., 2. / 3.
991-
lower_update = lambda x, s, t: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, return_intermediate=True, solver_type=solver_type)
992-
higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs)
990+
def lower_update(x, s, t):
991+
return self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, return_intermediate=True, solver_type=solver_type)
992+
def higher_update(x, s, t, **kwargs):
993+
return self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs)
993994
else:
994995
raise ValueError("For adaptive step size solver, order must be 2 or 3, got {}".format(order))
995996
while torch.abs((s - t_0)).mean() > t_err:
996997
t = ns.inverse_lambda(lambda_s + h)
997998
x_lower, lower_noise_kwargs = lower_update(x, s, t)
998999
x_higher = higher_update(x, s, t, **lower_noise_kwargs)
9991000
delta = torch.max(torch.ones_like(x).to(x) * atol, rtol * torch.max(torch.abs(x_lower), torch.abs(x_prev)))
1000-
norm_fn = lambda v: torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True))
1001+
def norm_fn(v):
1002+
return torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True))
10011003
E = norm_fn((x_higher - x_lower) / delta).max()
10021004
if torch.all(E <= 1.):
10031005
x = x_higher

diffusion/infer_gt_mel.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
import numpy as np
21
import torch
32
import torch.nn.functional as F
3+
44
from diffusion.unit2mel import load_model_vocoder
55

66

diffusion/logger/saver.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,16 @@
22
author: wayn391@mastertones
33
'''
44

5+
import datetime
56
import os
6-
import json
77
import time
8-
import yaml
9-
import datetime
10-
import torch
8+
119
import matplotlib.pyplot as plt
12-
from . import utils
10+
import torch
11+
import yaml
1312
from torch.utils.tensorboard import SummaryWriter
1413

14+
1515
class Saver(object):
1616
def __init__(
1717
self,

0 commit comments

Comments
 (0)