
Commit eaefc57

Authored by Stardust-minus, YYuX-1145, pre-commit-ci[bot], jiangyuxiaoxiao, and OedoSoldier
Use CLAP to achieve prompt-controlled generation (#223)

* Quickly classify audio and save the yml-format results to the training root directory (#190)
* Add files via upload
* [pre-commit.ci] auto fixes from pre-commit.com hooks (see https://pre-commit.ci)
* Update models.py
* Update webui.py
* Update infer.py
* Create compress_model.py
* Resubmit: update the Gradio inference UI (#193)
* Update webui.py
* Update webui.py
* Update train_ms.py
* Update models.py
* Update models.py
* Update models.py
* Update train_ms.py
* Update train_ms.py
* Update models.py
* Update preprocess_text.py
* Update config.json
* Update train_ms.py
* Update webui.py (#206)
* Add files via upload (#209)
* Update train_ms.py
* Update train_ms.py
* Update preprocess_text.py
* Update train_ms.py
* fix (#211)
* Update emotion_clustering.py
* Add files via upload
* Update emotion_clustering.py
* Add cluster center save
* Add files via upload
* Update config.py
* Update default_config.yml
* Update config.py
* Update config.py
* Update emotion_clustering.py
* Update emotion_clustering.py
* Update config.py
* Update emotion_clustering.py
* Update emotion_clustering.py
* Update webui.py
* Update emotion_clustering.py
* Update commons.py
* Update emotion_clustering.py
* Update webui.py
* Update webui.py
* Add files via upload
* Update train_ms.py
* Update train_ms.py
* Update train_ms.py
* Update train_ms.py
* Update train_ms.py
* Update webui.py
* Update emotion_clustering.py
* Update emotion_clustering.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks (see https://pre-commit.ci)
* Fix default_config.yml
* Update infer.py
* feat: support infer 2.1 models
* [pre-commit.ci] auto fixes from pre-commit.com hooks (see https://pre-commit.ci)
* fix: support infer 2.1 models (compatibility bug fixes)
* [pre-commit.ci] auto fixes from pre-commit.com hooks (see https://pre-commit.ci)
* Update train_ms.py
* Add CLAP
* Fix data loader
* Fix infer.py
* Fix webui.py
* Add prompt template
* Update clap_gen.py
* Fix wrong environ value
* Add g for dur disc
* Update clap_gen.py
* Fix multilang generation
* Update config.json
* Prompt mode
* Improve slice segments performance
* Add preprocess webui
* Update webui_preprocess.py
* Update webui_preprocess.py
* Update config.py
* Update default_config.yml
* Update config.py
* Update clap_gen.py
* Delete emo_gen.py
* Delete get_emo.py
* Delete emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim directory
* Update README.md
* Update README
* Split val per lang
* Delete emotion_clustering.py
* Update default_config.yml
* Update default_config.yml
* Update config.py
* Update preprocess_text.py
* Update webui_preprocess.py
* Update default_config.yml
* Update webui_preprocess.py
* Update preprocess_text.py
* Random augmentation for CLAP
* Update data_utils.py
* Update preprocess_text.py
* Add vq for CLAP features to avoid overfitting
* Random dummy inputs
* Update webui.py
* Update models.py
* Update infer.py
* Apply Code Formatter Change
* Update config.json
* [pre-commit.ci] auto fixes from pre-commit.com hooks (see https://pre-commit.ci)

Co-authored-by: YYuX-1145 <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Sora <[email protected]>
Co-authored-by: Sihan Wang <[email protected]>
Co-authored-by: Stardust-minus <[email protected]>
1 parent: 9cc786d

49 files changed: +287,404 -3,426 lines
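
Two items in the change list above carry most of the technical weight: precomputing a CLAP embedding per utterance (clap_gen.py, shown below) and "Add vq for CLAP features to avoid overfitting". The quantization itself lives in models.py, which this commit view does not show, so the following is only a hedged sketch of what a VQ bottleneck over CLAP embeddings typically looks like; the class name ClapVQ, the codebook size, and the dimension are illustrative assumptions, not the project's actual code.

    import torch
    import torch.nn as nn


    class ClapVQ(nn.Module):
        """Illustrative VQ bottleneck (assumption; not the code from models.py).

        Snapping the continuous CLAP embedding to a small learned codebook
        limits how much per-utterance detail the model can memorize.
        """

        def __init__(self, dim=512, codebook_size=10):
            super().__init__()
            self.codebook = nn.Embedding(codebook_size, dim)

        def forward(self, emb):  # emb: (batch, dim)
            # Pick the nearest codebook entry by Euclidean distance.
            dists = torch.cdist(emb, self.codebook.weight)
            quantized = self.codebook(dists.argmin(dim=-1))
            # Straight-through estimator: the forward pass uses the quantized
            # value; gradients flow back to the continuous embedding.
            return emb + (quantized - emb).detach()


    vq = ClapVQ()
    print(vq(torch.randn(4, 512)).shape)  # torch.Size([4, 512])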

README.md (+5)

@@ -5,6 +5,11 @@
 # Bert-VITS2

 VITS2 Backbone with multilingual bert
+
+For quick guide, please refer to `webui_preprocess.py`.
+
+简易教程请参见 `webui_preprocess.py`
+
 ## 请注意，本项目核心思路来源于[anyvoiceai/MassTTS](https://github.com/anyvoiceai/MassTTS) 一个非常好的tts项目
 ## MassTTS的演示demo为[ai版峰哥锐评峰哥本人,并找回了在金三角失落的腰子](https://www.bilibili.com/video/BV1w24y1c7z9)

clap_gen.py (new file, +64)

@@ -0,0 +1,64 @@
+import argparse
+from multiprocessing import Pool, cpu_count
+
+import torch
+import torch.multiprocessing as mp
+from tqdm import tqdm
+
+import utils
+from config import config
+from clap_wrapper import get_clap_audio_feature
+import librosa
+import os
+
+os.environ["OMP_NUM_THREADS"] = "1"
+os.environ["MKL_NUM_THREADS"] = "1"
+
+
+def process_line(line):
+    device = config.emo_gen_config.device
+    if config.emo_gen_config.use_multi_device:
+        rank = mp.current_process()._identity
+        rank = rank[0] if len(rank) > 0 else 0
+        if torch.cuda.is_available():
+            gpu_id = rank % torch.cuda.device_count()
+            device = torch.device(f"cuda:{gpu_id}")
+        else:
+            device = torch.device("cpu")
+    wav_path, _, language_str, text, phones, tone, word2ph = line.strip().split("|")
+
+    clap_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".emo.npy")
+    if os.path.isfile(clap_path):
+        return
+
+    audio = librosa.load(wav_path, sr=48000)[0]
+    # audio = librosa.resample(audio, 44100, 48000)
+
+    clap = get_clap_audio_feature(audio, device)
+    torch.save(clap, clap_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-c", "--config", type=str, default=config.emo_gen_config.config_path
+    )
+    parser.add_argument(
+        "--num_processes", type=int, default=config.emo_gen_config.num_processes
+    )
+    args, _ = parser.parse_known_args()
+    config_path = args.config
+    hps = utils.get_hparams_from_file(config_path)
+    lines = []
+    with open(hps.data.training_files, encoding="utf-8") as f:
+        lines.extend(f.readlines())
+
+    with open(hps.data.validation_files, encoding="utf-8") as f:
+        lines.extend(f.readlines())
+    if len(lines) != 0:
+        num_processes = min(args.num_processes, cpu_count())
+        with Pool(processes=num_processes) as pool:
+            for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)):
+                pass
+
+    print(f"CLAP feature generation finished! Processed {len(lines)} lines (.emo.npy files).")
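
The script above walks the training and validation filelists and caches one CLAP audio embedding per wav, written next to the audio with torch.save despite the .npy extension. A minimal sketch of reading such a cache entry back (the path is a hypothetical example; the real consumer is the data loader in data_utils.py):

    import torch

    # Hypothetical path: clap_gen.py writes "<wav stem>.emo.npy" beside each wav.
    emo_path = "dataset/speaker/utt_0001.emo.npy"

    # The file is written with torch.save, so despite the .npy suffix it must
    # be read with torch.load rather than numpy.load.
    clap_feature = torch.load(emo_path, map_location="cpu")
    print(clap_feature.shape)  # transposed embedding, e.g. (512, 1) for clap-htsat-fused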

clap_wrapper.py (new file, +49)

@@ -0,0 +1,49 @@
+import sys
+
+import torch
+from transformers import ClapModel, ClapProcessor
+
+from config import config
+
+models = dict()
+processor = ClapProcessor.from_pretrained("./emotional/clap-htsat-fused")
+
+
+def get_clap_audio_feature(audio_data, device=config.bert_gen_config.device):
+    if (
+        sys.platform == "darwin"
+        and torch.backends.mps.is_available()
+        and device == "cpu"
+    ):
+        device = "mps"
+    if not device:
+        device = "cuda"
+    if device not in models.keys():
+        models[device] = ClapModel.from_pretrained("./emotional/clap-htsat-fused").to(
+            device
+        )
+    with torch.no_grad():
+        inputs = processor(
+            audios=audio_data, return_tensors="pt", sampling_rate=48000
+        ).to(device)
+        emb = models[device].get_audio_features(**inputs)
+    return emb.T
+
+
+def get_clap_text_feature(text, device=config.bert_gen_config.device):
+    if (
+        sys.platform == "darwin"
+        and torch.backends.mps.is_available()
+        and device == "cpu"
+    ):
+        device = "mps"
+    if not device:
+        device = "cuda"
+    if device not in models.keys():
+        models[device] = ClapModel.from_pretrained("./emotional/clap-htsat-fused").to(
+            device
+        )
+    with torch.no_grad():
+        inputs = processor(text=text, return_tensors="pt").to(device)
+        emb = models[device].get_text_features(**inputs)
+    return emb.T
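
The two functions are deliberately symmetric: CLAP maps audio and text into a shared embedding space, which is what makes prompt control possible. The model trains on audio embeddings, and a text-prompt embedding can be substituted at inference. A rough illustration under that reading (the actual wiring lives in infer.py and webui.py, which are not shown here; the file name and prompt string are made up):

    import librosa

    from clap_wrapper import get_clap_audio_feature, get_clap_text_feature

    # Training-style conditioning: embed the reference audio itself.
    audio, _ = librosa.load("reference.wav", sr=48000)  # hypothetical file
    audio_emb = get_clap_audio_feature(audio, "cuda")

    # Prompt mode: a text description in the same embedding space can stand
    # in for the audio embedding at inference time.
    text_emb = get_clap_text_feature("Happy", "cuda")
    assert audio_emb.shape == text_emb.shape  # both (dim, 1) after the final .T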

commons.py (+6 -14)

@@ -46,26 +46,18 @@ def rand_gumbel_like(x):
 
 
 def slice_segments(x, ids_str, segment_size=4):
-    ret = torch.zeros_like(x[:, :, :segment_size])
-    for i in range(x.size(0)):
-        idx_str = ids_str[i]
-        idx_end = idx_str + segment_size
-        if idx_str < 0:
-            i1 = x.size(2) + idx_str
-            r1 = x[i, :, i1:]
-            r2 = x[i, :, :idx_end]
-            ret[i] = torch.cat([r1, r2], dim=1)
-        else:
-            ret[i] = x[i, :, idx_str:idx_end]
-    return ret
+    gather_indices = ids_str.view(x.size(0), 1, 1).repeat(
+        1, x.size(1), 1
+    ) + torch.arange(segment_size, device=x.device)
+    return torch.gather(x, 2, gather_indices)
 
 
 def rand_slice_segments(x, x_lengths=None, segment_size=4):
     b, d, t = x.size()
     if x_lengths is None:
         x_lengths = t
-    ids_str_max = x_lengths - segment_size + 1
-    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
+    ids_str_max = torch.clamp(x_lengths - segment_size + 1, min=0)
+    ids_str = (torch.rand([b], device=x.device) * ids_str_max).to(dtype=torch.long)
     ret = slice_segments(x, ids_str, segment_size)
     return ret, ids_str
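
This rewrite is the "Improve slice segments performance" item from the commit message: the per-sample Python loop becomes one batched torch.gather, the old negative-start wraparound branch is dropped, and the new clamp in rand_slice_segments keeps start indices non-negative so gather never sees an out-of-range index. A small self-check, assuming non-negative ids_str, that the two formulations agree:

    import torch


    def slice_segments_loop(x, ids_str, segment_size=4):
        # Old behavior, restricted to the non-negative-start branch.
        ret = torch.zeros_like(x[:, :, :segment_size])
        for i in range(x.size(0)):
            ret[i] = x[i, :, ids_str[i] : ids_str[i] + segment_size]
        return ret


    def slice_segments_gather(x, ids_str, segment_size=4):
        # New behavior: build (batch, channels, segment_size) indices, gather once.
        gather_indices = ids_str.view(x.size(0), 1, 1).repeat(
            1, x.size(1), 1
        ) + torch.arange(segment_size, device=x.device)
        return torch.gather(x, 2, gather_indices)


    x = torch.randn(8, 192, 100)
    ids_str = torch.randint(0, 100 - 4 + 1, (8,))
    assert torch.equal(slice_segments_loop(x, ids_str), slice_segments_gather(x, ids_str))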

compress_model.py (+1)

@@ -1,6 +1,7 @@
 from collections import OrderedDict
 from text.symbols import symbols
 import torch
+
 from tools.log import logger
 import utils
 from models import SynthesizerTrn
