ZDisket · ZDisket · Feb 28, 2023 · Feb 28, 2023 · Feb 28, 2023 · Feb 28, 2023
diff --git a/TensorVox.pro b/TensorVox.pro
@@ -25,12 +25,16 @@ SOURCES += \
     VoxCommon.cpp \
     attention.cpp \
     batchdenoisedlg.cpp \
+    bert.cpp \
+    berttokenizer.cpp \
+    devits.cpp \
     espeakphonemizer.cpp \
     ext/ByteArr.cpp \
     ext/Qt-Frameless-Window-DarkStyle-master/DarkStyle.cpp \
     ext/Qt-Frameless-Window-DarkStyle-master/framelesswindow/framelesswindow.cpp \
     ext/Qt-Frameless-Window-DarkStyle-master/framelesswindow/windowdragger.cpp \
     ext/ZCharScanner.cpp \
+    ext/ZCharScannerWide.cpp \
     ext/ZFile.cpp \
     ext/qcustomplot.cpp \
     istftnettorch.cpp \
@@ -61,6 +65,9 @@ HEADERS += \
     VoxCommon.hpp \
     attention.h \
     batchdenoisedlg.h \
+    bert.h \
+    berttokenizer.h \
+    devits.h \
     espeakphonemizer.h \
     ext/AudioFile.hpp \
     ext/ByteArr.h \
@@ -76,6 +83,7 @@ HEADERS += \
     ext/Qt-Frameless-Window-DarkStyle-master/framelesswindow/framelesswindow.h \
     ext/Qt-Frameless-Window-DarkStyle-master/framelesswindow/windowdragger.h \
     ext/ZCharScanner.h \
+    ext/ZCharScannerWide.h \
     ext/ZFile.h \
     ext/json.hpp \
     ext/qcustomplot.h \
@@ -115,7 +123,7 @@ DEFINES += _CRT_SECURE_NO_WARNINGS
 INCLUDEPATH += $$PWD/deps/include
 INCLUDEPATH += $$PWD/deps/include/libtorch
 INCLUDEPATH += $$PWD/ext/Qt-Frameless-Window-DarkStyle-master/framelesswindow
-win32: LIBS += -L$$PWD/deps/lib/ tensorflow.lib r8bsrc64.lib rnnoise64.lib LogitechLEDLib.lib LibNumberText64.lib c10.lib torch.lib torch_cpu.lib libespeak-ng.lib
+win32: LIBS += -L$$PWD/deps/lib/ tensorflow.lib r8bsrc64.lib rnnoise64.lib LogitechLEDLib.lib LibNumberText64.lib c10.lib torch.lib torch_cpu.lib libespeak-ng.lib Utf8Proc.lib
 win32: LIBS += Advapi32.lib User32.lib Psapi.lib
 
 

diff --git a/Voice.cpp b/Voice.cpp
@@ -120,14 +120,16 @@ Voice::Voice(const std::string & VoxPath, const std::string &inName, Phonemizer
 
     const int32_t Tex2MelArch = VoxInfo.Architecture.Text2Mel;
 
-    const bool IsVITS = Tex2MelArch == EText2MelModel::VITS || Tex2MelArch == EText2MelModel::VITSTM;
+    const bool IsVITS = Tex2MelArch == EText2MelModel::VITS || Tex2MelArch == EText2MelModel::DEVITS;
 
     if (Tex2MelArch == EText2MelModel::Tacotron2)
         MelPredictor = std::make_unique<Tacotron2>();
     else if (Tex2MelArch == EText2MelModel::FastSpeech2)
         MelPredictor = std::make_unique<FastSpeech2>();
-    else if (Tex2MelArch == EText2MelModel::VITS || Tex2MelArch == EText2MelModel::VITSTM)
+    else if (Tex2MelArch == EText2MelModel::VITS)
         MelPredictor = std::make_unique<VITS>();
+    else if (Tex2MelArch == EText2MelModel::DEVITS)
+         MelPredictor = std::make_unique<DEVITS>();
     else
         MelPredictor = std::make_unique<Tacotron2Torch>();
 
@@ -144,8 +146,12 @@ Voice::Voice(const std::string & VoxPath, const std::string &inName, Phonemizer
 
 
 
-    if (Tex2MelArch == EText2MelModel::VITSTM)
+    if (Tex2MelArch == EText2MelModel::DEVITS){
         Moji.Initialize(VoxPath + "/moji.pt", VoxPath + "/tm_dict.txt");
+        BertFE.Initialize(VoxPath + "/bert.pt", VoxPath + "/bert_vocab.txt");
+
+    }
+
 
 
     const int32_t VocoderArch = VoxInfo.Architecture.Vocoder;
@@ -286,20 +292,36 @@ VoxResults Voice::Vocalize(const std::string & Prompt, float Speed, int32_t Spea
 
         Mel = ((FastSpeech2*)MelPredictor.get())->DoInference(InputIDs,FloatArgs,IntArgs,SpeakerID, EmotionID);
 
-    }else
+    }else if (Text2MelN == EText2MelModel::VITS)
     {
         FloatArgs = {Speed};
 
-        if (EmotionOvr.size()){
-            std::vector<std::string> MojiInput = Processor.GetTokenizer().Tokenize(EmotionOvr,true,true);
 
-            std::vector<float> MojiStates = Moji.Infer(MojiInput);
+        TFTensor<float> Audio = MelPredictor.get()->DoInference(InputIDs,FloatArgs,IntArgs,SpeakerID,EmotionID);
+        Attention = ((VITS*)MelPredictor.get())->Attention;
 
-            FloatArgs.insert(FloatArgs.end(),MojiStates.begin(),MojiStates.end());
+        std::vector<float> AudioData = Audio.Data;
 
-        }
+        Mel.Shape.push_back(-1); // Tell the plotter that we have no mel to plot
 
-        TFTensor<float> Audio = MelPredictor.get()->DoInference(InputIDs,FloatArgs,IntArgs,SpeakerID,EmotionID);
+        // As VITS is fully E2E, we return here
+
+        return {AudioData,Attention,Mel};
+
+    }else // DE-VITS
+    {
+        FloatArgs = {Speed};
+        std::vector<std::string> MojiInput = Processor.GetTokenizer().Tokenize(EmotionOvr,true,true);
+        TFTensor<float> MojiStates = Moji.Infer(MojiInput);
+
+        auto BERTOutputs = BertFE.Infer(Prompt);
+
+
+
+
+        TFTensor<float> Audio = ((DEVITS*)MelPredictor.get())->DoInferenceDE(InputIDs, MojiStates,
+                                                                             BERTOutputs.first,FloatArgs,
+                                                                             IntArgs,SpeakerID,EmotionID);
         Attention = ((VITS*)MelPredictor.get())->Attention;
 
         std::vector<float> AudioData = Audio.Data;
@@ -309,7 +331,6 @@ VoxResults Voice::Vocalize(const std::string & Prompt, float Speed, int32_t Spea
         // As VITS is fully E2E, we return here
 
         return {AudioData,Attention,Mel};
-
     }
 
     // Vocoder inference

diff --git a/Voice.h b/Voice.h
@@ -10,6 +10,9 @@
 #include "phoneticdict.h"
 #include "tacotron2torch.h"
 #include "istftnettorch.h"
+#include "devits.h"
+#include "bert.h"
+
 struct VoxResults{
   std::vector<float> Audio;
   TFTensor<float> Alignment;
@@ -24,6 +27,7 @@ class Voice
 	EnglishPhoneticProcessor Processor;
     VoiceInfo VoxInfo;
     TorchMoji Moji;
+    BERT BertFE;
 
 
 

diff --git a/VoxCommon.cpp b/VoxCommon.cpp
@@ -4,7 +4,7 @@ using namespace nlohmann;
 #include <codecvt>
 #include <locale>         // std::wstring_convert
 
-const std::vector<std::string> Text2MelNames = {"FastSpeech2","Tacotron2 (TF)","VITS","VITS + TorchMoji","Tacotron2 (Torch)"};
+const std::vector<std::string> Text2MelNames = {"FastSpeech2","Tacotron2 (TF)","VITS","DE-VITS","Tacotron2 (Torch)"};
 const std::vector<std::string> VocoderNames = {"Multi-Band MelGAN","MelGAN-STFT","","iSTFTNet"};
 const std::vector<std::string> RepoNames = {"TensorflowTTS","Coqui-TTS","jaywalnut310","keonlee9420"};
 

diff --git a/VoxCommon.hpp b/VoxCommon.hpp
@@ -63,7 +63,7 @@ enum Enum{
     FastSpeech2 = 0,
     Tacotron2,
     VITS,
-    VITSTM,
+    DEVITS,
     Tacotron2Torch
 };
 
@@ -167,7 +167,7 @@ namespace VoxUtil {
     // Copy PyTorch tensor
 
     template<typename D>
-    TFTensor<D> CopyTensor(at::Tensor& InTens){
+    TFTensor<D> CopyTensor(const at::Tensor& InTens){
         D* Data = InTens.data<D>();
         std::vector<int64_t> Shape = InTens.sizes().vec();
 

diff --git a/bert.cpp b/bert.cpp
@@ -0,0 +1,67 @@
+#include "bert.h"
+#include <windows.h>
+
+// Default constructor. Leaves the model default-constructed and the
+// tokenizer unset (null); Initialize() has to be called before Infer(),
+// which dereferences the tokenizer.
+BERT::BERT() = default;
+
+// Convenience constructor: forwards the model path and the vocabulary path
+// straight to Initialize(), so no separate Initialize() call is needed.
+BERT::BERT(const std::string &Path, const std::string &DictPath)
+{ this->Initialize(Path, DictPath); }
+
+// Loads the TorchScript module at Path, then builds the FullTokenizer from DictPath.
+void BERT::Initialize(const std::string &Path, const std::string &DictPath)
+{
+    Model = torch::jit::load(Path);
+    auto Loaded = std::make_unique<FullTokenizer>(DictPath, true); // 'true': presumably lower-casing - confirm in berttokenizer.h
+    Tokenizer = std::move(Loaded);
+}
+
+// Runs the TorchScript BERT model on InText.
+//
+// The text (with a trailing newline appended) is tokenized, mapped to vocabulary
+// ids and fed to the model as a single batch of shape (1, tokens) built from a
+// std::vector<int32_t>.
+//
+// Returns {hidden states, pooled output} copied out of the model's output tuple.
+// If the forward pass or the copy throws a std::exception, the message is shown
+// in a Win32 message box and whatever has been filled in so far is returned
+// (default-constructed tensors when nothing was). Exceptions thrown by the
+// tokenizer are not caught here.
+//
+// Precondition: Initialize() has been called; Tokenizer is dereferenced unchecked.
+std::pair<TFTensor<float>, TFTensor<float> > BERT::Infer(const std::string &InText)
+{
+    // Pure inference: keep autograd from recording the forward pass.
+    torch::NoGradGuard no_grad;
+
+    auto Tokens = Tokenizer->tokenize(InText + "\n");
+    auto Ids = Tokenizer->convertTokensToIds(Tokens);
+
+    std::vector<int32_t> InTokens(Ids.begin(),Ids.end());
+
+    auto InIDS = torch::tensor(InTokens).unsqueeze(0); // (1, tokens)
+    std::pair<TFTensor<float>,TFTensor<float>> BERTOutputs;
+
+    try{
+        auto Output = Model({InIDS}).toTuple(); // (hidden states, pooled)
+        BERTOutputs.first = VoxUtil::CopyTensor<float>(Output.get()->elements()[0].toTensor());
+        BERTOutputs.second = VoxUtil::CopyTensor<float>(Output.get()->elements()[1].toTensor());
+    }
+    catch (const std::exception& e) {
+        // Call the wide-character entry point explicitly instead of casting the
+        // arguments of the MessageBox macro, so the call stays correct whether or
+        // not UNICODE is defined. The named wstring outlives the call.
+        const std::wstring Message = QString::fromStdString(e.what()).toStdWString();
+        MessageBoxW(
+                    nullptr,
+                    Message.c_str(),
+                    L"Error1!!",
+                    MB_ICONWARNING | MB_CANCELTRYCONTINUE | MB_DEFBUTTON2
+                    );
+    }
+
+    return BERTOutputs;
+}
diff --git a/bert.h b/bert.h
@@ -0,0 +1,28 @@
+#ifndef BERT_H
+#define BERT_H
+
+#include "VoxCommon.hpp"
+
+#include "berttokenizer.h"
+
+// BERT: owns a TorchScript-exported BERT module plus the tokenizer feeding it.
+class BERT
+{
+    torch::jit::script::Module Model;         // scripted network invoked by Infer()
+    std::unique_ptr<FullTokenizer> Tokenizer; // set by Initialize(); null until then
+
+public:
+    // Creates an empty instance; Initialize() must run before Infer().
+    BERT();
+    // Creates an instance and initializes it from the two paths right away.
+    BERT(const std::string& Path, const std::string& DictPath);
+    // Path: TorchScript model file; DictPath: tokenizer vocabulary file.
+    void Initialize(const std::string& Path, const std::string& DictPath);
+
+    // Runs the model on InText and returns a pair of tensors:
+    //   first  - hidden states,     [1, tokens, channels]
+    //   second - pooled embeddings, [1, channels]
+    std::pair<TFTensor<float>, TFTensor<float>> Infer(const std::string& InText);
+};
+
+#endif // BERT_H