Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion TensorVox.pro
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,16 @@ SOURCES += \
VoxCommon.cpp \
attention.cpp \
batchdenoisedlg.cpp \
bert.cpp \
berttokenizer.cpp \
devits.cpp \
espeakphonemizer.cpp \
ext/ByteArr.cpp \
ext/Qt-Frameless-Window-DarkStyle-master/DarkStyle.cpp \
ext/Qt-Frameless-Window-DarkStyle-master/framelesswindow/framelesswindow.cpp \
ext/Qt-Frameless-Window-DarkStyle-master/framelesswindow/windowdragger.cpp \
ext/ZCharScanner.cpp \
ext/ZCharScannerWide.cpp \
ext/ZFile.cpp \
ext/qcustomplot.cpp \
istftnettorch.cpp \
Expand Down Expand Up @@ -61,6 +65,9 @@ HEADERS += \
VoxCommon.hpp \
attention.h \
batchdenoisedlg.h \
bert.h \
berttokenizer.h \
devits.h \
espeakphonemizer.h \
ext/AudioFile.hpp \
ext/ByteArr.h \
Expand All @@ -76,6 +83,7 @@ HEADERS += \
ext/Qt-Frameless-Window-DarkStyle-master/framelesswindow/framelesswindow.h \
ext/Qt-Frameless-Window-DarkStyle-master/framelesswindow/windowdragger.h \
ext/ZCharScanner.h \
ext/ZCharScannerWide.h \
ext/ZFile.h \
ext/json.hpp \
ext/qcustomplot.h \
Expand Down Expand Up @@ -115,7 +123,7 @@ DEFINES += _CRT_SECURE_NO_WARNINGS
INCLUDEPATH += $$PWD/deps/include
INCLUDEPATH += $$PWD/deps/include/libtorch
INCLUDEPATH += $$PWD/ext/Qt-Frameless-Window-DarkStyle-master/framelesswindow
win32: LIBS += -L$$PWD/deps/lib/ tensorflow.lib r8bsrc64.lib rnnoise64.lib LogitechLEDLib.lib LibNumberText64.lib c10.lib torch.lib torch_cpu.lib libespeak-ng.lib
win32: LIBS += -L$$PWD/deps/lib/ tensorflow.lib r8bsrc64.lib rnnoise64.lib LogitechLEDLib.lib LibNumberText64.lib c10.lib torch.lib torch_cpu.lib libespeak-ng.lib Utf8Proc.lib
win32: LIBS += Advapi32.lib User32.lib Psapi.lib


Expand Down
43 changes: 32 additions & 11 deletions Voice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,14 +120,16 @@ Voice::Voice(const std::string & VoxPath, const std::string &inName, Phonemizer

const int32_t Tex2MelArch = VoxInfo.Architecture.Text2Mel;

const bool IsVITS = Tex2MelArch == EText2MelModel::VITS || Tex2MelArch == EText2MelModel::VITSTM;
const bool IsVITS = Tex2MelArch == EText2MelModel::VITS || Tex2MelArch == EText2MelModel::DEVITS;

if (Tex2MelArch == EText2MelModel::Tacotron2)
MelPredictor = std::make_unique<Tacotron2>();
else if (Tex2MelArch == EText2MelModel::FastSpeech2)
MelPredictor = std::make_unique<FastSpeech2>();
else if (Tex2MelArch == EText2MelModel::VITS || Tex2MelArch == EText2MelModel::VITSTM)
else if (Tex2MelArch == EText2MelModel::VITS)
MelPredictor = std::make_unique<VITS>();
else if (Tex2MelArch == EText2MelModel::DEVITS)
MelPredictor = std::make_unique<DEVITS>();
else
MelPredictor = std::make_unique<Tacotron2Torch>();

Expand All @@ -144,8 +146,12 @@ Voice::Voice(const std::string & VoxPath, const std::string &inName, Phonemizer



if (Tex2MelArch == EText2MelModel::VITSTM)
if (Tex2MelArch == EText2MelModel::DEVITS){
Moji.Initialize(VoxPath + "/moji.pt", VoxPath + "/tm_dict.txt");
BertFE.Initialize(VoxPath + "/bert.pt", VoxPath + "/bert_vocab.txt");

}



const int32_t VocoderArch = VoxInfo.Architecture.Vocoder;
Expand Down Expand Up @@ -286,20 +292,36 @@ VoxResults Voice::Vocalize(const std::string & Prompt, float Speed, int32_t Spea

Mel = ((FastSpeech2*)MelPredictor.get())->DoInference(InputIDs,FloatArgs,IntArgs,SpeakerID, EmotionID);

}else
}else if (Text2MelN == EText2MelModel::VITS)
{
FloatArgs = {Speed};

if (EmotionOvr.size()){
std::vector<std::string> MojiInput = Processor.GetTokenizer().Tokenize(EmotionOvr,true,true);

std::vector<float> MojiStates = Moji.Infer(MojiInput);
TFTensor<float> Audio = MelPredictor.get()->DoInference(InputIDs,FloatArgs,IntArgs,SpeakerID,EmotionID);
Attention = ((VITS*)MelPredictor.get())->Attention;

FloatArgs.insert(FloatArgs.end(),MojiStates.begin(),MojiStates.end());
std::vector<float> AudioData = Audio.Data;

}
Mel.Shape.push_back(-1); // Tell the plotter that we have no mel to plot

TFTensor<float> Audio = MelPredictor.get()->DoInference(InputIDs,FloatArgs,IntArgs,SpeakerID,EmotionID);
// As VITS is fully E2E, we return here

return {AudioData,Attention,Mel};

}else // DE-VITS
{
FloatArgs = {Speed};
std::vector<std::string> MojiInput = Processor.GetTokenizer().Tokenize(EmotionOvr,true,true);
TFTensor<float> MojiStates = Moji.Infer(MojiInput);

auto BERTOutputs = BertFE.Infer(Prompt);




TFTensor<float> Audio = ((DEVITS*)MelPredictor.get())->DoInferenceDE(InputIDs, MojiStates,
BERTOutputs.first,FloatArgs,
IntArgs,SpeakerID,EmotionID);
Attention = ((VITS*)MelPredictor.get())->Attention;

std::vector<float> AudioData = Audio.Data;
Expand All @@ -309,7 +331,6 @@ VoxResults Voice::Vocalize(const std::string & Prompt, float Speed, int32_t Spea
// As VITS is fully E2E, we return here

return {AudioData,Attention,Mel};

}

// Vocoder inference
Expand Down
4 changes: 4 additions & 0 deletions Voice.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
#include "phoneticdict.h"
#include "tacotron2torch.h"
#include "istftnettorch.h"
#include "devits.h"
#include "bert.h"

struct VoxResults{
std::vector<float> Audio;
TFTensor<float> Alignment;
Expand All @@ -24,6 +27,7 @@ class Voice
EnglishPhoneticProcessor Processor;
VoiceInfo VoxInfo;
TorchMoji Moji;
BERT BertFE;



Expand Down
2 changes: 1 addition & 1 deletion VoxCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ using namespace nlohmann;
#include <codecvt>
#include <locale> // std::wstring_convert

const std::vector<std::string> Text2MelNames = {"FastSpeech2","Tacotron2 (TF)","VITS","VITS + TorchMoji","Tacotron2 (Torch)"};
const std::vector<std::string> Text2MelNames = {"FastSpeech2","Tacotron2 (TF)","VITS","DE-VITS","Tacotron2 (Torch)"};
const std::vector<std::string> VocoderNames = {"Multi-Band MelGAN","MelGAN-STFT","","iSTFTNet"};
const std::vector<std::string> RepoNames = {"TensorflowTTS","Coqui-TTS","jaywalnut310","keonlee9420"};

Expand Down
4 changes: 2 additions & 2 deletions VoxCommon.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ enum Enum{
FastSpeech2 = 0,
Tacotron2,
VITS,
VITSTM,
DEVITS,
Tacotron2Torch
};

Expand Down Expand Up @@ -167,7 +167,7 @@ namespace VoxUtil {
// Copy PyTorch tensor

template<typename D>
TFTensor<D> CopyTensor(at::Tensor& InTens){
TFTensor<D> CopyTensor(const at::Tensor& InTens){
D* Data = InTens.data<D>();
std::vector<int64_t> Shape = InTens.sizes().vec();

Expand Down
67 changes: 67 additions & 0 deletions bert.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#include "bert.h"
#include <windows.h>

// Default constructor: performs no loading. Model and Tokenizer are only
// set up by Initialize().
BERT::BERT() = default;

// Convenience constructor: builds the object and immediately forwards both
// paths to Initialize().
BERT::BERT(const std::string &Path, const std::string &DictPath)
{
    this->Initialize(Path, DictPath);
}

void BERT::Initialize(const std::string &Path, const std::string &DictPath)
{
Model = torch::jit::load(Path);
Tokenizer = std::make_unique<FullTokenizer>(DictPath,true);


}

std::pair<TFTensor<float>, TFTensor<float> > BERT::Infer(const std::string &InText)
{
torch::NoGradGuard no_grad;

auto Tokens = Tokenizer->tokenize(InText + "\n");
auto Ids = Tokenizer->convertTokensToIds(Tokens);

std::vector<int32_t> InTokens(Ids.begin(),Ids.end());



auto InIDS = torch::tensor(InTokens).unsqueeze(0); // (1, tokens)
std::pair<TFTensor<float>,TFTensor<float>> BERTOutputs;

try{
auto Output = Model({InIDS}).toTuple(); // (hidden states, pooled)
BERTOutputs.first = VoxUtil::CopyTensor<float>(Output.get()->elements()[0].toTensor());
BERTOutputs.second = VoxUtil::CopyTensor<float>(Output.get()->elements()[1].toTensor());



}

catch (const std::exception& e) {
int msgboxID = MessageBox(
NULL,
(LPCWSTR)QString::fromStdString(e.what()).toStdWString().c_str(),
(LPCWSTR)L"Error1!!",
MB_ICONWARNING | MB_CANCELTRYCONTINUE | MB_DEFBUTTON2
);


return BERTOutputs;

}






return BERTOutputs;



}
28 changes: 28 additions & 0 deletions bert.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#ifndef BERT_H
#define BERT_H

#include "VoxCommon.hpp"

#include "berttokenizer.h"

// BERT: runs inference on a TorchScript-exported BERT model, using a
// FullTokenizer to turn input text into token ids.
class BERT
{
    // Members are private (class default access).
    torch::jit::script::Module Model;          // TorchScript module used by Infer()
    std::unique_ptr<FullTokenizer> Tokenizer;  // tokenizer used by Infer()

public:
    // Creates an empty object; call Initialize() before Infer().
    BERT();

    // Creates the object and initializes it from the given paths.
    //   Path     : path to the TorchScript model file
    //   DictPath : path to the tokenizer dictionary file
    BERT(const std::string& Path, const std::string& DictPath);

    // Loads the model and the tokenizer dictionary (same arguments as the
    // two-argument constructor).
    void Initialize(const std::string& Path, const std::string& DictPath);

    // Do inference on BERT model.
    // Returns 2 tensors:
    //   first  -> [1, tokens, channels] : Hidden states
    //   second -> [1, channels]         : Pooled embeddings
    std::pair<TFTensor<float>, TFTensor<float>> Infer(const std::string& InText);
};

#endif // BERT_H
Loading