add unet code (wang-xinyu#380)

YuzhouPeng · web-flow · commit 6161f288542a · 2021-01-28T12:59:02.000+08:00
* add code

* remove txt
diff --git a/unet/CMakeLists.txt b/unet/CMakeLists.txt
@@ -0,0 +1,34 @@
+# Build script for the `unet` TensorRT demo executable.
+cmake_minimum_required(VERSION 2.6)
+
+project(unet)
+
+add_definitions(-std=c++11)
+
+# option() takes (<variable> "<help text>" [initial value]); the help text is mandatory,
+# otherwise the intended value is consumed as the description.
+option(CUDA_USE_STATIC_CUDA_RUNTIME "Link against the static CUDA runtime" OFF)
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_BUILD_TYPE Debug)
+
+# nvcc flags. NOTE(review): this script compiles no CUDA sources itself, so these
+# only take effect if CUDA targets are added later -- confirm they are still wanted.
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-std=c++11;-g;-G;-gencode;arch=compute_30;code=sm_30)
+
+# cuda directory
+include_directories(${PROJECT_SOURCE_DIR}/include)
+include_directories(/usr/local/cuda-10.2/targets/x86_64-linux/include)
+link_directories(/usr/local/cuda-10.2/targets/x86_64-linux/lib)
+
+# tensorrt
+# NOTE(review): machine-specific install location; adjust to the local TensorRT path.
+include_directories(/home/sycv/workplace/pengyuzhou/TensorRT-7.0.0.11/targets/x86_64-linux-gnu/include)
+link_directories(/home/sycv/workplace/pengyuzhou/TensorRT-7.0.0.11/targets/x86_64-linux-gnu/lib)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
+
+# link library and add exec file
+add_executable(unet ${PROJECT_SOURCE_DIR}/unet.cpp)
+target_link_libraries(unet nvinfer)
+target_link_libraries(unet cudart)
+
+add_definitions(-O2 -pthread)
+
+# opencv library
+find_package(OpenCV)
+# Expand the variable set by find_package; a bare name would be treated as a directory.
+include_directories(${OpenCV_INCLUDE_DIRS})
+target_link_libraries(unet ${OpenCV_LIBS})
diff --git a/unet/README.md b/unet/README.md
@@ -0,0 +1,64 @@
+# tensorrt-unet
+This is a TensorRT implementation of UNet, inspired by [tensorrtx](https://github.com/wang-xinyu/tensorrtx) and [pytorch-unet](https://github.com/milesial/Pytorch-UNet).<br>
+You can use this project to generate a TensorRT engine file, and customize parameters and the network structure to match the network you trained (FP32/FP16 precision, input size, different conv layers, activation functions, ...).<br>
+
+# requirements
+
+TensorRT 7.0 (you need to install tensorrt first)<br>
+Cuda 10.2<br>
+Python3.7<br>
+opencv 4.4<br>
+cmake 3.18<br>
+# train .pth file and convert .wts
+
+## create env
+
+```
+pip install -r requirements.txt
+```
+
+## train .pth file
+
+Train on your dataset by following [pytorch-unet](https://github.com/milesial/Pytorch-UNet) to generate a .pth file.<br>
+
+## convert .wts
+
+Run gen_wts from the utils folder to convert the .pth file into a .wts file, then move the .wts file to the project folder.<br>
+
+# generate engine file and infer
+
+## create build folder in project folder
+```
+mkdir build
+```
+
+## make file, generate exec file
+```
+cd build
+cmake ..
+make
+```
+
+## generate TensorRT engine file and infer image
+```
+unet -s
+```
+Then a unet engine file will be generated; you can use unet -d to run inference on the files in a folder.<br>
+```
+unet -d ../samples
+```
+
+# efficiency
+The TensorRT engine is much faster than the PyTorch model:
+
+ pytorch | TensorRT FP32 | TensorRT FP16
+ ---- | ----- | ------  
+ 816x672  | 816x672 | 816x672 
+ 58ms  | 43ms (batchsize 8) | 14ms (batchsize 8) 
+
+
+# Further development
+
+1. add INT8 calibrator<br>
+2. add custom plugin<br>
+etc
diff --git a/unet/common.hpp b/unet/common.hpp
@@ -0,0 +1,148 @@
+#ifndef UNET_COMMON_H_
+#define UNET_COMMON_H_
+
+#include <fstream>
+#include <map>
+#include <sstream>
+#include <vector>
+#include <opencv2/opencv.hpp>
+#include <dirent.h>
+#include "NvInfer.h"
+
+
+// Evaluates `status` exactly once. If the result is non-zero, prints
+// "Cuda failure: <value>" to std::cerr and aborts the process.
+// The temporary uses an unusual name so that a caller expression mentioning a
+// variable called `ret` (e.g. CHECK(ret)) is not shadowed inside the macro.
+#define CHECK(status) \
+    do\
+    {\
+        const auto check_status_ = (status);\
+        if (check_status_ != 0)\
+        {\
+            std::cerr << "Cuda failure: " << check_status_ << std::endl;\
+            abort();\
+        }\
+    } while (0)
+
+using namespace nvinfer1;
+
+
+
+
+
+// Loads a TensorRT ".wts" weight file into a name -> Weights map.
+//
+// The file is whitespace-delimited text:
+//   <number of blobs>
+//   <name> <element count> <element count x 32-bit words in hex>   (once per blob)
+//
+// Every blob is stored as DataType::kFLOAT. Each value buffer is allocated with
+// malloc and is not freed here; the returned map holds the pointers.
+// Malformed input is reported through assert, like the rest of this function.
+std::map<std::string, Weights> loadWeights(const std::string file) {
+    std::cout << "Loading weights: " << file << std::endl;
+    std::map<std::string, Weights> weightMap;
+
+    // Open weights file
+    std::ifstream input(file);
+    assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");
+
+    // Read number of weight blobs
+    int32_t count = 0;
+    input >> count;
+    assert(count > 0 && "Invalid weight map file.");
+
+    // `> 0` keeps a negative count from looping when asserts are compiled out.
+    while (count-- > 0)
+    {
+        Weights wt{DataType::kFLOAT, nullptr, 0};
+        uint32_t size = 0;
+
+        // Read name and element count of blob
+        std::string name;
+        input >> name >> std::dec >> size;
+
+        // Load blob: one 32-bit word per element, so size the buffer by the
+        // element type (not by the pointer type).
+        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
+        assert((val != nullptr || size == 0) && "Out of memory while loading weights.");
+        for (uint32_t x = 0; x < size; ++x)
+        {
+            input >> std::hex >> val[x];
+        }
+        // A failed extraction means the file ended early or held a non-hex token.
+        assert(!input.fail() && "Truncated or malformed weight file.");
+
+        wt.values = val;
+        wt.count = size;
+        weightMap[name] = wt;
+    }
+
+    return weightMap;
+}
+
+// Adds a per-channel scale layer to `network` that applies batch-norm statistics to `input`.
+//
+// Reads "<lname>.weight", "<lname>.bias", "<lname>.running_mean" and
+// "<lname>.running_var" from weightMap and computes, for each of the
+// running_var.count entries i:
+//   scale[i] = weight[i] / sqrt(var[i] + eps)
+//   shift[i] = bias[i] - mean[i] * weight[i] / sqrt(var[i] + eps)
+//   power[i] = 1
+// The three malloc'ed arrays are stored back into weightMap under
+// "<lname>.scale", "<lname>.shift" and "<lname>.power" and handed to
+// addScale with ScaleMode::kCHANNEL. Returns the created layer.
+IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
+    const float* const bnWeight = static_cast<const float*>(weightMap[lname + ".weight"].values);
+    const float* const bnBias = static_cast<const float*>(weightMap[lname + ".bias"].values);
+    const float* const runMean = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
+    const float* const runVar = static_cast<const float*>(weightMap[lname + ".running_var"].values);
+    const int channels = weightMap[lname + ".running_var"].count;
+
+    float* const scaleVals = reinterpret_cast<float*>(malloc(sizeof(float) * channels));
+    float* const shiftVals = reinterpret_cast<float*>(malloc(sizeof(float) * channels));
+    float* const powerVals = reinterpret_cast<float*>(malloc(sizeof(float) * channels));
+
+    // The three outputs are independent of each other, so one pass fills them all.
+    for (int c = 0; c < channels; ++c) {
+        scaleVals[c] = bnWeight[c] / sqrt(runVar[c] + eps);
+        shiftVals[c] = bnBias[c] - runMean[c] * bnWeight[c] / sqrt(runVar[c] + eps);
+        powerVals[c] = 1.0;
+    }
+
+    Weights scale{DataType::kFLOAT, scaleVals, channels};
+    Weights shift{DataType::kFLOAT, shiftVals, channels};
+    Weights power{DataType::kFLOAT, powerVals, channels};
+
+    // Keep the buffers reachable through the map alongside the loaded weights.
+    weightMap[lname + ".scale"] = scale;
+    weightMap[lname + ".shift"] = shift;
+    weightMap[lname + ".power"] = power;
+
+    IScaleLayer* bnLayer = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
+    assert(bnLayer);
+    return bnLayer;
+}
+
+
+// Builds convolution -> batch norm -> hard-swish on `input` and returns the
+// final element-wise layer.
+//
+// The convolution uses "<lname>.conv.weight" with no bias, a ksize x ksize
+// kernel, stride s, padding ksize / 2 and g groups. Batch norm reads the
+// "<lname>.bn.*" weights with eps = 1e-3.
+ILayer* convBlock(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) {
+    const Weights noBias{DataType::kFLOAT, nullptr, 0};
+    const int pad = ksize / 2;
+
+    IConvolutionLayer* conv = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[lname + ".conv.weight"], noBias);
+    assert(conv);
+    conv->setStrideNd(DimsHW{s, s});
+    conv->setPaddingNd(DimsHW{pad, pad});
+    conv->setNbGroups(g);
+
+    IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3);
+    ITensor* bnOut = bn->getOutput(0);
+
+    // hard_swish(x) = x * hard_sigmoid(x), with hard_sigmoid using alpha = 1/6, beta = 0.5
+    auto hardSigmoid = network->addActivation(*bnOut, ActivationType::kHARD_SIGMOID);
+    assert(hardSigmoid);
+    hardSigmoid->setAlpha(1.0 / 6.0);
+    hardSigmoid->setBeta(0.5);
+
+    auto product = network->addElementWise(*bnOut, *hardSigmoid->getOutput(0), ElementWiseOperation::kPROD);
+    assert(product);
+    return product;
+}
+
+
+
+// Appends the name of every entry in directory `p_dir_name` to `file_names`,
+// skipping "." and "..". Only the bare entry name is stored, not the
+// directory-prefixed path.
+// Returns 0 on success, or -1 if the directory cannot be opened.
+int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
+    DIR *dir = opendir(p_dir_name);
+    if (dir == nullptr) {
+        return -1;
+    }
+
+    for (struct dirent* entry = readdir(dir); entry != nullptr; entry = readdir(dir)) {
+        const bool is_self = strcmp(entry->d_name, ".") == 0;
+        const bool is_parent = strcmp(entry->d_name, "..") == 0;
+        if (is_self || is_parent) {
+            continue;
+        }
+        file_names.emplace_back(entry->d_name);
+    }
+
+    closedir(dir);
+    return 0;
+}
+
+#endif
diff --git a/unet/gen_wts.py b/unet/gen_wts.py
@@ -0,0 +1,36 @@
+import torch
+from torch import nn
+import torchvision
+import os
+import struct
+from torchsummary import summary
+
+def main():
+    print('cuda device count: ', torch.cuda.device_count())
+    net = torch.load('ori_unet.pth')
+    net = net.to('cuda:0')
+    net = net.eval()
+    print('model: ', net)
+    #print('state dict: ', net.state_dict().keys())
+    tmp = torch.ones(1, 3, 224, 224).to('cuda:0')
+    print('input: ', tmp)
+    out = net(tmp)
+
+    print('output:', out)
+
+    summary(net, (3, 224, 224))
+    #return
+    f = open("unet.wts", 'w')
+    f.write("{}\n".format(len(net.state_dict().keys())))
+    for k,v in net.state_dict().items():
+        print('key: ', k)
+        print('value: ', v.shape)
+        vr = v.reshape(-1).cpu().numpy()
+        f.write("{} {}".format(k, len(vr)))
+        for vv in vr:
+            f.write(" ")
+            f.write(struct.pack(">f", float(vv)).hex())
+        f.write("\n")
+
+if __name__ == '__main__':
+    main()
diff --git a/unet/logging.h b/unet/logging.h
diff --git a/unet/unet.cpp b/unet/unet.cpp