-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathcpg_sw_op.cu
126 lines (101 loc) · 3.27 KB
/
cpg_sw_op.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#include <functional>
#include "caffe2/core/context_gpu.h"
#include "cpg_sw_op.h"
namespace caffe2 {
namespace {
template <typename T>
__global__ void SigmoidCUDAKernel(const int N, const T* X, T* Y);
template <>
__global__ void SigmoidCUDAKernel<float>(const int N, const float* X,
float* Y) {
CUDA_1D_KERNEL_LOOP(i, N) {
#if __CUDA_ARCH__ >= 350
Y[i] = 1.0f / (1.0f + expf(-__ldg(X + i)));
#else
Y[i] = 1.0f / (1.0f + expf(-X[i]));
#endif
}
}
} // namespace
template <>
bool CPGSWOp<float, CUDAContext>::RunOnDevice() {
const auto& C = Input(0);
const auto& M = Input(1);
const auto& LO = Input(2);
const auto& LA = Input(3);
const auto& P = Input(4);
CAFFE_ENFORCE_EQ(C.dim(), 4);
CAFFE_ENFORCE_EQ(M.dim(), 4);
CAFFE_ENFORCE_EQ(LO.dim(), 2);
CAFFE_ENFORCE_EQ(LA.dim(), 2);
CAFFE_ENFORCE_EQ(P.dim(), 2);
CAFFE_ENFORCE_EQ(C.dim32(0), M.dim32(0));
CAFFE_ENFORCE_EQ(C.dim32(0), LO.dim32(0));
CAFFE_ENFORCE_EQ(C.dim32(0), LA.dim32(0));
CAFFE_ENFORCE_EQ(C.dim32(0), P.dim32(0));
CAFFE_ENFORCE_EQ(C.dim32(1), M.dim32(1));
CAFFE_ENFORCE_EQ(C.dim32(1), LO.dim32(1));
CAFFE_ENFORCE_EQ(C.dim32(1), LA.dim32(1));
CAFFE_ENFORCE_EQ(C.dim32(1), P.dim32(1));
CAFFE_ENFORCE_EQ(C.dim32(2), M.dim32(2));
CAFFE_ENFORCE_EQ(C.dim32(3), M.dim32(3));
const int batch_size = C.dim32(0);
const int num_classes = C.dim32(1);
const int height = C.dim32(2);
const int width = C.dim32(3);
auto* CO = Output(0);
CO->CopyFrom(C, false);
context_.FinishDeviceComputation();
if (cur_iter_ >= max_iter_) {
return true;
}
Tensor LOcpu = Tensor(LO, caffe2::CPU);
context_.FinishDeviceComputation();
const float* LOcpudata = LOcpu.data<float>();
Tensor LAcpu = Tensor(LA, caffe2::CPU);
context_.FinishDeviceComputation();
const float* LAcpudata = LAcpu.data<float>();
Tensor Pcpu = Tensor(P, caffe2::CPU);
context_.FinishDeviceComputation();
const float* Pcpudata = Pcpu.data<float>();
for (int b = 0; b < batch_size; b++) {
for (int c = 0; c < num_classes; c++) {
int label_idx = b * num_classes + c;
float loss_value = LOcpudata[label_idx];
float label_value = LAcpudata[label_idx];
float pred_value = Pcpudata[label_idx];
if (label_value < 0.5) {
continue;
}
if (pred_value < tau_) {
continue;
}
acm_loss_ += loss_value;
acm_cnt_ += 1;
if (loss_value > min_loss_) {
continue;
}
acm_sw_loss_ += loss_value;
acm_sw_cnt_ += 1;
SigmoidCUDAKernel<float>
<<<CAFFE_GET_BLOCKS(height * width), CAFFE_CUDA_NUM_THREADS, 0,
context_.cuda_stream()>>>(
height * width, M.data<float>() + label_idx * height * width,
CO->mutable_data<float>() + label_idx * height * width);
}
}
cur_iter_++;
if (cur_iter_ % 1280 == 0) {
printf("CPG_SW acm_loss: %f acm_cnt: %d mean loss: %f\n", acm_loss_,
acm_cnt_, acm_loss_ / acm_cnt_);
printf("CPG_SW acm_sw_loss: %f acm_sw_cnt: %d mean loss: %f\n",
acm_sw_loss_, acm_sw_cnt_, acm_sw_loss_ / acm_sw_cnt_);
acm_loss_ = 0;
acm_cnt_ = 0;
acm_sw_loss_ = 0;
acm_sw_cnt_ = 0;
}
return true;
}
REGISTER_CUDA_OPERATOR(CPGSW, CPGSWOp<float, CUDAContext>);
} // namespace caffe2