-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathroi_feature_boost_op.cu
97 lines (72 loc) · 2.7 KB
/
roi_feature_boost_op.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#include <functional>
#include "caffe2/core/context_gpu.h"
#include "roi_feature_boost_op.h"
namespace caffe2 {
namespace {
template <typename T>
__global__ void kernel_forward(const int nthreads, const T* Xdata,
const T* Sdata, const int batch_size,
const int feature_size, T* Ydata) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int f = index % feature_size;
int b = index / feature_size;
int index_S = b;
int index_XY = b * feature_size + f;
Ydata[index_XY] = Xdata[index_XY] * Sdata[index_S];
}
}
template <typename T>
__global__ void kernel_backward(const int nthreads, const T* dYdata,
const T* Sdata, const int batch_size,
const int feature_size, T* dXdata) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int f = index % feature_size;
int b = index / feature_size;
int index_S = b;
int index_dXY = b * feature_size + f;
dXdata[index_dXY] = dYdata[index_dXY] * Sdata[index_S];
}
}
} // namespace
template <>
bool RoIFeatureBoostOp<float, CUDAContext>::RunOnDevice() {
const auto& X = Input(0);
const auto& S = Input(1);
CAFFE_ENFORCE_EQ(S.dim32(0), S.numel());
CAFFE_ENFORCE_EQ(X.dim32(0), S.dim32(0));
const int batch_size = X.dim32(0);
const int feature_size = X.size_from_dim(1);
const float* Xdata = X.data<float>();
const float* Sdata = S.data<float>();
auto* Y = Output(0);
Y->ResizeLike(X);
float* Ydata = Y->mutable_data<float>();
const int nthreads = X.numel();
kernel_forward<float><<<CAFFE_GET_BLOCKS(nthreads), CAFFE_CUDA_NUM_THREADS, 0,
context_.cuda_stream()>>>(
nthreads, Xdata, Sdata, batch_size, feature_size, Ydata);
return true;
}
template <>
bool RoIFeatureBoostGradientOp<float, CUDAContext>::RunOnDevice() {
const auto& dY = Input(0);
const auto& S = Input(1);
CAFFE_ENFORCE_EQ(S.dim32(0), S.numel());
CAFFE_ENFORCE_EQ(dY.dim32(0), S.dim32(0));
const int batch_size = dY.dim32(0);
const int feature_size = dY.size_from_dim(1);
const float* dYdata = dY.data<float>();
const float* Sdata = S.data<float>();
auto* dX = Output(0);
dX->ResizeLike(dY);
float* dXdata = dX->mutable_data<float>();
const int nthreads = dY.numel();
kernel_backward<float><<<CAFFE_GET_BLOCKS(nthreads), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
nthreads, dYdata, Sdata, batch_size, feature_size, dXdata);
return true;
}
REGISTER_CUDA_OPERATOR(RoIFeatureBoost, RoIFeatureBoostOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(RoIFeatureBoostGradient,
RoIFeatureBoostGradientOp<float, CUDAContext>);
} // namespace caffe2