
Commit 140779d

delete HOSTDEVICE
1 parent: 9cc8ac3

File tree

2 files changed (+28, −28 lines):

paddle/fluid/operators/reduce_ops/reduce_functor_op.h
paddle/fluid/operators/reduce_ops/reduce_op.cu.h

paddle/fluid/operators/reduce_ops/reduce_functor_op.h
Lines changed: 7 additions & 7 deletions

@@ -28,7 +28,7 @@ template <typename Tx, typename Ty = Tx>
 struct CustomMin {
   using Transformer = detail::IdentityFunctor<Tx>;
 
-  HOSTDEVICE __forceinline__ Ty initial() {
+  inline Ty initial() {
     return static_cast<Ty>(std::numeric_limits<Ty>::max());
   }
 
@@ -41,7 +41,7 @@ template <typename Tx, typename Ty = Tx>
 struct CustomMax {
   using Transformer = detail::IdentityFunctor<Tx>;
 
-  HOSTDEVICE __forceinline__ Ty initial() {
+  inline Ty initial() {
     return static_cast<Ty>(std::numeric_limits<Ty>::lowest());
   }
 
@@ -55,7 +55,7 @@ template <typename Tx, typename Ty = Tx>
 struct CustomSum {
   using Transformer = detail::IdentityFunctor<Tx, Ty>;
 
-  HOSTDEVICE __forceinline__ Ty initial() { return static_cast<Ty>(0.0f); }
+  inline Ty initial() { return static_cast<Ty>(0.0f); }
 
   __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const {
     return b + a;
@@ -66,7 +66,7 @@ template <typename Tx, typename Ty = Tx>
 struct CustomMean {
   using Transformer = detail::DivideFunctor<Tx>;
 
-  HOSTDEVICE __forceinline__ Ty initial() { return static_cast<Ty>(0.0f); }
+  inline Ty initial() { return static_cast<Ty>(0.0f); }
 
   __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const {
     return b + a;
@@ -77,7 +77,7 @@ template <typename Tx, typename Ty = Tx>
 struct CustomMul {
   using Transformer = detail::IdentityFunctor<Tx>;
 
-  HOSTDEVICE __forceinline__ Ty initial() { return static_cast<Ty>(1.0f); }
+  inline Ty initial() { return static_cast<Ty>(1.0f); }
 
   __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const {
     return b * a;
@@ -88,7 +88,7 @@ template <typename Tx, typename Ty = Tx>
 struct CustomLogicalOr {
   using Transformer = detail::IdentityFunctor<Tx>;
 
-  HOSTDEVICE __forceinline__ Ty initial() { return static_cast<Ty>(false); }
+  inline Ty initial() { return static_cast<Ty>(false); }
 
   __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const {
     return b || a;
@@ -99,7 +99,7 @@ template <typename Tx, typename Ty = Tx>
 struct CustomLogicalAnd {
   using Transformer = detail::IdentityFunctor<Tx>;
 
-  HOSTDEVICE __forceinline__ Ty initial() { return static_cast<Ty>(true); }
+  inline Ty initial() { return static_cast<Ty>(true); }
 
   __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const {
     return b && a;
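
Note on this file: after the change, initial() is a plain host inline function while operator() keeps its __device__ qualifier, so the identity value can no longer be produced inside a kernel; the second file threads it in as an argument instead. One plausible benefit is that a host-only initial() can call std::numeric_limits<Ty>::max()/lowest() as ordinary host constexpr functions, without HOSTDEVICE annotations or --expt-relaxed-constexpr. A minimal standalone sketch of the resulting functor shape (CustomMinSketch is a hypothetical name, not the Paddle source):

// Sketch only: host-only initial(), device-only operator().
#include <cuda_runtime.h>
#include <limits>

template <typename Tx, typename Ty = Tx>
struct CustomMinSketch {
  // Host-only now: evaluated once on the CPU before the kernel launch.
  inline Ty initial() {
    return static_cast<Ty>(std::numeric_limits<Ty>::max());
  }

  // Still device-only: invoked per element inside the reduction kernel.
  __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const {
    return b < a ? b : a;
  }
};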

paddle/fluid/operators/reduce_ops/reduce_op.cu.h
Lines changed: 21 additions & 21 deletions

@@ -419,12 +419,12 @@ template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
           int BlockDim>
 __device__ __forceinline__ void ReduceLastDim(const Tx* x, Ty* y,
                                               ReduceOp reducer,
-                                              TransformOp transformer,
+                                              TransformOp transformer, Ty init,
                                               int reduce_num) {
   __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
   int idx_x = blockIdx.x * reduce_num;
   int idx_y = threadIdx.x;
-  Ty reduce_var = reducer.initial();
+  Ty reduce_var = init;
   for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim) {
     reduce_var =
         reducer(reduce_var, static_cast<Ty>(transformer(x[idx_x + idx_y])));
@@ -448,12 +448,12 @@ template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp>
 __device__ __forceinline__ void ReduceHigherDim(const Tx* x, Ty* y,
                                                 ReduceOp reducer,
                                                 TransformOp transformer,
-                                                int reduce_num, int left_num,
-                                                int block_size) {
+                                                Ty init, int reduce_num,
+                                                int left_num, int block_size) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   int idy = blockIdx.y * block_size;
 
-  Ty reduce_var = reducer.initial();
+  Ty reduce_var = init;
 
   if (idx < left_num) {
     int loop = reduce_num - idy;
@@ -532,7 +532,7 @@ __device__ __forceinline__ void ReduceAny(
 template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
           int BlockDim, int Rank, int ReduceRank, int ReduceType>
 __device__ __forceinline__ void ReduceModule(
-    const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer,
+    const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, Ty init,
     int reduce_num, int left_num, int blocking_size,
     paddle::framework::Array<int, Rank> x_strides,
     paddle::framework::Array<int, ReduceRank> reduce_dim,
@@ -542,12 +542,12 @@ __device__ __forceinline__ void ReduceModule(
   // reduce_rank == 1 && reduce_dim[0] == x_dim.size() - 1
   if (ReduceType == ReduceType::kReduceLastDim) {
     ReduceLastDim<Tx, Ty, ReduceOp, TransformOp, BlockDim>(
-        x, y, reducer, transformer, reduce_num);
+        x, y, reducer, transformer, init, reduce_num);
 
   // reduce_rank == 1 && reduce_dim[0] != x_dim.size() - 1
   } else if (ReduceType == ReduceType::kReduceHigherDim) {
     ReduceHigherDim<Tx, Ty, ReduceOp, TransformOp>(
-        x, y, reducer, transformer, reduce_num, left_num, blocking_size);
+        x, y, reducer, transformer, init, reduce_num, left_num, blocking_size);
 
   // reduce_rank >= 2
   } else {
@@ -560,32 +560,32 @@ __device__ __forceinline__ void ReduceModule(
 template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
           int BlockDim, int Rank, int ReduceRank, int ReduceType>
 __global__ void ReduceKernelFunction(
-    const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer,
+    const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, Ty init,
     int reduce_num, int left_num, int block_size,
     paddle::framework::Array<int, Rank> x_strides,
     paddle::framework::Array<int, ReduceRank> reduce_dim,
     paddle::framework::Array<int, ReduceRank> reduce_strides,
     paddle::framework::Array<int, Rank - ReduceRank> left_dim,
     paddle::framework::Array<int, Rank - ReduceRank> left_strides) {
   ReduceModule<Tx, Ty, ReduceOp, TransformOp, BlockDim, Rank, ReduceRank,
-               ReduceType>(x, y, reducer, transformer, reduce_num, left_num,
-                           block_size, x_strides, reduce_dim, reduce_strides,
-                           left_dim, left_strides);
+               ReduceType>(x, y, reducer, transformer, init, reduce_num,
+                           left_num, block_size, x_strides, reduce_dim,
+                           reduce_strides, left_dim, left_strides);
 }
 
 template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
           typename TransformOp, int kRank, int kReduceRank>
 static void LaunchKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer,
-                         const TransformOp& transformer, gpuStream_t stream,
-                         ReduceConfig<Ty> config) {
+                         const TransformOp& transformer, Ty init,
+                         gpuStream_t stream, ReduceConfig<Ty> config) {
 #define CUB_REDUCE_TYPE_CASE(type)                                           \
   case type: {                                                               \
     constexpr auto kReduceType = type;                                       \
     ReduceKernelFunction<                                                    \
         Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank, kReduceRank,         \
         kReduceType><<<config.grid, config.block, 0, stream>>>(              \
-        x_data, config.output_data, reducer, transformer, config.reduce_num, \
-        config.left_num, config.blocking_size,                               \
+        x_data, config.output_data, reducer, transformer, init,              \
+        config.reduce_num, config.left_num, config.blocking_size,            \
        detail::VectorToArray<int, kRank>(config.x_strides),                  \
        detail::VectorToArray<int, kReduceRank>(config.reduce_dim),           \
        detail::VectorToArray<int, kReduceRank>(config.reduce_strides),       \
@@ -607,7 +607,7 @@ static void LaunchKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer,
         Ty, Ty, ReduceOp, detail::IdentityFunctor<Ty>, 128, kRank, kReduceRank,
         ReduceType::kReduceHigherDim><<<grid, block, 0, stream>>>(
         config.output_data, y_data, reducer,
-        detail::IdentityFunctor<Ty>(config.grid.y), config.grid.y,
+        detail::IdentityFunctor<Ty>(config.grid.y), init, config.grid.y,
         config.left_num, config.grid.y,
         detail::VectorToArray<int, kRank>(config.x_strides),
         detail::VectorToArray<int, kReduceRank>(config.reduce_dim),
@@ -621,7 +621,7 @@ template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
           typename TransformOp>
 static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
                                const ReduceOp& reducer,
-                               const TransformOp& transformer,
+                               const TransformOp& transformer, Ty init,
                                gpuStream_t stream, ReduceConfig<Ty> config) {
   int reduce_rank = config.reduce_strides.size();
   int rank = config.x_strides.size();
@@ -636,7 +636,7 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
   case i: {                                                                    \
     constexpr auto kReduceRank = i;                                            \
     LaunchKernel<Tx, Ty, BlockDim, ReduceOp, TransformOp, kRank, kReduceRank>( \
-        x_data, y_data, reducer, transformer, stream, config);                 \
+        x_data, y_data, reducer, transformer, init, stream, config);           \
   } break
 
   detail::CheckReduceRank(reduce_rank, rank);
@@ -711,8 +711,8 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
   case block_dim: {                                                       \
     constexpr auto kBlockDim = block_dim;                                 \
     LaunchReduceKernel<Tx, Ty, block_dim, ReduceOp<Tx, Ty>, TransformOp>( \
-        x_data, y_data, reducer, TransformOp(config.reduce_num), stream,  \
-        config);                                                          \
+        x_data, y_data, reducer, TransformOp(config.reduce_num),          \
+        reducer.initial(), stream, config);                               \
   } break
 
   switch (detail::GetBlockDim(config.reduce_num)) {
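
Taken together, this file threads the host-computed identity through the whole call chain, with each of TensorReduceFunctorImpl, LaunchReduceKernel, LaunchKernel, ReduceKernelFunction, ReduceModule, ReduceLastDim, and ReduceHigherDim gaining a Ty init parameter; only the first of these calls reducer.initial(). A compilable sketch of the pattern under assumed simplified signatures (only identifiers that also appear in the diff are real; ReduceLastDimSketch and LaunchSketch are hypothetical):

// Sketch only: the host evaluates reducer.initial() once and the kernel
// seeds its accumulator from the `init` argument instead of calling it.
#include <cuda_runtime.h>

template <typename Ty, typename ReduceOp>
__global__ void ReduceLastDimSketch(const Ty* x, Ty* y, ReduceOp reducer,
                                    Ty init, int reduce_num) {
  __shared__ Ty smem[128];
  // Threads that see no elements keep `init`, which is why it must be the
  // identity of the reduction (max() for min, 0 for sum, 1 for mul, ...).
  Ty reduce_var = init;  // was: reducer.initial(), now host-only
  for (int i = threadIdx.x; i < reduce_num; i += blockDim.x) {
    reduce_var = reducer(reduce_var, x[blockIdx.x * reduce_num + i]);
  }
  // Plain shared-memory tree reduction standing in for cub::BlockReduce.
  smem[threadIdx.x] = reduce_var;
  __syncthreads();
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (threadIdx.x < s) {
      smem[threadIdx.x] = reducer(smem[threadIdx.x], smem[threadIdx.x + s]);
    }
    __syncthreads();
  }
  if (threadIdx.x == 0) y[blockIdx.x] = smem[0];
}

template <typename Ty, typename ReduceOp>
void LaunchSketch(const Ty* x, Ty* y, const ReduceOp& reducer, int left_num,
                  int reduce_num, cudaStream_t stream) {
  // Host side: initial() is plain `inline` now, so it is called here and
  // shipped to the kernel by value, as TensorReduceFunctorImpl does above.
  Ty init = reducer.initial();
  ReduceLastDimSketch<<<left_num, 128, 0, stream>>>(x, y, reducer, init,
                                                    reduce_num);
}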
