Skip to content

Commit 8700894

Browse files
committed
update TensorReduceFunc
1 parent 790173a commit 8700894

File tree

4 files changed: +70 −46 lines

paddle/fluid/operators/reduce_ops/reduce_max_op.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ class ReduceMaxKernel : public framework::OpKernel<T> {
4545
auto stream = context.cuda_device_context().stream();
4646
TensorReduceFunc<T, T, CustomMax<T>, detail::IdentityFunctor<T>>(
4747
*input, output, reduce_dims, DataBound<T>::min(), CustomMax<T>(),
48-
detail::IdentityFunctor<T>(), stream);
48+
detail::IdentityFunctor<T>(), detail::IdentityFunctor<T>(), stream);
4949
}
5050
};
5151

paddle/fluid/operators/reduce_ops/reduce_min_op.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ class ReduceMinKernel : public framework::OpKernel<T> {
4545
auto stream = context.cuda_device_context().stream();
4646
TensorReduceFunc<T, T, CustomMin<T>, detail::IdentityFunctor<T>>(
4747
*input, output, reduce_dims, DataBound<T>::max(), CustomMin<T>(),
48-
detail::IdentityFunctor<T>(), stream);
48+
detail::IdentityFunctor<T>(), detail::IdentityFunctor<T>(), stream);
4949
}
5050
};
5151

paddle/fluid/operators/reduce_ops/reduce_op.cuh

Lines changed: 62 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,13 @@ namespace operators {
3838
namespace detail {
3939

4040
// Post processing function for sum, max, min, prod, any
41-
template <typename T>
41+
template <typename Tx, typename Ty = Tx>
4242
struct IdentityFunctor {
4343
HOSTDEVICE explicit inline IdentityFunctor() {}
4444

45-
HOSTDEVICE inline T operator()(const T& x) const { return x; }
45+
HOSTDEVICE inline Ty operator()(const Tx& x) const {
46+
return static_cast<Ty>(x);
47+
}
4648
};
4749

4850
// Post processing function for mean
@@ -81,7 +83,7 @@ static inline std::vector<int> GetDimStrides(const std::vector<int>& dims,
8183
#ifdef __HIPCC__
8284
constexpr int kMaxBlock = 256;
8385
#else
84-
constexpr int kMaxBlock = 512;
86+
constexpr int kMaxBlock = 128;
8587
#endif
8688

8789
// get blockDim for reduceLastDim and reduceAny
@@ -544,8 +546,7 @@ __global__ void ReduceKernelFunction(
544546

545547
template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
546548
typename TransformOp, int kRank, int kReduceRank>
547-
static void LaunchKernel(const Tx* x_data, Ty* y_data,
548-
const platform::Place& place, const ReduceOp& reducer,
549+
static void LaunchKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer,
549550
const TransformOp& transformer, const Ty& init,
550551
gpuStream_t stream, ReduceConfig<Ty> config) {
551552
#define CUB_REDUCE_TYPE_CASE(type) \
@@ -589,7 +590,6 @@ static void LaunchKernel(const Tx* x_data, Ty* y_data,
589590
template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
590591
typename TransformOp>
591592
static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
592-
const platform::Place& place,
593593
const ReduceOp& reducer,
594594
const TransformOp& transformer, const Ty& init,
595595
gpuStream_t stream, ReduceConfig<Ty> config) {
@@ -606,26 +606,9 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
606606
case i: { \
607607
constexpr auto kReduceRank = i; \
608608
LaunchKernel<Tx, Ty, BlockDim, ReduceOp, TransformOp, kRank, kReduceRank>( \
609-
x_data, y_data, place, reducer, transformer, init, stream, config); \
609+
x_data, y_data, reducer, transformer, init, stream, config); \
610610
} break
611611

612-
// launch CUB::Reduce
613-
if (config.reduce_type == static_cast<int>(ReduceType::kReduceAll)) {
614-
cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
615-
x_data, transformer);
616-
size_t temp_storage_bytes = 0;
617-
cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
618-
config.reduce_num, reducer, init, stream);
619-
framework::Tensor tmp;
620-
auto* temp_storage = tmp.mutable_data<uint8_t>(
621-
framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
622-
place);
623-
cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data,
624-
config.reduce_num, reducer, init, stream);
625-
626-
return;
627-
}
628-
629612
detail::CheckReduceRank(reduce_rank, rank);
630613
switch (rank) {
631614
CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1););
@@ -649,10 +632,12 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
649632
#undef CUB_RANK_CASE
650633
}
651634

652-
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp>
635+
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
636+
typename CubTransformOp = TransformOp>
653637
void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y,
654638
std::vector<int> origin_reduce_dims, const Ty& init,
655639
const ReduceOp& reducer, const TransformOp& transformer,
640+
const CubTransformOp& cub_transformer,
656641
gpuStream_t stream) {
657642
auto x_dim = framework::vectorize<int>(x.dims());
658643
auto config = ReduceConfig<Ty>(origin_reduce_dims, x_dim);
@@ -673,13 +658,28 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y,
673658
y->Resize(out_dims);
674659
return;
675660
}
661+
// launch CUB::Reduce
662+
if (config.reduce_type == static_cast<int>(ReduceType::kReduceAll)) {
663+
cub::TransformInputIterator<Ty, CubTransformOp, const Tx*> trans_x(
664+
x_data, cub_transformer);
665+
size_t temp_storage_bytes = 0;
666+
cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
667+
config.reduce_num, reducer, init, stream);
668+
framework::Tensor tmp;
669+
auto* temp_storage = tmp.mutable_data<uint8_t>(
670+
framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
671+
x.place());
672+
cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data,
673+
config.reduce_num, reducer, init, stream);
674+
675+
return;
676+
}
676677

677-
#define CUB_BLOCK_DIM_CASE(block_dim) \
678-
case block_dim: { \
679-
constexpr auto kBlockDim = block_dim; \
680-
LaunchReduceKernel<Tx, Ty, block_dim, ReduceOp, TransformOp>( \
681-
x_data, y_data, x.place(), reducer, transformer, init, stream, \
682-
config); \
678+
#define CUB_BLOCK_DIM_CASE(block_dim) \
679+
case block_dim: { \
680+
constexpr auto kBlockDim = block_dim; \
681+
LaunchReduceKernel<Tx, Ty, block_dim, ReduceOp, TransformOp>( \
682+
x_data, y_data, reducer, transformer, init, stream, config); \
683683
} break
684684

685685
switch (detail::GetBlockDim(config.reduce_num)) {
@@ -696,5 +696,36 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y,
696696
#undef CUB_BLOCK_DIM_CASE
697697
}
698698

699+
template <typename Tx, typename ReduceOp,
700+
template <typename, typename> class TransformOp>
701+
struct TensorReduceFunctorImpl {
702+
const framework::Tensor& x;
703+
framework::Tensor* y;
704+
std::vector<int> origin_reduce_dims;
705+
const double& init;
706+
const ReduceOp& reducer;
707+
gpuStream_t stream;
708+
TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
709+
std::vector<int> origin_reduce_dims,
710+
const double& init, const ReduceOp& reducer,
711+
gpuStream_t stream)
712+
: x(x),
713+
y(y),
714+
origin_reduce_dims(origin_reduce_dims),
715+
init(init),
716+
reducer(reducer),
717+
stream(stream) {}
718+
719+
template <typename Ty>
720+
721+
void apply() const {
722+
const Ty& init_cast = static_cast<Ty>(init);
723+
TensorReduceFunc<Tx, Ty, ReduceOp, TransformOp<Ty, Ty>,
724+
TransformOp<Tx, Ty>>(x, y, origin_reduce_dims, init_cast,
725+
reducer, TransformOp<Ty, Ty>(),
726+
TransformOp<Tx, Ty>(), stream);
727+
}
728+
};
729+
699730
} // namespace operators
700731
} // namespace paddle

paddle/fluid/operators/reduce_ops/reduce_prod_op.cu

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -51,22 +51,15 @@ class ReduceProdKernel : public framework::OpKernel<T> {
5151

5252
auto stream = context.cuda_device_context().stream();
5353
if (out_dtype >= 0) {
54-
#define VisitDataTypeSmall_t(cpp_type, proto_type) \
55-
do { \
56-
if (static_cast<framework::proto::VarType::Type>(out_dtype) == \
57-
proto_type) { \
58-
TensorReduceFunc<T, cpp_type, CustomMul<cpp_type>, \
59-
detail::IdentityFunctor<cpp_type>>( \
60-
*input, output, reduce_dims, static_cast<cpp_type>(1.0f), \
61-
CustomMul<cpp_type>(), detail::IdentityFunctor<cpp_type>(), stream); \
62-
} \
63-
} while (0)
64-
_ForEachDataTypeSmall_(VisitDataTypeSmall_t);
65-
#undef VisitDataTypeSmall_t
54+
framework::VisitDataTypeSmall(
55+
static_cast<framework::proto::VarType::Type>(out_dtype),
56+
TensorReduceFunctorImpl<T, cub::Sum, detail::IdentityFunctor>(
57+
*input, output, reduce_dims, static_cast<double>(1.0f),
58+
cub::Sum(), stream));
6659
} else {
6760
TensorReduceFunc<T, T, CustomMul<T>, detail::IdentityFunctor<T>>(
6861
*input, output, reduce_dims, static_cast<T>(1.0f), CustomMul<T>(),
69-
detail::IdentityFunctor<T>(), stream);
62+
detail::IdentityFunctor<T>(), detail::IdentityFunctor<T>(), stream);
7063
}
7164
}
7265
};

0 commit comments

Comments (0)