Simplify implementation of map_impl

stijnh · stijnh · commit 06c10e3079c5 · 2024-07-12T11:49:58.000+02:00
diff --git a/include/kernel_float/apply.h b/include/kernel_float/apply.h
@@ -118,43 +118,63 @@ broadcast_like(const V& input, const R& other) {
 
 namespace detail {
 
-template<size_t N>
-struct apply_recur_impl;
-
 template<typename F, size_t N, typename Output, typename... Args>
 struct apply_impl {
-    KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) {
-        apply_recur_impl<N>::call(fun, result, inputs...);
+    KERNEL_FLOAT_INLINE static void call(F fun, Output* output, const Args*... args) {
+#pragma unroll
+        for (size_t i = 0; i < N; i++) {
+            output[i] = fun(args[i]...);
+        }
     }
 };
 
-template<size_t N>
-struct apply_recur_impl {
-    static constexpr size_t K = round_up_to_power_of_two(N) / 2;
+template<typename F, size_t N, typename Output, typename... Args>
+struct apply_fastmath_impl: apply_impl<F, N, Output, Args...> {};
+
+template<typename F, size_t N, typename Output, typename... Args>
+struct map_impl {
+    static constexpr size_t packet_size = preferred_vector_size<Output>::value;
+
+    KERNEL_FLOAT_INLINE static void call(F fun, Output* output, const Args*... args) {
+        if constexpr (N / packet_size > 0) {
+#pragma unroll
+            for (size_t i = 0; i < N - N % packet_size; i += packet_size) {
+                apply_impl<F, packet_size, Output, Args...>::call(fun, output + i, (args + i)...);
+            }
+        }
 
-    template<typename F, typename Output, typename... Args>
-    KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) {
-        apply_impl<F, K, Output, Args...>::call(fun, result, inputs...);
-        apply_impl<F, N - K, Output, Args...>::call(fun, result + K, (inputs + K)...);
+        if constexpr (N % packet_size > 0) {
+#pragma unroll
+            for (size_t i = N - N % packet_size; i < N; i++) {
+                apply_impl<F, 1, Output, Args...>::call(fun, output + i, (args + i)...);
+            }
+        }
     }
 };
 
-template<>
-struct apply_recur_impl<0> {
-    template<typename F, typename Output, typename... Args>
-    KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) {}
-};
+template<typename F, size_t N, typename Output, typename... Args>
+struct fast_map_impl {
+    static constexpr size_t packet_size = preferred_vector_size<Output>::value;
+
+    KERNEL_FLOAT_INLINE static void call(F fun, Output* output, const Args*... args) {
+        if constexpr (N / packet_size > 0) {
+#pragma unroll
+            for (size_t i = 0; i < N - N % packet_size; i += packet_size) {
+                apply_fastmath_impl<F, packet_size, Output, Args...>::call(
+                    fun,
+                    output + i,
+                    (args + i)...);
+            }
+        }
 
-template<>
-struct apply_recur_impl<1> {
-    template<typename F, typename Output, typename... Args>
-    KERNEL_FLOAT_INLINE static void call(F fun, Output* result, const Args*... inputs) {
-        result[0] = fun(inputs[0]...);
+        if constexpr (N % packet_size > 0) {
+#pragma unroll
+            for (size_t i = N - N % packet_size; i < N; i++) {
+                apply_fastmath_impl<F, 1, Output, Args...>::call(fun, output + i, (args + i)...);
+            }
+        }
     }
 };
-
-template<typename F, size_t N, typename Output, typename... Args>
-struct apply_fastmath_impl: apply_impl<F, N, Output, Args...> {};
 }  // namespace detail
 
 template<typename F, typename... Args>
@@ -180,12 +200,12 @@ KERNEL_FLOAT_INLINE map_type<F, Args...> map(F fun, const Args&... args) {
     // Use the `apply_fastmath_impl` if KERNEL_FLOAT_FAST_MATH is enabled
 #if KERNEL_FLOAT_FAST_MATH
-    using apply_impl =
-        detail::apply_fastmath_impl<F, extent_size<E>, Output, vector_value_type<Args>...>;
+    using map_impl =
+        detail::fast_map_impl<F, extent_size<E>, Output, vector_value_type<Args>...>;
 #else
-    using apply_impl = detail::apply_impl<F, extent_size<E>, Output, vector_value_type<Args>...>;
+    using map_impl = detail::map_impl<F, extent_size<E>, Output, vector_value_type<Args>...>;
 #endif
 
-    apply_impl::call(
+    map_impl::call(
         fun,
         result.data(),
         (detail::broadcast_impl<vector_value_type<Args>, vector_extent_type<Args>, E>::call(
@@ -205,7 +225,7 @@ KERNEL_FLOAT_INLINE map_type<F, Args...> fast_map(F fun, const Args&... args) {
     using E = broadcast_vector_extent_type<Args...>;
     vector_storage<Output, extent_size<E>> result;
 
-    detail::apply_fastmath_impl<F, extent_size<E>, Output, vector_value_type<Args>...>::call(
+    detail::fast_map_impl<F, extent_size<E>, Output, vector_value_type<Args>...>::call(
         fun,
         result.data(),
         (detail::broadcast_impl<vector_value_type<Args>, vector_extent_type<Args>, E>::call(
diff --git a/include/kernel_float/base.h b/include/kernel_float/base.h
@@ -231,6 +231,11 @@ struct into_vector_impl<vector<T, E, S>> {
     }
 };
 
+template<typename T>
+struct preferred_vector_size {
+    static constexpr size_t value = 1;
+};
+
 template<typename V>
 struct vector_traits;
 
diff --git a/include/kernel_float/bf16.h b/include/kernel_float/bf16.h
@@ -11,6 +11,12 @@
 #include "vector.h"
 
 namespace kernel_float {
+
+template<>
+struct preferred_vector_size<__nv_bfloat16> {
+    static constexpr size_t value = 2;
+};
+
 KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(__nv_bfloat16)
 KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __nv_bfloat16)
 KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __nv_bfloat16)
diff --git a/include/kernel_float/binops.h b/include/kernel_float/binops.h
@@ -54,12 +54,12 @@ KERNEL_FLOAT_INLINE zip_common_type<F, L, R> zip_common(F fun, const L& left, co
 
 // Use the `apply_fastmath_impl` if KERNEL_FLOAT_FAST_MATH is enabled
 #if KERNEL_FLOAT_FAST_MATH
-    using apply_impl = detail::apply_fastmath_impl<F, extent_size<E>, O, T, T>;
+    using map_impl = detail::fast_map_impl<F, extent_size<E>, O, T, T>;
 #else
-    using apply_impl = detail::apply_impl<F, extent_size<E>, O, T, T>;
+    using map_impl = detail::map_impl<F, extent_size<E>, O, T, T>;
 #endif
 
-    apply_impl::call(
+    map_impl::call(
         fun,
         result.data(),
         detail::convert_impl<vector_value_type<L>, vector_extent_type<L>, T, E>::call(
@@ -310,14 +310,11 @@ struct apply_fastmath_impl<ops::divide<T>, N, T, T, T> {
 };
 
 #if KERNEL_FLOAT_IS_DEVICE
-template<size_t N>
-struct apply_fastmath_impl<ops::divide<float>, N, float, float, float> {
+template<>
+struct apply_fastmath_impl<ops::divide<float>, 1, float, float, float> {
     KERNEL_FLOAT_INLINE static void
     call(ops::divide<float> fun, float* result, const float* lhs, const float* rhs) {
-#pragma unroll
-        for (size_t i = 0; i < N; i++) {
-            result[i] = __fdividef(lhs[i], rhs[i]);
-        }
+        *result = __fdividef(*lhs, *rhs);
     }
 };
 #endif
@@ -329,7 +326,7 @@ fast_divide(const L& left, const R& right) {
     using E = broadcast_vector_extent_type<L, R>;
     vector_storage<T, extent_size<E>> result;
 
-    detail::apply_fastmath_impl<ops::divide<T>, extent_size<E>, T, T, T>::call(
+    detail::fast_map_impl<ops::divide<T>, extent_size<E>, T, T, T>::call(
         ops::divide<T> {},
         result.data(),
         detail::convert_impl<vector_value_type<L>, vector_extent_type<L>, T, E>::call(
diff --git a/include/kernel_float/conversion.h b/include/kernel_float/conversion.h
@@ -17,7 +17,7 @@ struct convert_impl {
     static vector_storage<T2, extent_size<E2>> call(vector_storage<T, extent_size<E>> input) {
         using F = ops::cast<T, T2, M>;
         vector_storage<T2, extent_size<E>> intermediate;
-        detail::apply_impl<F, extent_size<E>, T2, T>::call(F {}, intermediate.data(), input.data());
+        detail::map_impl<F, extent_size<E>, T2, T>::call(F {}, intermediate.data(), input.data());
         return detail::broadcast_impl<T2, E, E2>::call(intermediate);
     }
 };
@@ -48,7 +48,7 @@ struct convert_impl<T, E, T2, E, M> {
         using F = ops::cast<T, T2, M>;
 
         vector_storage<T2, extent_size<E>> result;
-        detail::apply_impl<F, extent_size<E>, T2, T>::call(F {}, result.data(), input.data());
+        detail::map_impl<F, extent_size<E>, T2, T>::call(F {}, result.data(), input.data());
         return result;
     }
 };
diff --git a/include/kernel_float/fp16.h b/include/kernel_float/fp16.h
@@ -9,6 +9,12 @@
 #include "vector.h"
 
 namespace kernel_float {
+
+template<>
+struct preferred_vector_size<__half> {
+    static constexpr size_t value = 2;
+};
+
 KERNEL_FLOAT_DEFINE_PROMOTED_FLOAT(__half)
 KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(float, __half)
 KERNEL_FLOAT_DEFINE_PROMOTED_TYPE(double, __half)
diff --git a/include/kernel_float/reduce.h b/include/kernel_float/reduce.h
@@ -23,7 +23,7 @@ struct reduce_recur_impl {
     template<typename F, typename T>
     KERNEL_FLOAT_INLINE static T call(F fun, const T* input) {
         vector_storage<T, K> temp;
-        apply_impl<F, N - K, T, T, T>::call(fun, temp.data(), input, input + K);
+        map_impl<F, N - K, T, T, T>::call(fun, temp.data(), input, input + K);
 
         if constexpr (N < 2 * K) {
 #pragma unroll
@@ -178,7 +178,7 @@ struct dot_impl {
     KERNEL_FLOAT_INLINE
     static T call(const T* left, const T* right) {
         vector_storage<T, N> intermediate;
-        detail::apply_impl<ops::multiply<T>, N, T, T, T>::call(
+        detail::map_impl<ops::multiply<T>, N, T, T, T>::call(
             ops::multiply<T>(),
             intermediate.data(),
             left,
diff --git a/include/kernel_float/triops.h b/include/kernel_float/triops.h
@@ -41,7 +41,7 @@ KERNEL_FLOAT_INLINE vector<T, E> where(const C& cond, const L& true_values, cons
     using F = ops::conditional<T>;
     vector_storage<T, extent_size<E>> result;
 
-    detail::apply_impl<F, extent_size<E>, T, bool, T, T>::call(
+    detail::map_impl<F, extent_size<E>, T, bool, T, T>::call(
         F {},
         result.data(),
         detail::convert_impl<vector_value_type<C>, vector_extent_type<C>, bool, E>::call(
@@ -126,7 +126,7 @@ KERNEL_FLOAT_INLINE vector<T, E> fma(const A& a, const B& b, const C& c) {
     using F = ops::fma<T>;
     vector_storage<T, extent_size<E>> result;
 
-    detail::apply_impl<F, extent_size<E>, T, T, T, T>::call(
+    detail::map_impl<F, extent_size<E>, T, T, T, T>::call(
         F {},
         result.data(),
         detail::convert_impl<vector_value_type<A>, vector_extent_type<A>, T, E>::call(
diff --git a/include/kernel_float/unops.h b/include/kernel_float/unops.h
@@ -214,12 +214,10 @@ KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(tan)
 
 #define KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_FUN(T, F, FAST_FUN)                       \
     namespace detail {                                                                \
-    template<size_t N>                                                                \
-    struct apply_fastmath_impl<ops::F<T>, N, T, T> {                                  \
+    template<>                                                                        \
+    struct apply_fastmath_impl<ops::F<T>, 1, T, T> {                                  \
         KERNEL_FLOAT_INLINE static void call(ops::F<T>, T* result, const T* inputs) { \
-            for (size_t i = 0; i < N; i++) {                                          \
-                result[i] = FAST_FUN(inputs[i]);                                      \
-            }                                                                         \
+            *result = FAST_FUN(*inputs);                                              \
         }                                                                             \
     };                                                                                \
     }
@@ -229,12 +227,10 @@ KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_FUN(float, log, __logf)
 
 #define KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(T, F, INSTR, REG)                         \
     namespace detail {                                                                    \
-    template<size_t N>                                                                    \
-    struct apply_fastmath_impl<ops::F<T>, N, T, T> {                                      \
+    template<>                                                                            \
+    struct apply_fastmath_impl<ops::F<T>, 1, T, T> {                                      \
         KERNEL_FLOAT_INLINE static void call(ops::F<T> fun, T* result, const T* inputs) { \
-            for (size_t i = 0; i < N; i++) {                                              \
-                asm(INSTR : "=" REG(result[i]) : REG(inputs[i]));                         \
-            }                                                                             \
+            asm(INSTR : "=" REG(*result) : REG(*inputs));                                 \
         }                                                                                 \
     };                                                                                    \
     }
diff --git a/single_include/kernel_float.h b/single_include/kernel_float.h