Commit 9760b9b: Merge branch 'dev'
Parents: f40ff58 + d6b7199

25 files changed, +3200 -1369 lines

docs/api.rst (+2)

```diff
@@ -8,6 +8,8 @@ API Reference
     api/binary_operators.rst
     api/reductions.rst
     api/mathematical.rst
+    api/fast_math.rst
     api/conditional.rst
     api/memory_read_write.rst
+    api/utilities.rst
```

docs/build_api.py (+23 -6)

```diff
@@ -27,7 +27,10 @@ def build_doxygen_page(name, items):
     content += "-" * len(title) + "\n"

     for symbol in symbols:
-        content += f".. doxygen{directive}:: kernel_float::{symbol}\n\n"
+        if directive == "define":
+            content += f".. doxygendefine:: {symbol}\n\n"
+        else:
+            content += f".. doxygen{directive}:: kernel_float::{symbol}\n\n"

     stripped_name = name.lower().replace(" ", "_").replace("/", "_")
     filename = f"api/{stripped_name}.rst"
@@ -90,7 +93,8 @@ def build_index_page(groups):
         "for_each",
     ],
     "Generation": [
-        "range",
+        ("range", "range()"),
+        ("range", "range(F fun)"),
         "range_like",
         "each_index",
         "fill",
@@ -193,6 +197,14 @@ def build_index_page(groups):
         "isinf",
         "isnan",
     ],
+    "Fast math": [
+        "fast_exp",
+        "fast_log",
+        "fast_cos",
+        "fast_sin",
+        "fast_tan",
+        "fast_div",
+    ],
     "Conditional": [
         ("where", "where(const C&, const L&, const R&)"),
         ("where", "where(const C&, const L&)"),
@@ -202,13 +214,18 @@ def build_index_page(groups):
         "cast_to",
         ("load", "load(const T*, const I&)"),
         ("load", "load(const T*, const I&, const M&)"),
-        ("loadn", "loadn(const T*, ptrdiff_t)"),
-        ("loadn", "loadn(const T*, ptrdiff_t, ptrdiff_t)"),
+        ("loadn", "loadn(const T*, size_t)"),
+        ("loadn", "loadn(const T*, size_t, size_t)"),
         ("store", "store(const V&, T *ptr, const I&)"),
         ("store", "store(const V&, T *ptr, const I&, const M&)"),
-        ("storen", "storen(const V&, T*, ptrdiff_t)"),
-        ("storen", "storen(const V&, T*, ptrdiff_t, ptrdiff_t)"),
+        ("storen", "storen(const V&, T*, size_t)"),
+        ("storen", "storen(const V&, T*, size_t, size_t)"),
         ("aligned_ptr", "aligned_ptr", "struct"),
+    ],
+    "Utilities": [
+        ("constant", "constant", "struct"),
+        ("tiling", "tiling", "struct"),
+        ("KERNEL_FLOAT_TILING_FOR", "KERNEL_FLOAT_TILING_FOR", "define"),
     ]
 }
```
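The new "Fast math" group documents approximate variants of the elementwise math functions. As a rough illustration of how they slot in, here is a minimal sketch that assumes these functions share the elementwise `vec` signature of `kf::sin` and friends; the `fast_sinc` helper is hypothetical, not part of the library:

```cpp
#include "kernel_float.h"
namespace kf = kernel_float;

// Hypothetical helper: sin(x)/x using the approximate fast-math variants.
// fast_sin and fast_div trade accuracy for speed, typically by mapping to
// hardware intrinsics; the signatures here are assumed to mirror kf::sin.
__device__ kf::vec<float, 4> fast_sinc(kf::vec<float, 4> x) {
    return kf::fast_div(kf::fast_sin(x), x);
}
```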

docs/guides.rst (+1)

```diff
@@ -6,3 +6,4 @@ Guides
     guides/introduction.rst
     guides/promotion.rst
     guides/prelude.rst
+    guides/constant.rst
```

docs/guides/constant.md (+37, new file)

````diff
@@ -0,0 +1,37 @@
+Using `kernel_float::constant`
+===
+
+When working with mixed-precision types, you will find that constants present a bit of a challenge.
+
+For example, a simple expression such as `3.14 * x`, where `x` is of type `vec<float, 2>`, will NOT be performed
+in `float` precision as you might expect, but instead in `double` precision.
+This happens because the left-hand side of this expression
+(a constant) is a `double`, so `kernel_float` will also cast the right-hand side to `double`.
+
+To solve this problem, `kernel_float` offers a type called `constant<T>` that can be used to represent
+constants. Any binary operation between a value of type `U` and a `constant<T>` will result in both
+operands being cast to type `U`, with the operation performed in the precision of type `U`. This makes
+`constant<T>` useful for representing constants in your code.
+
+For example, consider the following code:
+
+```
+#include "kernel_float.h"
+namespace kf = kernel_float;
+
+int main() {
+    using Type = float;
+    const int N = 8;
+    static constexpr auto PI = kf::make_constant(3.14);
+
+    kf::vec<int, N> i = kf::range<int, N>();
+    kf::vec<Type, N> x = kf::cast<Type>(i) * PI;
+    kf::vec<Type, N> y = x * kf::sin(x);
+    Type result = kf::sum(y);
+    printf("result=%f", double(result));
+
+    return EXIT_SUCCESS;
+}
+```
+
+This code example uses the `make_constant` utility function to create a `constant<T>`.
````
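To make the promotion rule from the new guide concrete, here is a minimal sketch of the difference, assuming the `kf::vec` and `kf::make_constant` APIs shown above; the element types in the comments follow the promotion behavior the guide describes:

```cpp
#include "kernel_float.h"
namespace kf = kernel_float;

__device__ void promotion_demo(kf::vec<float, 2> x) {
    // A bare double literal promotes the whole expression to double.
    auto in_double = 3.14 * x;                    // elements computed as double

    // constant<double> adopts the other operand's type instead,
    // so the multiply stays in float precision.
    auto in_float = kf::make_constant(3.14) * x;  // elements computed as float
}
```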

examples/CMakeLists.txt (+1)

```diff
@@ -1 +1,2 @@
 add_subdirectory(vector_add)
+add_subdirectory(vector_add_tiling)
```

examples/vector_add/main.cu (+11 -5)

```diff
@@ -13,11 +13,13 @@ void cuda_check(cudaError_t code) {
 }

 template<int N>
-__global__ void my_kernel(int length, const khalf<N>* input, double constant, kfloat<N>* output) {
+__global__ void my_kernel(int length, const __half* input, double constant, float* output) {
     int i = blockIdx.x * blockDim.x + threadIdx.x;

     if (i * N < length) {
-        output[i] = kf::cast<float>((input[i] * input[i]) * constant);
+        auto a = kf::read_aligned<N>(input + i * N);
+        auto b = kf::fma(a, a, kf::cast<__half>(constant));
+        kf::write_aligned<N>(output + i * N, b);
     }
 }
@@ -35,8 +37,8 @@ void run_kernel(int n) {
     }

     // Allocate device memory
-    khalf<items_per_thread>* input_dev;
-    kfloat<items_per_thread>* output_dev;
+    __half* input_dev;
+    float* output_dev;
     cuda_check(cudaMalloc(&input_dev, sizeof(half) * n));
     cuda_check(cudaMalloc(&output_dev, sizeof(float) * n));

@@ -47,7 +49,11 @@ void run_kernel(int n) {
     int block_size = 256;
     int items_per_block = block_size * items_per_thread;
     int grid_size = (n + items_per_block - 1) / items_per_block;
-    my_kernel<items_per_thread><<<grid_size, block_size>>>(n, input_dev, constant, output_dev);
+    my_kernel<items_per_thread><<<grid_size, block_size>>>(
+        n,
+        kf::aligned_ptr(input_dev),
+        constant,
+        kf::aligned_ptr(output_dev));

     // Copy results back
     cuda_check(cudaMemcpy(output_result.data(), output_dev, sizeof(float) * n, cudaMemcpyDefault));
```

Note: the "Copy results back" call originally passed the device pointer as the destination and the host buffer as the source; the arguments are shown here in the corrected device-to-host order.
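The updated kernel replaces the `khalf<N>`/`kfloat<N>` vector parameters with plain pointers plus explicit aligned accesses, and it relies on each thread's offset being a multiple of `N`. A small sketch of that contract; the alignment requirement is inferred from the function names, and `copy_chunk` is an illustrative helper, not library code:

```cpp
#include <cuda_fp16.h>
#include "kernel_float.h"
namespace kf = kernel_float;

// Illustrative helper showing why the kernel indexes by i * N:
// read_aligned<N>/write_aligned<N> perform one N-wide vector access,
// which presumes the base pointer is suitably aligned (cudaMalloc
// allocations are) and the element offset is a multiple of N.
template<int N>
__device__ void copy_chunk(const __half* src, float* dst, int i) {
    kf::vec<__half, N> v = kf::read_aligned<N>(src + i * N);
    kf::write_aligned<N>(dst + i * N, kf::cast<float>(v));
}
```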
examples/vector_add_tiling/CMakeLists.txt (+12, new file)

```diff
@@ -0,0 +1,12 @@
+cmake_minimum_required(VERSION 3.17)
+
+set (PROJECT_NAME kernel_float_vecadd_tiling)
+project(${PROJECT_NAME} LANGUAGES CXX CUDA)
+set (CMAKE_CXX_STANDARD 17)
+
+add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
+target_link_libraries(${PROJECT_NAME} kernel_float)
+set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
+
+find_package(CUDA REQUIRED)
+target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
```

examples/vector_add_tiling/main.cu (+97, new file)

```diff
@@ -0,0 +1,97 @@
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+
+#include "kernel_float.h"
+#include "kernel_float/tiling.h"
+using namespace kernel_float::prelude;
+
+void cuda_check(cudaError_t code) {
+    if (code != cudaSuccess) {
+        throw std::runtime_error(std::string("CUDA error: ") + cudaGetErrorString(code));
+    }
+}
+
+template<int N, int B>
+__global__ void my_kernel(
+    int length,
+    kf::aligned_ptr<const __half> input,
+    double constant,
+    kf::aligned_ptr<float> output) {
+    auto tiling = kf::tiling<
+        kf::tile_factor<N>,
+        kf::block_size<B>,
+        kf::distributions<kf::dist::block_cyclic<2>>>();
+
+    auto points = int(blockIdx.x * tiling.tile_size(0)) + tiling.local_points(0);
+    auto mask = tiling.local_mask();
+
+    auto a = input.read(points, mask);
+    auto b = (a * a) * constant;
+    output.write(points, b, mask);
+}
+
+template<int items_per_thread, int block_size = 256>
+void run_kernel(int n) {
+    double constant = 1.0;
+    std::vector<half> input(n);
+    std::vector<float> output_expected(n);
+    std::vector<float> output_result(n);
+
+    // Generate input data
+    for (int i = 0; i < n; i++) {
+        input[i] = half(i);
+        output_expected[i] = float(i) * float(i) * float(constant);
+    }
+
+    // Allocate device memory
+    __half* input_dev;
+    float* output_dev;
+    cuda_check(cudaMalloc(&input_dev, sizeof(__half) * n));
+    cuda_check(cudaMalloc(&output_dev, sizeof(float) * n));
+
+    // Copy input to the device
+    cuda_check(cudaMemcpy(input_dev, input.data(), sizeof(half) * n, cudaMemcpyDefault));
+
+    // Launch kernel!
+    int items_per_block = block_size * items_per_thread;
+    int grid_size = (n + items_per_block - 1) / items_per_block;
+    my_kernel<items_per_thread, block_size><<<grid_size, block_size>>>(
+        n,
+        kf::aligned_ptr(input_dev),
+        constant,
+        kf::aligned_ptr(output_dev));
+
+    // Copy results back
+    cuda_check(cudaMemcpy(output_result.data(), output_dev, sizeof(float) * n, cudaMemcpyDefault));
+
+    // Check results
+    for (int i = 0; i < n; i++) {
+        float result = output_result[i];
+        float answer = output_expected[i];
+
+        if (result != answer) {
+            std::stringstream msg;
+            msg << "error: index " << i << " is incorrect: " << result << " != " << answer;
+            throw std::runtime_error(msg.str());
+        }
+    }
+
+    cuda_check(cudaFree(input_dev));
+    cuda_check(cudaFree(output_dev));
+}
+
+int main() {
+    int n = 84000; // divisible by 1, 2, 3, 4, 5, 6, 7, 8
+    cuda_check(cudaSetDevice(0));
+
+    run_kernel<1>(n);
+    run_kernel<2>(n);
+    run_kernel<3>(n);
+    run_kernel<4>(n);
+    run_kernel<8>(n);
+
+    std::cout << "result correct\n";
+    return EXIT_SUCCESS;
+}
```

Note: the host buffers are shown sized to `n` (they were default-constructed empty but indexed, which is undefined behavior), the expected value is shown as `(i * i) * constant` to match what the kernel actually computes, and the "Copy results back" call is shown with the host buffer as the destination.
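For contrast, the index arithmetic that `kf::tiling` encapsulates would look roughly like the following when written by hand. This is a plain strided loop and a sketch only: the `kf::dist::block_cyclic<2>` distribution the example selects interleaves points differently, and that layout detail is exactly what the tiling abstraction hides behind `local_points` and `local_mask`:

```cpp
#include <cuda_fp16.h>

template<int N, int B>
__global__ void my_kernel_manual(int length, const __half* input, double constant, float* output) {
    // Each block covers B * N points; each thread handles N of them.
    int base = blockIdx.x * B * N;
    for (int k = 0; k < N; k++) {
        int idx = base + k * B + threadIdx.x;  // one possible point layout
        if (idx < length) {                    // per-point bounds mask
            float a = __half2float(input[idx]);
            output[idx] = float((a * a) * constant);
        }
    }
}
```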
