Commit 9760b9b: Merge branch 'dev'
Parents: f40ff58 + d6b7199

25 files changed, +3200 -1369 lines

docs/api.rst (+2)

```diff
@@ -8,6 +8,8 @@ API Reference
     api/binary_operators.rst
     api/reductions.rst
     api/mathematical.rst
+    api/fast_math.rst
     api/conditional.rst
     api/memory_read_write.rst
+    api/utilities.rst
```

docs/build_api.py (+23 -6)

```diff
@@ -27,7 +27,10 @@ def build_doxygen_page(name, items):
     content += "-" * len(title) + "\n"

     for symbol in symbols:
-        content += f".. doxygen{directive}:: kernel_float::{symbol}\n\n"
+        if directive == "define":
+            content += f".. doxygendefine:: {symbol}\n\n"
+        else:
+            content += f".. doxygen{directive}:: kernel_float::{symbol}\n\n"

     stripped_name = name.lower().replace(" ", "_").replace("/", "_")
     filename = f"api/{stripped_name}.rst"
@@ -90,7 +93,8 @@ def build_index_page(groups):
         "for_each",
     ],
     "Generation": [
-        "range",
+        ("range", "range()"),
+        ("range", "range(F fun)"),
         "range_like",
         "each_index",
         "fill",
@@ -193,6 +197,14 @@ def build_index_page(groups):
         "isinf",
         "isnan",
     ],
+    "Fast math": [
+        "fast_exp",
+        "fast_log",
+        "fast_cos",
+        "fast_sin",
+        "fast_tan",
+        "fast_div",
+    ],
     "Conditional": [
         ("where", "where(const C&, const L&, const R&)"),
         ("where", "where(const C&, const L&)"),
@@ -202,13 +214,18 @@ def build_index_page(groups):
         "cast_to",
         ("load", "load(const T*, const I&)"),
         ("load", "load(const T*, const I&, const M&)"),
-        ("loadn", "loadn(const T*, ptrdiff_t)"),
-        ("loadn", "loadn(const T*, ptrdiff_t, ptrdiff_t)"),
+        ("loadn", "loadn(const T*, size_t)"),
+        ("loadn", "loadn(const T*, size_t, size_t)"),
         ("store", "store(const V&, T *ptr, const I&)"),
         ("store", "store(const V&, T *ptr, const I&, const M&)"),
-        ("storen", "storen(const V&, T*, ptrdiff_t)"),
-        ("storen", "storen(const V&, T*, ptrdiff_t, ptrdiff_t)"),
+        ("storen", "storen(const V&, T*, size_t)"),
+        ("storen", "storen(const V&, T*, size_t, size_t)"),
         ("aligned_ptr", "aligned_ptr", "struct"),
+    ],
+    "Utilities": [
+        ("constant", "constant", "struct"),
+        ("tiling", "tiling", "struct"),
+        ("KERNEL_FLOAT_TILING_FOR", "KERNEL_FLOAT_TILING_FOR", "define"),
     ]
 }
```
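The new "Fast math" group documents approximate variants of the elementwise math functions. As a rough illustration of how they slot in, here is a minimal sketch that assumes these functions share the elementwise `vec` signature of `kf::sin` and friends; the `fast_sinc` helper is hypothetical, not part of the library:

```cpp
#include "kernel_float.h"
namespace kf = kernel_float;

// Hypothetical helper: sin(x)/x using the approximate fast-math variants.
// fast_sin and fast_div trade accuracy for speed, typically by mapping to
// hardware intrinsics; the signatures here are assumed to mirror kf::sin.
__device__ kf::vec<float, 4> fast_sinc(kf::vec<float, 4> x) {
    return kf::fast_div(kf::fast_sin(x), x);
}
```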

docs/guides.rst (+1)

```diff
@@ -6,3 +6,4 @@ Guides
     guides/introduction.rst
     guides/promotion.rst
     guides/prelude.rst
+    guides/constant.rst
```

docs/guides/constant.md (+37, new file)

````diff
@@ -0,0 +1,37 @@
+Using `kernel_float::constant`
+===
+
+When working with mixed-precision types, you will find that constants present a bit of a challenge.
+
+For example, a simple expression such as `3.14 * x`, where `x` is of type `vec<float, 2>`, will NOT be performed
+in `float` precision as you might expect, but instead in `double` precision.
+This happens because the left-hand side of this expression
+(a constant) is a `double`, so `kernel_float` will also cast the right-hand side to `double`.
+
+To solve this problem, `kernel_float` offers a type called `constant<T>` that can be used to represent
+constants. Any binary operation between a value of type `U` and a `constant<T>` will result in both
+operands being cast to type `U`, with the operation performed in the precision of type `U`. This makes
+`constant<T>` useful for representing constants in your code.
+
+For example, consider the following code:
+
+```
+#include "kernel_float.h"
+namespace kf = kernel_float;
+
+int main() {
+    using Type = float;
+    const int N = 8;
+    static constexpr auto PI = kf::make_constant(3.14);
+
+    kf::vec<int, N> i = kf::range<int, N>();
+    kf::vec<Type, N> x = kf::cast<Type>(i) * PI;
+    kf::vec<Type, N> y = x * kf::sin(x);
+    Type result = kf::sum(y);
+    printf("result=%f", double(result));
+
+    return EXIT_SUCCESS;
+}
+```
+
+This code example uses the `make_constant` utility function to create a `constant<T>`.
````
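To make the promotion rule from the new guide concrete, here is a minimal sketch of the difference, assuming the `kf::vec` and `kf::make_constant` APIs shown above; the element types in the comments follow the promotion behavior the guide describes:

```cpp
#include "kernel_float.h"
namespace kf = kernel_float;

__device__ void promotion_demo(kf::vec<float, 2> x) {
    // A bare double literal promotes the whole expression to double.
    auto in_double = 3.14 * x;                    // elements computed as double

    // constant<double> adopts the other operand's type instead,
    // so the multiply stays in float precision.
    auto in_float = kf::make_constant(3.14) * x;  // elements computed as float
}
```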

examples/CMakeLists.txt (+1)

```diff
@@ -1 +1,2 @@
 add_subdirectory(vector_add)
+add_subdirectory(vector_add_tiling)
```

examples/vector_add/main.cu (+11 -5)

```diff
@@ -13,11 +13,13 @@ void cuda_check(cudaError_t code) {
 }

 template<int N>
-__global__ void my_kernel(int length, const khalf<N>* input, double constant, kfloat<N>* output) {
+__global__ void my_kernel(int length, const __half* input, double constant, float* output) {
     int i = blockIdx.x * blockDim.x + threadIdx.x;

     if (i * N < length) {
-        output[i] = kf::cast<float>((input[i] * input[i]) * constant);
+        auto a = kf::read_aligned<N>(input + i * N);
+        auto b = kf::fma(a, a, kf::cast<__half>(constant));
+        kf::write_aligned<N>(output + i * N, b);
     }
 }
@@ -35,8 +37,8 @@ void run_kernel(int n) {
     }

     // Allocate device memory
-    khalf<items_per_thread>* input_dev;
-    kfloat<items_per_thread>* output_dev;
+    __half* input_dev;
+    float* output_dev;
     cuda_check(cudaMalloc(&input_dev, sizeof(half) * n));
     cuda_check(cudaMalloc(&output_dev, sizeof(float) * n));

@@ -47,7 +49,11 @@ void run_kernel(int n) {
     int block_size = 256;
     int items_per_block = block_size * items_per_thread;
     int grid_size = (n + items_per_block - 1) / items_per_block;
-    my_kernel<items_per_thread><<<grid_size, block_size>>>(n, input_dev, constant, output_dev);
+    my_kernel<items_per_thread><<<grid_size, block_size>>>(
+        n,
+        kf::aligned_ptr(input_dev),
+        constant,
+        kf::aligned_ptr(output_dev));

     // Copy results back
     cuda_check(cudaMemcpy(output_result.data(), output_dev, sizeof(float) * n, cudaMemcpyDefault));
```

Note: the "Copy results back" call originally passed the device pointer as the destination and the host buffer as the source; the arguments are shown here in the corrected device-to-host order.
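The updated kernel replaces the `khalf<N>`/`kfloat<N>` vector parameters with plain pointers plus explicit aligned accesses, and it relies on each thread's offset being a multiple of `N`. A small sketch of that contract; the alignment requirement is inferred from the function names, and `copy_chunk` is an illustrative helper, not library code:

```cpp
#include <cuda_fp16.h>
#include "kernel_float.h"
namespace kf = kernel_float;

// Illustrative helper showing why the kernel indexes by i * N:
// read_aligned<N>/write_aligned<N> perform one N-wide vector access,
// which presumes the base pointer is suitably aligned (cudaMalloc
// allocations are) and the element offset is a multiple of N.
template<int N>
__device__ void copy_chunk(const __half* src, float* dst, int i) {
    kf::vec<__half, N> v = kf::read_aligned<N>(src + i * N);
    kf::write_aligned<N>(dst + i * N, kf::cast<float>(v));
}
```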
examples/vector_add_tiling/CMakeLists.txt (+12, new file)

```diff
@@ -0,0 +1,12 @@
+cmake_minimum_required(VERSION 3.17)
+
+set (PROJECT_NAME kernel_float_vecadd_tiling)
+project(${PROJECT_NAME} LANGUAGES CXX CUDA)
+set (CMAKE_CXX_STANDARD 17)
+
+add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
+target_link_libraries(${PROJECT_NAME} kernel_float)
+set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
+
+find_package(CUDA REQUIRED)
+target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
```

examples/vector_add_tiling/main.cu (+97, new file)

```diff
@@ -0,0 +1,97 @@
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+
+#include "kernel_float.h"
+#include "kernel_float/tiling.h"
+using namespace kernel_float::prelude;
+
+void cuda_check(cudaError_t code) {
+    if (code != cudaSuccess) {
+        throw std::runtime_error(std::string("CUDA error: ") + cudaGetErrorString(code));
+    }
+}
+
+template<int N, int B>
+__global__ void my_kernel(
+    int length,
+    kf::aligned_ptr<const __half> input,
+    double constant,
+    kf::aligned_ptr<float> output) {
+    auto tiling = kf::tiling<
+        kf::tile_factor<N>,
+        kf::block_size<B>,
+        kf::distributions<kf::dist::block_cyclic<2>>>();
+
+    auto points = int(blockIdx.x * tiling.tile_size(0)) + tiling.local_points(0);
+    auto mask = tiling.local_mask();
+
+    auto a = input.read(points, mask);
+    auto b = (a * a) * constant;
+    output.write(points, b, mask);
+}
+
+template<int items_per_thread, int block_size = 256>
+void run_kernel(int n) {
+    double constant = 1.0;
+    std::vector<half> input(n);
+    std::vector<float> output_expected(n);
+    std::vector<float> output_result(n);
+
+    // Generate input data
+    for (int i = 0; i < n; i++) {
+        input[i] = half(i);
+        output_expected[i] = float(i) * float(i) * float(constant);
+    }
+
+    // Allocate device memory
+    __half* input_dev;
+    float* output_dev;
+    cuda_check(cudaMalloc(&input_dev, sizeof(__half) * n));
+    cuda_check(cudaMalloc(&output_dev, sizeof(float) * n));
+
+    // Copy input to the device
+    cuda_check(cudaMemcpy(input_dev, input.data(), sizeof(half) * n, cudaMemcpyDefault));
+
+    // Launch kernel!
+    int items_per_block = block_size * items_per_thread;
+    int grid_size = (n + items_per_block - 1) / items_per_block;
+    my_kernel<items_per_thread, block_size><<<grid_size, block_size>>>(
+        n,
+        kf::aligned_ptr(input_dev),
+        constant,
+        kf::aligned_ptr(output_dev));
+
+    // Copy results back
+    cuda_check(cudaMemcpy(output_result.data(), output_dev, sizeof(float) * n, cudaMemcpyDefault));
+
+    // Check results
+    for (int i = 0; i < n; i++) {
+        float result = output_result[i];
+        float answer = output_expected[i];
+
+        if (result != answer) {
+            std::stringstream msg;
+            msg << "error: index " << i << " is incorrect: " << result << " != " << answer;
+            throw std::runtime_error(msg.str());
+        }
+    }
+
+    cuda_check(cudaFree(input_dev));
+    cuda_check(cudaFree(output_dev));
+}
+
+int main() {
+    int n = 84000; // divisible by 1, 2, 3, 4, 5, 6, 7, 8
+    cuda_check(cudaSetDevice(0));
+
+    run_kernel<1>(n);
+    run_kernel<2>(n);
+    run_kernel<3>(n);
+    run_kernel<4>(n);
+    run_kernel<8>(n);
+
+    std::cout << "result correct\n";
+    return EXIT_SUCCESS;
+}
```

Note: the host buffers are shown sized to `n` (they were default-constructed empty but indexed, which is undefined behavior), the expected value is shown as `(i * i) * constant` to match what the kernel actually computes, and the "Copy results back" call is shown with the host buffer as the destination.
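For contrast, the index arithmetic that `kf::tiling` encapsulates would look roughly like the following when written by hand. This is a plain strided loop and a sketch only: the `kf::dist::block_cyclic<2>` distribution the example selects interleaves points differently, and that layout detail is exactly what the tiling abstraction hides behind `local_points` and `local_mask`:

```cpp
#include <cuda_fp16.h>

template<int N, int B>
__global__ void my_kernel_manual(int length, const __half* input, double constant, float* output) {
    // Each block covers B * N points; each thread handles N of them.
    int base = blockIdx.x * B * N;
    for (int k = 0; k < N; k++) {
        int idx = base + k * B + threadIdx.x;  // one possible point layout
        if (idx < length) {                    // per-point bounds mask
            float a = __half2float(input[idx]);
            output[idx] = float((a * a) * constant);
        }
    }
}
```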
