Skip to content

Commit c2fc842

Browse files
working on pingpong matmul
1 parent 849dc2e commit c2fc842

14 files changed

+1865
-571
lines changed

examples/atom/fast-exp-fp16.cu

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#include <cstdio>

#include "common.h"
2+
3+
// Approximate base-2 exponential of two fp16 values packed into one 32-bit
// word, computed with a single PTX `ex2.approx.f16x2` instruction.
//
//   x       : two fp16 values packed into one 32-bit register
//   returns : the two fp16 results, packed the same way
//
// NOTE(review): the PTX ISA lists ex2.approx.f16x2 as an sm_75+ instruction;
// confirm the build targets a suitable architecture.
DEVICE unsigned exp2(unsigned x) {
  unsigned packed_result;
  asm volatile("ex2.approx.f16x2 %0, %1;"
               : "=r"(packed_result)
               : "r"(x));
  return packed_result;
}
8+
9+
// Approximate base-2 exponential of a single fp16 value, computed with the PTX
// `ex2.approx.f16` instruction.
//
// Inline PTX has no operand constraint that accepts a `half` object directly,
// and "r" names a 32-bit integer register, which is the wrong width for an
// fp16 operand. The value is therefore moved through 16-bit registers (the "h"
// constraint) as its raw bit pattern.
//
// The result is returned as `half` so that it can be stored straight into a
// half buffer; returning an integer type would numerically convert (truncate)
// the value instead of preserving it.
//
//   x       : fp16 exponent
//   returns : approximately 2^x as fp16
//
// NOTE(review): the PTX ISA lists ex2.approx.f16 as an sm_75+ instruction;
// confirm the build targets a suitable architecture.
DEVICE half exp2(half x) {
  // Reinterpret the fp16 bits as a 16-bit integer; no numeric conversion.
  const unsigned short in_bits = *reinterpret_cast<const unsigned short*>(&x);
  unsigned short out_bits;
  asm volatile("ex2.approx.f16 %0, %1;" : "=h"(out_bits) : "h"(in_bits));
  return *reinterpret_cast<const half*>(&out_bits);
}
15+
16+
// Element-wise approximate base-2 exponential over N fp16 values:
// B[i] = 2^A[i] for 0 <= i < N.
//
// Elements are processed two at a time as packed 32-bit words through
// exp2(unsigned). The loop is grid-stride over the packed words, so any 1D
// launch configuration (only the .x dimensions are used) covers the whole
// array. If N is odd, the final element is handled separately so that no
// access goes past index N - 1.
//
// Preconditions: A and B each hold at least N elements and are 4-byte aligned,
// because they are accessed through `unsigned*`.
__global__ void fast_exp(half* A, half* B, int N) {
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;
  const int num_pairs = N / 2;  // number of complete fp16 pairs

  auto in = reinterpret_cast<const unsigned*>(A);
  auto out = reinterpret_cast<unsigned*>(B);

  // Each iteration converts one packed pair, i.e. elements 2*p and 2*p + 1.
  for (int p = tid; p < num_pairs; p += stride) {
    out[p] = exp2(in[p]);
  }

  // Odd N: one element is left over. Run it through the packed path with the
  // upper lane zeroed and store only the lower 16 bits of the result, so the
  // slot at index N (which does not belong to the array) is never touched.
  if (tid == 0 && N > 0 && (N % 2) != 0) {
    const int last = N - 1;
    const unsigned packed = *reinterpret_cast<const unsigned short*>(A + last);
    const unsigned result = exp2(packed);
    *reinterpret_cast<unsigned short*>(B + last) =
        static_cast<unsigned short>(result & 0xffffu);
  }
}
26+
27+
// Driver: fills a 100-element fp16 input with random values, runs fast_exp on
// it and copies the result back.
//
// Returns 0 on success and 1 if the kernel launch or its execution reported a
// CUDA error.
//
// NOTE(review): the buffers are never released here; whether the alloc_*
// helpers from common.h return owning handles cannot be seen from this file.
int main() {
  auto len = 100;

  // Input (A) and output (B) buffers obtained from the CPU-side helper.
  auto A = alloc_cpu_tensor<half>({len});
  random_fill(A, {len});
  auto B = alloc_cpu_tensor<half>({len});

  // Matching buffers obtained from the GPU-side helper.
  auto dA = alloc_gpu_tensor<half>({len});
  auto dB = alloc_gpu_tensor<half>({len});
  copy_to_gpu(A, dA, {len});

  dim3 block(256);
  dim3 grid(ceil_div(len, block.x));
  fast_exp<<<grid, block>>>(dA, dB, len);

  // A kernel launch does not return a status: configuration errors surface
  // through cudaGetLastError(), and faults inside the kernel surface at the
  // next synchronizing call.
  cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess) {
    status = cudaDeviceSynchronize();
  }
  if (status != cudaSuccess) {
    fprintf(stderr, "fast_exp failed: %s\n", cudaGetErrorString(status));
    return 1;
  }

  copy_to_cpu(B, dB, {len});
  return 0;
}

0 commit comments

Comments
 (0)