#include <assert.h>
#include <math.h>
#include <stdlib.h>

#include <iostream>

#include <cuda.h>
#include <cuda_fp16.h>
#include <mma.h>

9 |
| -using namespace nvcuda; |
| 7 | +#define N 32 |
| 8 | +#define M 32 |
| 9 | +#define K 32 |
10 | 10 |
|
| 11 | +using namespace nvcuda; |
11 | 12 |
|
12 |
| -struct timeval tv0, tv1; |
| 13 | +__global__ void foo(half *a, half *b, float *c) { |
| 14 | + int block_x = blockIdx.x / 2; |
| 15 | + int block_y = blockIdx.x % 2; |
13 | 16 |
|
14 |
| -void begin_roi() { |
15 |
| - gettimeofday(&tv0, nullptr); |
16 |
| -} |
| 17 | + wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag; |
| 18 | + wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag; |
| 19 | + wmma::fragment<wmma::accumulator, 16, 16, 16, float, void> c_frag; |
| 20 | + wmma::fill_fragment(c_frag, 0.0f); |
17 | 21 |
|
18 |
| -#define TV_TO_SEC(tv) (tv.tv_sec * 1000000 + tv.tv_usec) |
| 22 | + for (int k = 0; k < M; k += 16) { |
| 23 | + wmma::load_matrix_sync(a_frag, a + M * block_x + k, M); |
| 24 | + wmma::load_matrix_sync(b_frag, b + K * k + block_y * 16, K); |
| 25 | + wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); |
| 26 | + } |
19 | 27 |
|
20 |
| -void end_roi() { |
21 |
| - gettimeofday(&tv1, nullptr); |
22 |
| - std::cout << TV_TO_SEC(tv1) - TV_TO_SEC(tv0) << std::endl; |
| 28 | + wmma::store_matrix_sync(c + K * block_x * 16 + block_y * 16, c_frag, K, wmma::mem_row_major); |
23 | 29 | }
|
24 | 30 |
|
25 |
| -extern "C" __global__ void default_function_kernel0( half* __restrict__ a, half* __restrict__ b, float* __restrict__ c) { |
26 |
| - |
27 |
| - for (int x_outer_inner = 0; x_outer_inner < 4; ++x_outer_inner) { |
28 |
| - for (int y_outer_inner = 0; y_outer_inner < 4; ++y_outer_inner) { |
29 |
| - |
30 |
| - wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag; |
31 |
| - |
32 |
| - wmma::fill_fragment(c_frag, 0.0f); |
33 |
| - |
34 |
| - wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag; |
35 |
| - wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag; |
36 |
| - |
37 |
| - |
38 |
| - for (int rv_outer = 0; rv_outer < 256; ++rv_outer) { |
39 |
| - |
40 |
| - half *ptr_a = &a[((((((int)blockIdx.x) * 262144) + (x_outer_inner * 65536)) + (rv_outer * 16)))]; |
41 |
| - wmma::load_matrix_sync(a_frag, ptr_a, 4096); |
42 |
| - half *ptr_b = &b[((((((int)threadIdx.x) * 262144) + (y_outer_inner * 65536)) + (rv_outer * 16)))]; |
43 |
| - wmma::load_matrix_sync(b_frag, ptr_b, 4096); |
44 |
| - wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); |
45 |
| - |
46 |
| - } |
47 |
| - __syncthreads(); |
48 |
| - |
49 |
| - float *ptr_c = &c[((((((((int)blockIdx.x) * 262144) + (x_outer_inner * 65536))) + (((int)threadIdx.x) * 64)) + (y_outer_inner * 16)))]; |
50 |
| - wmma::store_matrix_sync(ptr_c, c_frag, 4096, wmma::mem_row_major); |
| 31 | +half a[N * M], b[M * K]; |
| 32 | +float c[N * K], ref[N * K]; |
51 | 33 |
|
| 34 | +template<typename T> |
| 35 | +void print(int n, int m, const T* a) { |
| 36 | + for (int i = 0; i < n; ++i) { |
| 37 | + for (int j = 0; j < m; ++j) { |
| 38 | + if (j) std::cout << " "; |
| 39 | + std::cout << a[i * m + j]; |
52 | 40 | }
|
| 41 | + std::cout << std::endl; |
53 | 42 | }
|
54 | 43 | }
|
55 | 44 |
|
56 |
| -int main() { |
57 |
| - |
58 |
| - half *a, *b; |
59 |
| - float *c; |
60 |
| - |
61 |
| - cudaMalloc(&a, 4096 * 4096 * (sizeof (half))); |
62 |
| - cudaMalloc(&b, 4096 * 4096 * (sizeof (half))); |
63 |
| - cudaMalloc(&c, 4096 * 4096 * (sizeof (float))); |
64 |
| - |
65 |
| - begin_roi(); |
66 |
| - for (int i = 0; i < 10; ++i) { |
67 |
| - default_function_kernel0<<<64, 64>>>(a, b, c); |
| 45 | +template<> |
| 46 | +void print(int n, int m, const half* a) { |
| 47 | + for (int i = 0; i < n; ++i) { |
| 48 | + for (int j = 0; j < m; ++j) { |
| 49 | + if (j) std::cout << " "; |
| 50 | + std::cout << __half2float(a[i * m + j]); |
| 51 | + } |
| 52 | + std::cout << std::endl; |
68 | 53 | }
|
69 |
| - assert(cudaDeviceSynchronize() == cudaSuccess); |
70 |
| - end_roi(); |
| 54 | +} |
71 | 55 |
|
| 56 | +int main() { |
| 57 | + cudaDeviceProp prop; |
| 58 | + assert(cudaSuccess == cudaGetDeviceProperties(&prop, 0)); |
| 59 | + std::cout << "Warp size is: " << prop.warpSize << std::endl; |
| 60 | + |
| 61 | + for (int i = 0; i < N * M; ++i) |
| 62 | + a[i] = __float2half((float )rand() / RAND_MAX * 0.5); |
| 63 | + for (int i = 0; i < M * K; ++i) |
| 64 | + b[i] = __float2half((float) rand() / RAND_MAX * 0.5); |
| 65 | + for (int i = 0; i < N * K; ++i) |
| 66 | + c[i] = 0; |
| 67 | + for (int i = 0; i < N; ++i) |
| 68 | + for (int j = 0; j < K; ++j) { |
| 69 | + ref[i * K + j] = 0.0; |
| 70 | + for (int k = 0; k < M; ++k) |
| 71 | + ref[i * K + j] += __half2float(a[i * M + k]) * __half2float(b[k * K + j]); |
| 72 | + } |
| 73 | + half *dev_a, *dev_b; |
| 74 | + float *dev_c; |
| 75 | + cudaMalloc(&dev_a, N * M * sizeof(half)); |
| 76 | + cudaMalloc(&dev_b, M * K * sizeof(half)); |
| 77 | + cudaMalloc(&dev_c, N * K * sizeof(float)); |
| 78 | + cudaMemcpy(dev_a, a, sizeof a, cudaMemcpyHostToDevice); |
| 79 | + cudaMemcpy(dev_b, b, sizeof b, cudaMemcpyHostToDevice); |
| 80 | + cudaMemcpy(dev_c, c, sizeof c, cudaMemcpyHostToDevice); |
| 81 | + foo<<<4, 32>>>(dev_a, dev_b, dev_c); |
| 82 | + cudaDeviceSynchronize(); |
| 83 | + cudaMemcpy(c, dev_c, sizeof c, cudaMemcpyDeviceToHost); |
| 84 | + std::cout.precision(1); |
| 85 | + std::cout << std::fixed; |
| 86 | + //print(N, M, a); |
| 87 | + print(N, K, c); |
72 | 88 | return 0;
|
73 | 89 | }