Skip to content

Commit 18366a5

Browse files
update
1 parent a7528fa commit 18366a5

13 files changed

+896
-57
lines changed

examples/atom/.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
test_*
1+
test_*
2+
*nsys-rep

examples/atom/ldgmem_ldsmem_v0.cu

+14-12
Original file line numberDiff line numberDiff line change
@@ -33,15 +33,17 @@ __global__ void naive_matrix_ldsm(DType* source, int M, int N, DType* dummy_out)
3333
ld_smem[sm_idx / VEC_LEN] = ld_source[idx / VEC_LEN];
3434
}
3535
__syncthreads();
36-
for (int row_repeat = 0; row_repeat < ROW_REPEAT; ++row_repeat) {
37-
int m = mo + row_repeat * THREAD_M + mi;
38-
int n = no + ni * VEC_LEN;
39-
int idx = m * N + n;
40-
int sm = row_repeat * THREAD_M + mi;
41-
int sn = ni * VEC_LEN;
42-
int sm_idx = sm * BLOCKN + sn;
43-
for (int i = 0; i < VEC_LEN; ++i) {
44-
dummy_out[idx + i] = smem[sm_idx + i] + DType(1);
36+
for (int x = 0; x < 256; ++x) {
37+
for (int row_repeat = 0; row_repeat < ROW_REPEAT; ++row_repeat) {
38+
int m = mo + row_repeat * THREAD_M + mi;
39+
int n = no + ni * VEC_LEN;
40+
int idx = m * N + n;
41+
int sm = row_repeat * THREAD_M + mi;
42+
int sn = ni * VEC_LEN;
43+
int sm_idx = sm * BLOCKN + sn;
44+
for (int i = 0; i < VEC_LEN; ++i) {
45+
dummy_out[idx + i] = smem[sm_idx + i] + DType(1);
46+
}
4547
}
4648
}
4749
}
@@ -75,15 +77,15 @@ int main(int argc, char** argv) {
7577

7678
auto dA = alloc_gpu_tensor<DType>(shape);
7779
auto dB = alloc_gpu_tensor<DType>(shape);
80+
dim3 block(NUM_THREADS);
81+
dim3 grid(ceil_div(M, BLOCKM));
7882
gpu_timer.sync_all();
7983
gpu_timer.tick();
8084
copy_to_gpu_async(A, dA, shape);
81-
dim3 block(NUM_THREADS);
82-
dim3 grid(ceil_div(M, BLOCKM));
8385
naive_matrix_ldsm<DType, BLOCKM, BLOCKN, NUM_THREADS><<<grid, block>>>(dA, M, N, dB);
84-
copy_to_cpu_async(B, dB, shape);
8586
gpu_timer.tick();
8687
gpu_timer.sync_all();
88+
copy_to_cpu_async(B, dB, shape);
8789
std::cout << "GPU naive done! Use " << gpu_timer.report_last_ms() << " ms.\n";
8890

8991
std::cout << "Calculating golden...\n";

examples/atom/ldgmem_ldsmem_v1.cu

+11-9
Original file line numberDiff line numberDiff line change
@@ -35,15 +35,17 @@ __global__ void split_matrix_ldsm(DType* source, int M, int N, DType* dummy_out,
3535
ld_smem[sm_idx / VEC_LEN] = ld_source[idx / VEC_LEN];
3636
}
3737
__syncthreads();
38-
for (int row_repeat = 0; row_repeat < ROW_REPEAT; ++row_repeat) {
39-
int m = mo + row_repeat * THREAD_M + mi;
40-
int n = no + ni * VEC_LEN;
41-
int idx = m * N + n;
42-
int sm = row_repeat * THREAD_M + mi;
43-
int sn = ni * VEC_LEN;
44-
int sm_idx = sm * BLOCKN + sn;
45-
for (int i = 0; i < VEC_LEN; ++i) {
46-
dummy_out[idx + i] = smem[sm_idx + i] + DType(1);
38+
for (int x = 0; x < 256; ++x) {
39+
for (int row_repeat = 0; row_repeat < ROW_REPEAT; ++row_repeat) {
40+
int m = mo + row_repeat * THREAD_M + mi;
41+
int n = no + ni * VEC_LEN;
42+
int idx = m * N + n;
43+
int sm = row_repeat * THREAD_M + mi;
44+
int sn = ni * VEC_LEN;
45+
int sm_idx = sm * BLOCKN + sn;
46+
for (int i = 0; i < VEC_LEN; ++i) {
47+
dummy_out[idx + i] = smem[sm_idx + i] + DType(1);
48+
}
4749
}
4850
}
4951
}

examples/atom/ldgmem_ldsmem_v2.cu

+114
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
#include "common.h"
2+
// stream + split copy async
3+
4+
// nvcc -arch=sm_90a -std=c++17 -I ../../include/ -lcuda ldgmem_ldsmem_v2.cu -o test
5+
6+
// Bytes moved by one vectorized shared-memory load: 128 bits = 16 bytes
// (matches the int4 loads in the kernel below).
// NOTE(review): "LODA" looks like a typo for "LOAD"; renaming would also
// touch the kernel body, so it is left as-is here.
const int SM_LODA_BYTES = 128/8;
7+
8+
// Benchmark kernel: stage BLOCKM x BLOCKN tiles of `source` (one row-chunk of
// the split input) into shared memory via 128-bit vector loads, then read the
// tile back 256 times, writing source+1 into the matching rows of `dummy_out`.
// The 256x repeat exists to make the shared-memory reads dominate the timing.
//
// Assumptions (matching the launch in main below — TODO confirm if reused):
//   - 1-D grid of ceil_div(M/split, BLOCKM) blocks, NUM_THREADS per block;
//   - `source` holds M/split rows of the full M x N matrix; `M` and `N` are
//     the FULL matrix dimensions;
//   - `curr_split` selects which row-chunk of `dummy_out` this launch fills;
//   - `source` is 16-byte aligned (required by the reinterpret_cast to int4*).
template <typename DType, int BLOCKM, int BLOCKN, int NUM_THREADS>
__global__ void split_matrix_ldsm(DType* source, int M, int N, DType* dummy_out, int split, int curr_split) {
    __shared__ DType smem[BLOCKM*BLOCKN];
    // Elements carried by one 128-bit (int4) load for this DType.
    const int VEC_LEN = SM_LODA_BYTES / sizeof(DType);
    const int VEC_REPEAT = BLOCKN / VEC_LEN;      // vector loads per tile row
    const int THREAD_N = VEC_REPEAT;              // threads laid out along N
    const int THREAD_M = NUM_THREADS / THREAD_N;  // threads laid out along M
    const int ROW_REPEAT = BLOCKM / THREAD_M;     // tile rows per thread
    static_assert(BLOCKN % VEC_LEN == 0);
    static_assert(NUM_THREADS % THREAD_N == 0);
    static_assert(ROW_REPEAT * THREAD_M == BLOCKM);

    // Advance the output pointer to this chunk's row range.
    dummy_out += M / split * curr_split * N;

    int mo = blockIdx.x * BLOCKM;      // first tile row handled by this block
    int mi = threadIdx.x / THREAD_N;   // this thread's row within the tile
    int ni = threadIdx.x % THREAD_N;   // this thread's column, in VEC_LEN units
    int4* ld_source = reinterpret_cast<int4*>(source);
    int4* ld_smem = reinterpret_cast<int4*>(smem);
    for (int no = 0; no < N; no += BLOCKN) {
        // Stage: each thread copies its own VEC_LEN-wide slots, 16B at a time.
        for (int row_repeat = 0; row_repeat < ROW_REPEAT; ++row_repeat) {
            int m = mo + row_repeat * THREAD_M + mi;
            int n = no + ni * VEC_LEN;
            int idx = m * N + n;
            int sm = row_repeat * THREAD_M + mi;
            int sn = ni * VEC_LEN;
            int sm_idx = sm * BLOCKN + sn;
            ld_smem[sm_idx / VEC_LEN] = ld_source[idx / VEC_LEN];
        }
        __syncthreads();
        // Drain: re-read the staged tile 256x. Each thread reads exactly the
        // smem slots it wrote above (same sm_idx), so there is no cross-thread
        // sharing — which is why no second barrier appears before the next
        // iteration overwrites smem.
        for (int x = 0; x < 256; ++x) {
            for (int row_repeat = 0; row_repeat < ROW_REPEAT; ++row_repeat) {
                int m = mo + row_repeat * THREAD_M + mi;
                int n = no + ni * VEC_LEN;
                int idx = m * N + n;
                int sm = row_repeat * THREAD_M + mi;
                int sn = ni * VEC_LEN;
                int sm_idx = sm * BLOCKN + sn;
                for (int i = 0; i < VEC_LEN; ++i) {
                    dummy_out[idx + i] = smem[sm_idx + i] + DType(1);
                }
            }
        }
    }
}
53+
54+
55+
// CPU reference for the GPU kernels: out[i] = in[i] + 1 for every element,
// with the addition performed in float and the result cast back to DType
// (mirrors the device-side `smem[...] + DType(1)` computation).
template<typename DType>
void cpu_dummy(DType* source, DType* dummy_out, int M, int N) {
    const int total = M * N;
    const float one = (float)DType(1);
    for (int i = 0; i < total; ++i) {
        dummy_out[i] = (DType)((float)source[i] + one);
    }
}
63+
64+
65+
// Driver: split the M x N input into `split` row-chunks and, for each chunk,
// overlap the H2D copy with the kernel by alternating two CUDA streams.
// Fixes vs. the original: the D2H result copy is synchronized before the host
// reads B (it was previously read unsynchronized — a race), kernel-launch
// errors are surfaced via cudaGetLastError, and the streams are destroyed.
int main(int argc, char** argv) {
    const int M = 1024;
    const int N = 1024;
    int split = 4;
    using DType = half;
    const int BLOCKM = 128;
    const int BLOCKN = 128;
    const int NUM_THREADS = 128;
    std::vector<int> shape{M, N};
    std::vector<int> epoch_shape{M/split, N};  // one chunk: M/split rows
    auto A = alloc_cpu_tensor<DType>(shape);
    random_fill(A, shape);
    // constant_fill(A, shape, DType(1));
    auto B = alloc_cpu_tensor<DType>(shape);
    auto golden = alloc_cpu_tensor<DType>(shape);

    GPUTimer gpu_timer;
    cudaStream_t s1, s2;
    cudaStreamCreate(&s1);
    cudaStreamCreate(&s2);
    std::vector<cudaStream_t*> streams{&s1, &s2};

    // One device buffer per chunk so concurrent copies never alias.
    std::vector<DType*> dAs;
    for (int i = 0; i < split; ++i) {
        dAs.push_back(alloc_gpu_tensor<DType>(epoch_shape));
    }
    auto dB = alloc_gpu_tensor<DType>(shape);

    dim3 block(NUM_THREADS);
    dim3 grid(ceil_div(M/split, BLOCKM));  // grid covers one chunk's rows
    gpu_timer.sync_all();
    gpu_timer.tick();
    for (int i = 0; i < split; ++i) {
        // Alternate streams so chunk i+1's copy overlaps chunk i's kernel.
        copy_to_gpu_async(A + M/split * i * N, dAs[i], epoch_shape, *streams[i%2]);
        split_matrix_ldsm<DType, BLOCKM, BLOCKN, NUM_THREADS><<<grid, block, 0, *streams[i%2]>>>(dAs[i], M, N, dB, split, i);
    }
    gpu_timer.tick();
    gpu_timer.sync_all();
    // Launch-configuration errors only surface via cudaGetLastError.
    cudaError_t launch_err = cudaGetLastError();
    if (launch_err != cudaSuccess) {
        std::cout << "CUDA error: " << cudaGetErrorString(launch_err) << "\n";
        return 1;
    }
    std::cout << "GPU split done! Use " << gpu_timer.report_last_ms() << " ms.\n";
    copy_to_cpu_async(B, dB, shape);
    // The copy above is asynchronous: wait for it before the host reads B.
    cudaDeviceSynchronize();

    std::cout << "Calculating golden...\n";
    cpu_dummy(A, golden, M, N);
    assert_allclose(B, golden, shape, 1e-5, /*dump=*/false);
    std::cout << "Correct!\n";

    cudaStreamDestroy(s1);
    cudaStreamDestroy(s2);
    return 0;
}

0 commit comments

Comments
 (0)