#include "common.h"
// stream + split copy async

// nvcc -arch=sm_90a -std=c++17 -I ../../include/ -lcuda ldgmem_ldsmem_v0.cu -o test

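// Overview: the M x N half-precision input is split row-wise into `split`
// chunks. Each chunk gets its own device buffer; its host-to-device copy and
// the kernel that consumes it are enqueued on the same stream, and two streams
// are alternated across chunks so the copy of one chunk can overlap with the
// kernel of the previous one. The kernel stages a BLOCKM x BLOCKN tile into
// shared memory with 128-bit vector loads and writes source + 1 to the output,
// which lets the host verify the result against a CPU reference.
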
// bytes moved per 128-bit (int4) vectorized load/store
const int SM_LOAD_BYTES = 128 / 8;

template <typename DType, int BLOCKM, int BLOCKN, int NUM_THREADS>
__global__ void split_matrix_ldsm(DType* source, int M, int N, DType* dummy_out, int split, int curr_split) {
  __shared__ DType smem[BLOCKM * BLOCKN];
  // each thread moves one int4 (SM_LOAD_BYTES = VEC_LEN elements) at a time;
  // THREAD_N threads cover one BLOCKN-wide row, and the block sweeps the
  // BLOCKM rows of a tile in ROW_REPEAT passes
  const int VEC_LEN = SM_LOAD_BYTES / sizeof(DType);
  const int VEC_REPEAT = BLOCKN / VEC_LEN;
  const int THREAD_N = VEC_REPEAT;
  const int THREAD_M = NUM_THREADS / THREAD_N;
  const int ROW_REPEAT = BLOCKM / THREAD_M;
  static_assert(BLOCKN % VEC_LEN == 0);
  static_assert(NUM_THREADS % THREAD_N == 0);
  static_assert(ROW_REPEAT * THREAD_M == BLOCKM);
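
  // For the configuration used in main() below (DType = half, BLOCKM = BLOCKN =
  // NUM_THREADS = 128) these constants work out to:
  //   VEC_LEN    = 16 / sizeof(half) = 8 elements per int4 load
  //   THREAD_N   = 128 / 8  = 16 threads across one BLOCKN-wide row
  //   THREAD_M   = 128 / 16 = 8 rows covered per pass
  //   ROW_REPEAT = 128 / 8  = 16 passes to fill the 128 x 128 tile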

  // offset the output pointer to this split's chunk of rows: split curr_split
  // owns rows [M / split * curr_split, M / split * (curr_split + 1))
  dummy_out += M / split * curr_split * N;

  int mo = blockIdx.x * BLOCKM;
  int mi = threadIdx.x / THREAD_N;
  int ni = threadIdx.x % THREAD_N;
  int4* ld_source = reinterpret_cast<int4*>(source);
  int4* ld_smem = reinterpret_cast<int4*>(smem);
  for (int no = 0; no < N; no += BLOCKN) {
    // stage one BLOCKM x BLOCKN tile of this split's source into shared memory,
    // one 128-bit load per thread per row pass
    for (int row_repeat = 0; row_repeat < ROW_REPEAT; ++row_repeat) {
      int m = mo + row_repeat * THREAD_M + mi;
      int n = no + ni * VEC_LEN;
      int idx = m * N + n;
      int sm = row_repeat * THREAD_M + mi;
      int sn = ni * VEC_LEN;
      int sm_idx = sm * BLOCKN + sn;
      ld_smem[sm_idx / VEC_LEN] = ld_source[idx / VEC_LEN];
    }
    __syncthreads();
    // dummy compute: repeatedly write source + 1 so the kernel has enough work
    // to overlap with the next split's host-to-device copy
    for (int x = 0; x < 256; ++x) {
      for (int row_repeat = 0; row_repeat < ROW_REPEAT; ++row_repeat) {
        int m = mo + row_repeat * THREAD_M + mi;
        int n = no + ni * VEC_LEN;
        int idx = m * N + n;
        int sm = row_repeat * THREAD_M + mi;
        int sn = ni * VEC_LEN;
        int sm_idx = sm * BLOCKN + sn;
        for (int i = 0; i < VEC_LEN; ++i) {
          dummy_out[idx + i] = smem[sm_idx + i] + DType(1);
        }
      }
    }
  }
}
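
// Note: the kernel has no bounds checks, so it assumes M / split is a multiple
// of BLOCKM and N is a multiple of BLOCKN (true for the 1024 x 1024, split = 4
// configuration below); other shapes would need explicit guards.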

// CPU reference: out = in + 1
template <typename DType>
void cpu_dummy(DType* source, DType* dummy_out, int M, int N) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      dummy_out[m * N + n] = (DType)((float)source[m * N + n] + (float)DType(1));
    }
  }
}

int main(int argc, char** argv) {
  const int M = 1024;
  const int N = 1024;
  int split = 4;
  using DType = half;
  const int BLOCKM = 128;
  const int BLOCKN = 128;
  const int NUM_THREADS = 128;
  std::vector<int> shape{M, N};
  std::vector<int> epoch_shape{M / split, N};  // shape of one row chunk
  auto A = alloc_cpu_tensor<DType>(shape);
  random_fill(A, shape);
  // constant_fill(A, shape, DType(1));
  auto B = alloc_cpu_tensor<DType>(shape);
  auto golden = alloc_cpu_tensor<DType>(shape);

  GPUTimer gpu_timer;
  cudaStream_t s1, s2;
  cudaStreamCreate(&s1);
  cudaStreamCreate(&s2);
  std::vector<cudaStream_t*> streams{&s1, &s2};

  // one device buffer per split so each chunk's copy can be in flight independently
  std::vector<DType*> dAs;
  for (int i = 0; i < split; ++i) {
    dAs.push_back(alloc_gpu_tensor<DType>(epoch_shape));
  }
  auto dB = alloc_gpu_tensor<DType>(shape);

  dim3 block(NUM_THREADS);
  dim3 grid(ceil_div(M / split, BLOCKM));  // each launch covers one chunk's rows
  gpu_timer.sync_all();
  gpu_timer.tick();
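  // Alternate the two streams across splits: the copy for split i and the kernel
  // that consumes it share a stream (so the kernel waits only for its own data),
  // while the copy for split i + 1 is issued on the other stream and can overlap
  // with the running kernel.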
  for (int i = 0; i < split; ++i) {
    copy_to_gpu_async(A + M / split * i * N, dAs[i], epoch_shape, *streams[i % 2]);
    split_matrix_ldsm<DType, BLOCKM, BLOCKN, NUM_THREADS><<<grid, block, 0, *streams[i % 2]>>>(dAs[i], M, N, dB, split, i);
  }
  gpu_timer.tick();
  gpu_timer.sync_all();
  std::cout << "GPU split done! Use " << gpu_timer.report_last_ms() << " ms.\n";
  copy_to_cpu_async(B, dB, shape);
  // make sure the asynchronous device-to-host copy has finished before the host reads B
  cudaDeviceSynchronize();

  std::cout << "Calculating golden...\n";
  cpu_dummy(A, golden, M, N);
  assert_allclose(B, golden, shape, 1e-5, /*dump=*/false);
  std::cout << "Correct!\n";

  cudaStreamDestroy(s1);
  cudaStreamDestroy(s2);
  return 0;
}