
Commit ee9a6d0

update
1 parent b15ad4b commit ee9a6d0

14 files changed: +1802 −50 lines

call_cublas.cu

+2 −2

@@ -1,7 +1,7 @@
 // A100 PCIE 80GB
 // Test performance using shape M=5376, N=5376, K=2048
-// Running cost of CuBLAS is 0.58983ms
-// TFLOPS: 200.702
+// Running cost of CuBLAS is 0.784682ms
+// TFLOPS: 150.864

 // 3090
 // Test performance using shape M=5376, N=5376, K=2048
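The TFLOPS numbers quoted in these comments follow directly from the GEMM operation count. A minimal sanity check, assuming the usual 2 * M * N * K flop count for a multiply-accumulate GEMM (this snippet is illustrative, not part of the commit):

#include <cstdio>

int main()
{
    double M = 5376, N = 5376, K = 2048;
    double ms = 0.784682; // measured cuBLAS time quoted above
    // 2 * M * N * K multiply-adds, converted to TFLOP/s.
    double tflops = 2.0 * M * N * K / (ms * 1e-3) / 1e12;
    std::printf("TFLOPS: %.3f\n", tflops); // ~150.864, matching the new comment
    return 0;
}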

cmd.sh

+16 −1

@@ -11,6 +11,11 @@ nvcc -arch=sm_80 -DDEBUG -Xcompiler -fopenmp matmul-v02.cu main.cu -o test && ./test stages 4
 nvcc -arch=sm_80 -DDEBUG -Xcompiler -fopenmp matmul-v03.cu main.cu -o test && ./test stages 4
 nvcc -arch=sm_80 -DDEBUG -Xcompiler -fopenmp matmul-v04.cu main.cu -o test && ./test stages 4
 nvcc -arch=sm_80 -DDEBUG -Xcompiler -fopenmp matmul-v05.cu main.cu -o test && ./test stages 4
+nvcc -arch=sm_80 -DDEBUG -Xcompiler -fopenmp matmul-v06.cu main.cu -o test && ./test stages 4
+nvcc -arch=sm_80 -DDEBUG -Xcompiler -fopenmp matmul-v07.cu main.cu -o test && ./test stages 4
+nvcc -arch=sm_80 -DDEBUG -Xcompiler -fopenmp matmul-v08.cu main.cu -o test && ./test stages 4 multi_threading 2
+nvcc -arch=sm_80 -DDEBUG -Xcompiler -fopenmp matmul-v09.cu main.cu -o test && ./test stages 4 multi_threading 2
+nvcc -arch=sm_80 -DDEBUG -Xcompiler -fopenmp matmul-v10.cu main.cu -o test && ./test stages 4 multi_threading 2

 # test performance
 nvcc -arch=sm_80 matmul-v00.cu main.cu -o test && ./test
@@ -21,9 +26,19 @@ nvcc -arch=sm_80 matmul-v02.cu main.cu -o test && ./test stages 4
 nvcc -arch=sm_80 matmul-v03.cu main.cu -o test && ./test stages 4
 nvcc -arch=sm_80 matmul-v04.cu main.cu -o test && ./test stages 4
 nvcc -arch=sm_80 matmul-v05.cu main.cu -o test && ./test stages 4
+nvcc -arch=sm_80 matmul-v06.cu main.cu -o test && ./test stages 4
+nvcc -arch=sm_80 matmul-v07.cu main.cu -o test && ./test stages 4
+nvcc -arch=sm_80 matmul-v08.cu main.cu -o test && ./test stages 4 multi_threading 2
+nvcc -arch=sm_80 matmul-v09.cu main.cu -o test && ./test stages 4 multi_threading 2
+nvcc -arch=sm_80 matmul-v10.cu main.cu -o test && ./test stages 4 multi_threading 2

 nvcc -arch=sm_86 matmul-v01.cu main.cu -o test && ./test stages 4
 nvcc -arch=sm_86 matmul-v02.cu main.cu -o test && ./test stages 4
 nvcc -arch=sm_86 matmul-v03.cu main.cu -o test && ./test stages 4
 nvcc -arch=sm_86 matmul-v04.cu main.cu -o test && ./test stages 4
-nvcc -arch=sm_86 matmul-v05.cu main.cu -o test && ./test stages 4
+nvcc -arch=sm_86 matmul-v05.cu main.cu -o test && ./test stages 4
+nvcc -arch=sm_86 matmul-v06.cu main.cu -o test && ./test stages 4
+nvcc -arch=sm_86 matmul-v07.cu main.cu -o test && ./test stages 4
+nvcc -arch=sm_86 matmul-v08.cu main.cu -o test && ./test stages 4 multi_threading 2
+nvcc -arch=sm_86 matmul-v09.cu main.cu -o test && ./test stages 4 multi_threading 2
+nvcc -arch=sm_86 matmul-v10.cu main.cu -o test && ./test stages 4 multi_threading 2

main.cu

+7 −1

@@ -8,6 +8,7 @@
 #include <cassert>

 int STAGES = 1;
+int MULTI_THREADING = 1;

 extern __global__ void matmul(half *A, half *B, half *C, int M, int N, int K);

@@ -55,6 +56,11 @@ int main(int argc, char *argv[])
             STAGES = std::atoi(value);
             std::cout << "Setting to " << STAGES << " stages.\n";
         }
+        else if (keys == "multi_threading")
+        {
+            MULTI_THREADING = std::atoi(value);
+            std::cout << "Setting to " << MULTI_THREADING << "x threading.\n";
+        }
     }
 }
 #ifdef DEBUG
@@ -139,7 +145,7 @@ int main(int argc, char *argv[])
     CUDA_CHECK(cudaMemcpy(dB, hB, K * N * 2, cudaMemcpyHostToDevice));
     CUDA_CHECK(cudaMemcpy(dC, hC, M * N * 2, cudaMemcpyHostToDevice));

-    dim3 dimBlock(32, 2, 2);
+    dim3 dimBlock(32, 2 * MULTI_THREADING, 2);
     dim3 dimGrid(N / 128, M / 128);

 #ifndef DEBUG
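The new flag follows the same key/value convention as stages: cmd.sh invokes the binary as ./test stages 4 multi_threading 2. Its only effect on the launch is the block shape; the grid stays at (N / 128, M / 128). A small sketch of the resulting thread counts (my arithmetic, not code from the commit):

#include <cstdio>

int main()
{
    for (int mt = 1; mt <= 2; ++mt)
    {
        // dim3 dimBlock(32, 2 * MULTI_THREADING, 2) from the diff above
        int threads = 32 * (2 * mt) * 2;
        std::printf("multi_threading %d -> %d threads (%d warps) per block\n",
                    mt, threads, threads / 32);
    }
    return 0;
}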

matmul-v01.cu

+4 −2

@@ -1,7 +1,9 @@
+// wmma
+
 // A100 PCIE 80GB
 // Test performance using shape M=5376, N=5376, K=2048
-// Running cost of CUDA kernel is 2.91723ms
-// TFLOPS: 40.5795
+// Running cost of CUDA kernel is 3.58903ms
+// TFLOPS: 32.9838

 // 3090
 // Test performance using shape M=5376, N=5376, K=2048
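The new header comment tags v01 as the plain wmma version. For orientation, a minimal sketch of the nvcuda::wmma tile pattern that label refers to; the 16x16x16 tile shape and float accumulator are assumptions on my part, and this is not the repo's kernel:

#include <mma.h>
#include <cuda_fp16.h>
using namespace nvcuda;

// One 16x16x16 tile product C = A * B via the wmma API.
__device__ void wmma_tile(const half *a, const half *b, float *c,
                          int lda, int ldb, int ldc)
{
    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> fa;
    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major> fb;
    wmma::fragment<wmma::accumulator, 16, 16, 16, float> fc;
    wmma::fill_fragment(fc, 0.0f);
    wmma::load_matrix_sync(fa, a, lda);
    wmma::load_matrix_sync(fb, b, ldb);
    wmma::mma_sync(fc, fa, fb, fc);
    wmma::store_matrix_sync(c, fc, ldc, wmma::mem_row_major);
}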

matmul-v02.cu

+4 −2

@@ -1,7 +1,9 @@
+// wmma + pipeline
+
 // A100 PCIE 80GB
 // Test performance using shape M=5376, N=5376, K=2048
-// Running cost of CUDA kernel is 0.89474ms
-// TFLOPS: 132.307
+// Running cost of CUDA kernel is 1.15745ms
+// TFLOPS: 102.277

 // 3090
 // Test performance using shape M=5376, N=5376, K=2048
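v02's label adds a pipeline on top of wmma. A sketch of the generic double-buffered cp.async staging that term usually denotes, written with the CUDA pipeline primitives; the kernel name, buffer shape, and 128-thread block are illustrative, not taken from matmul-v02.cu:

#include <cuda_pipeline.h>

__global__ void staged_loop(const float4 *gmem, int iters)
{
    __shared__ float4 buf[2][128];
    int t = threadIdx.x;
    // Stage 0: issue the first async copy before the loop.
    __pipeline_memcpy_async(&buf[0][t], &gmem[t], sizeof(float4));
    __pipeline_commit();
    for (int i = 1; i < iters; ++i)
    {
        // Issue stage i while stage i-1 is still in flight.
        __pipeline_memcpy_async(&buf[i % 2][t], &gmem[i * 128 + t], sizeof(float4));
        __pipeline_commit();
        __pipeline_wait_prior(1); // stage i-1 has landed
        __syncthreads();
        // ... consume buf[(i - 1) % 2] here ...
        __syncthreads();          // everyone done before stage i+1 overwrites it
    }
    __pipeline_wait_prior(0);
    __syncthreads();
    // ... consume buf[(iters - 1) % 2] ...
}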

matmul-v03.cu

+4 −2

@@ -1,7 +1,9 @@
+// 4 mma + pipeline
+
 // A100 PCIE 80GB
 // Test performance using shape M=5376, N=5376, K=2048
-// Running cost of CUDA kernel is 0.893115ms
-// TFLOPS: 132.547
+// Running cost of CUDA kernel is 1.33858ms
+// TFLOPS: 88.4372

 // 3090
 // Test performance using shape M=5376, N=5376, K=2048
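v03 moves from the wmma API to raw mma instructions. For reference, one m16n8k16 tensor-core mma issued from inline PTX, with the operand register counts the PTX ISA specifies; how v03 composes its larger fragments out of such pieces is not shown here:

#include <cstdint>

// D = A * B + C for one 16x8 output tile: A is 4 .b32 regs (8 halves),
// B is 2 .b32 regs (4 halves), C and D are 4 .f32 regs per thread.
__device__ void mma_m16n8k16(float *d, const uint32_t *a,
                             const uint32_t *b, const float *c)
{
    asm volatile(
        "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
        "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n"
        : "=f"(d[0]), "=f"(d[1]), "=f"(d[2]), "=f"(d[3])
        : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]),
          "r"(b[0]), "r"(b[1]),
          "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
}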

matmul-v04.cu

+4 −2

@@ -1,7 +1,9 @@
+// 4 mma + pipeline + ldmatrix
+
 // A100 PCIE 80GB
 // Test performance using shape M=5376, N=5376, K=2048
-// Running cost of CUDA kernel is 0.898625ms
-// TFLOPS: 131.735
+// Running cost of CUDA kernel is 1.48764ms
+// TFLOPS: 79.5756

 // 3090
 // Setting to 4 stages.

matmul-v05.cu

+5 −0

@@ -1,4 +1,9 @@
+// 2 mma + pipeline + ldmatrix
+
 // A100 PCIE 80GB
+// Test performance using shape M=5376, N=5376, K=2048
+// Running cost of CUDA kernel is 1.47957ms
+// TFLOPS: 80.0096

 // 3090
 // Test performance using shape M=5376, N=5376, K=2048

matmul-v06.cu

+44 −38

@@ -1,4 +1,9 @@
+// 2 mma + pipeline + simplify
+
 // A100 PCIE 80GB
+// Test performance using shape M=5376, N=5376, K=2048
+// Running cost of CUDA kernel is 1.21901ms
+// TFLOPS: 97.1117

 // 3090

@@ -114,19 +119,17 @@ __device__ void loadFragA(unsigned int *frag, half *smem, int ki)
     // load 64x16
     int tx = threadIdx.x;
     int tz = threadIdx.z;
+    int row = tz * 64 + tx / 4;
+    int col = ki * KII + tx % 4 * 2;
+    half* ptr = smem + row / 16 * (2 * 16 * 16) + col / 16 * (16 * 16) + row % 16 * 16 + col % 16;
     for (int i = 0; i < 4; ++i)
     {
-        int row = tz * 64 + i * 16 + tx / 16 * 8 + tx % 8;
-        int col = ki * KII + tx / 8 % 2 * 8;
-        void *ptr = (void *)(smem + row / 16 * (2 * 16 * 16) + col / 16 * (16 * 16) + row % 16 * 16 + col % 16);
-        uint32_t smem_ptr;
-        asm(
-            "{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, smem_ptr; }\n"
-            : "=r"(smem_ptr)
-            : "l"(ptr));
-        asm volatile("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];\n"
-                     : "=r"(frag[i * 4 + 0]), "=r"(frag[i * 4 + 1]), "=r"(frag[i * 4 + 2]), "=r"(frag[i * 4 + 3])
-                     : "r"(smem_ptr));
+        frag[i * 4 + 0] = *(reinterpret_cast<unsigned int *>(ptr));
+        frag[i * 4 + 1] = *(reinterpret_cast<unsigned int *>(ptr + 8));
+
+        frag[i * 4 + 2] = *(reinterpret_cast<unsigned int *>(ptr + 8 * 16));
+        frag[i * 4 + 3] = *(reinterpret_cast<unsigned int *>(ptr + 8 * 16 + 8));
+        ptr += 16 * 16 * 2;
     }
 }

@@ -136,19 +139,17 @@ __device__ void loadFragB(unsigned int *frag, half *smem, int ki)
     // load 64x16
     int tx = threadIdx.x;
     int ty = threadIdx.y;
+    int row = ty * 64 + tx / 4;
+    int col = ki * KII + tx % 4 * 2;
+    half* ptr = smem + row / 16 * (2 * 16 * 16) + col / 16 * (16 * 16) + row % 16 * 16 + col % 16;
     for (int i = 0; i < 4; ++i)
     {
-        int row = ty * 64 + i * 16 + tx / 16 * 8 + tx % 8;
-        int col = ki * KII + tx / 8 % 2 * 8;
-        void *ptr = (void *)(smem + row / 16 * (2 * 16 * 16) + col / 16 * (16 * 16) + row % 16 * 16 + col % 16);
-        uint32_t smem_ptr;
-        asm(
-            "{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, smem_ptr; }\n"
-            : "=r"(smem_ptr)
-            : "l"(ptr));
-        asm volatile("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];\n"
-                     : "=r"(frag[i * 4 + 0]), "=r"(frag[i * 4 + 1]), "=r"(frag[i * 4 + 2]), "=r"(frag[i * 4 + 3])
-                     : "r"(smem_ptr));
+        frag[i * 4 + 0] = *(reinterpret_cast<unsigned int *>(ptr));
+        frag[i * 4 + 1] = *(reinterpret_cast<unsigned int *>(ptr + 8));
+
+        frag[i * 4 + 2] = *(reinterpret_cast<unsigned int *>(ptr + 8 * 16));
+        frag[i * 4 + 3] = *(reinterpret_cast<unsigned int *>(ptr + 8 * 16 + 8));
+        ptr += 16 * 16 * 2;
     }
 }
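The two rewrites above replace the cvta-plus-ldmatrix inline assembly with plain 32-bit loads: each register receives two consecutive halves, lane tx owns row tx / 4 and column pair tx % 4 * 2 of a 16x16 tile, the offsets 8, 8 * 16, and 8 * 16 + 8 pick up the tile's other 8x8 quadrants, and ptr += 16 * 16 * 2 steps to the next 16-row tile (the index math implies two 16x16 tiles per tile-row of the shared buffer). A tiny host-side sketch of that tiled addressing; tiled_offset is my helper, not repo code:

#include <cstdio>

// Flat offset of (row, col) in a half buffer stored as 16x16 tiles,
// two tiles per tile-row, row-major inside each tile.
int tiled_offset(int row, int col)
{
    return row / 16 * (2 * 16 * 16) + col / 16 * (16 * 16)
         + row % 16 * 16 + col % 16;
}

int main()
{
    int tx = 5;           // example lane
    int row = tx / 4;     // 1
    int col = tx % 4 * 2; // 2
    std::printf("lane %d -> (%d, %d) -> offset %d\n",
                tx, row, col, tiled_offset(row, col)); // offset 18
    return 0;
}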

@@ -159,22 +160,27 @@ __device__ void storeAccum(float *ptr, float *frag)
     int tx = threadIdx.x;
     int ty = threadIdx.y;
     int tz = threadIdx.z;
+    int row = tz * 64 + tx / 4;
+    int col = ty * 64 + tx % 4 * 2;
+    float *dst = ptr + row / 16 * (8 * 16 * 16) + col / 16 * (16 * 16) + row % 16 * 16 + col % 16;
     for (int i = 0; i < 4; ++i)
     {
-        for (int j = 0; j < 4; ++j)
-        {
-            for (int r = 0; r < 2; ++r)
-            {
-                for (int c = 0; c < 2; ++c)
-                {
-                    int row = tz * 64 + i * 16 + r * 8 + tx / 4;
-                    int col = ty * 64 + j * 16 + c * 8 + tx % 4 * 2;
-                    float *dst = ptr + row / 16 * (8 * 16 * 16) + col / 16 * (16 * 16) + row % 16 * 16 + col % 16;
-                    dst[0] = frag[i * 32 + j * 8 + r * 4 + c * 2];
-                    dst[1] = frag[i * 32 + j * 8 + r * 4 + c * 2 + 1];
-                }
-            }
+        for (int j = 0; j < 4; ++j) {
+            dst[0] = frag[i * 32 + j * 8 + 0 * 4 + 0 * 2];
+            dst[1] = frag[i * 32 + j * 8 + 0 * 4 + 0 * 2 + 1];
+
+            dst[0 + 8] = frag[i * 32 + j * 8 + 0 * 4 + 1 * 2];
+            dst[1 + 8] = frag[i * 32 + j * 8 + 0 * 4 + 1 * 2 + 1];
+
+            dst[0 + 8 * 16] = frag[i * 32 + j * 8 + 1 * 4 + 0 * 2];
+            dst[1 + 8 * 16] = frag[i * 32 + j * 8 + 1 * 4 + 0 * 2 + 1];
+
+            dst[0 + 8 * 16 + 8] = frag[i * 32 + j * 8 + 1 * 4 + 1 * 2];
+            dst[1 + 8 * 16 + 8] = frag[i * 32 + j * 8 + 1 * 4 + 1 * 2 + 1];
+
+            dst += 16 * 16;
         }
+        dst += 4 * 16 * 16;
     }
 }

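storeAccum gets the same treatment: instead of recomputing row, col, and dst for every (i, j, r, c), one base pointer is advanced with the fixed quadrant offsets 0, 8, 8 * 16, and 8 * 16 + 8. A quick check that those constants reproduce the old r/c loop (my verification snippet, not part of the commit):

#include <cstdio>

int main()
{
    // Old loop: quadrant (r, c) sat at row r * 8, col c * 8 of a 16x16 tile
    // (16 floats per tile row), reading frag[... + r * 4 + c * 2].
    for (int r = 0; r < 2; ++r)
        for (int c = 0; c < 2; ++c)
            std::printf("r=%d c=%d -> dst offset %3d, frag offset %d\n",
                        r, c, r * 8 * 16 + c * 8, r * 4 + c * 2);
    // Prints dst offsets 0, 8, 128, 136: exactly dst[0], dst[0 + 8],
    // dst[0 + 8 * 16], dst[0 + 8 * 16 + 8] in the unrolled version.
    return 0;
}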

@@ -221,9 +227,9 @@ __global__ void matmul(half *A, half *B, half *C, int M, int N, int K)
     half *SB4 = SB3 + NI * KI;
     float *SC = reinterpret_cast<float *>(shared_storage);

-    unsigned int FragA[16];
-    unsigned int FragB[16];
-    float Accum[128] = {0.0};
+    unsigned int FragA[4 * 4];      // [4, 4]
+    unsigned int FragB[4 * 4];      // [4, 4]
+    float Accum[4 * 4 * 8] = {0.0}; // [4, 4, 8]

     // prologue
     loadSmemA(SA1, A, M, K, 0);
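The renamed bounds only document the shapes; the sizes are unchanged (16 = 4 * 4, 128 = 4 * 4 * 8). Spelled out, the per-thread operand footprint is substantial (my arithmetic, not part of the commit):

#include <cstdio>

int main()
{
    int fragA = 4 * 4;     // 4 A tiles x 4 u32 regs, 8 halves each
    int fragB = 4 * 4;     // 4 B tiles x 4 u32 regs
    int accum = 4 * 4 * 8; // 4x4 C tiles x 8 f32 each = 128 floats
    std::printf("u32 frag regs: %d, f32 accum regs: %d, total 32-bit regs: %d\n",
                fragA + fragB, accum, fragA + fragB + accum); // 32 + 128 = 160
    return 0;
}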
