+// 2 mma + pipeline + simplify
+
 // A100 PCIE 80GB
+// Test performance using shape M=5376, N=5376, K=2048
+// Running cost of CUDA kernel is 1.21901ms
+// TFLOPS: 97.1117

 // 3090
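The TFLOPS figure is the usual 2*M*N*K GEMM operation count divided by the measured kernel time. A minimal host-side check (plain C++, independent of the kernel):

#include <cstdio>

int main()
{
    // A GEMM performs 2*M*N*K floating-point operations (one multiply + one add per MAC).
    const double M = 5376, N = 5376, K = 2048;
    const double seconds = 1.21901e-3; // measured A100 kernel time from the comment above
    printf("TFLOPS: %.4f\n", 2.0 * M * N * K / seconds / 1e12); // prints ~97.11
    return 0;
}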
@@ -114,19 +119,17 @@ __device__ void loadFragA(unsigned int *frag, half *smem, int ki)
     // load 64x16
     int tx = threadIdx.x;
     int tz = threadIdx.z;
+    int row = tz * 64 + tx / 4;
+    int col = ki * KII + tx % 4 * 2;
+    half *ptr = smem + row / 16 * (2 * 16 * 16) + col / 16 * (16 * 16) + row % 16 * 16 + col % 16;
     for (int i = 0; i < 4; ++i)
     {
-        int row = tz * 64 + i * 16 + tx / 16 * 8 + tx % 8;
-        int col = ki * KII + tx / 8 % 2 * 8;
-        void *ptr = (void *)(smem + row / 16 * (2 * 16 * 16) + col / 16 * (16 * 16) + row % 16 * 16 + col % 16);
-        uint32_t smem_ptr;
-        asm(
-            "{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, smem_ptr; }\n"
-            : "=r"(smem_ptr)
-            : "l"(ptr));
-        asm volatile("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];\n"
-                     : "=r"(frag[i * 4 + 0]), "=r"(frag[i * 4 + 1]), "=r"(frag[i * 4 + 2]), "=r"(frag[i * 4 + 3])
-                     : "r"(smem_ptr));
+        frag[i * 4 + 0] = *(reinterpret_cast<unsigned int *>(ptr));
+        frag[i * 4 + 1] = *(reinterpret_cast<unsigned int *>(ptr + 8));
+
+        frag[i * 4 + 2] = *(reinterpret_cast<unsigned int *>(ptr + 8 * 16));
+        frag[i * 4 + 3] = *(reinterpret_cast<unsigned int *>(ptr + 8 * 16 + 8));
+        ptr += 16 * 16 * 2;
     }
 }

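The rewritten loadFragA drops the ldmatrix inline PTX and reads each thread's fragment with four plain 32-bit loads, relying on the shared-memory layout already encoded above (row-major 16x16 sub-tiles, 2*16*16 halves per row of tiles); loadFragB below uses the same mapping with ty in place of tz. A small host-side sketch of the lane-to-element mapping within one 16x16 tile, mirroring the offsets ptr, ptr + 8, ptr + 8*16 and ptr + 8*16 + 8:

#include <cstdio>

// For one 16x16 half tile stored row-major (16*16 contiguous halves),
// print which elements each lane's four 32-bit loads cover, following the
// offsets used in loadFragA: ptr, ptr + 8, ptr + 8*16, ptr + 8*16 + 8.
int main()
{
    for (int lane = 0; lane < 32; ++lane)
    {
        int row = lane / 4;     // base row within the 16x16 tile
        int col = lane % 4 * 2; // base column (each 32-bit load grabs 2 halves)
        printf("lane %2d: (%2d,%2d) (%2d,%2d) (%2d,%2d) (%2d,%2d)\n",
               lane,
               row,     col,      // frag[i*4 + 0]
               row,     col + 8,  // frag[i*4 + 1]
               row + 8, col,      // frag[i*4 + 2]
               row + 8, col + 8); // frag[i*4 + 3]
    }
    return 0;
}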
@@ -136,19 +139,17 @@ __device__ void loadFragB(unsigned int *frag, half *smem, int ki)
     // load 64x16
     int tx = threadIdx.x;
     int ty = threadIdx.y;
+    int row = ty * 64 + tx / 4;
+    int col = ki * KII + tx % 4 * 2;
+    half *ptr = smem + row / 16 * (2 * 16 * 16) + col / 16 * (16 * 16) + row % 16 * 16 + col % 16;
     for (int i = 0; i < 4; ++i)
     {
-        int row = ty * 64 + i * 16 + tx / 16 * 8 + tx % 8;
-        int col = ki * KII + tx / 8 % 2 * 8;
-        void *ptr = (void *)(smem + row / 16 * (2 * 16 * 16) + col / 16 * (16 * 16) + row % 16 * 16 + col % 16);
-        uint32_t smem_ptr;
-        asm(
-            "{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, smem_ptr; }\n"
-            : "=r"(smem_ptr)
-            : "l"(ptr));
-        asm volatile("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];\n"
-                     : "=r"(frag[i * 4 + 0]), "=r"(frag[i * 4 + 1]), "=r"(frag[i * 4 + 2]), "=r"(frag[i * 4 + 3])
-                     : "r"(smem_ptr));
+        frag[i * 4 + 0] = *(reinterpret_cast<unsigned int *>(ptr));
+        frag[i * 4 + 1] = *(reinterpret_cast<unsigned int *>(ptr + 8));
+
+        frag[i * 4 + 2] = *(reinterpret_cast<unsigned int *>(ptr + 8 * 16));
+        frag[i * 4 + 3] = *(reinterpret_cast<unsigned int *>(ptr + 8 * 16 + 8));
+        ptr += 16 * 16 * 2;
     }
 }

@@ -159,22 +160,27 @@ __device__ void storeAccum(float *ptr, float *frag)
     int tx = threadIdx.x;
     int ty = threadIdx.y;
     int tz = threadIdx.z;
+    int row = tz * 64 + tx / 4;
+    int col = ty * 64 + tx % 4 * 2;
+    float *dst = ptr + row / 16 * (8 * 16 * 16) + col / 16 * (16 * 16) + row % 16 * 16 + col % 16;
     for (int i = 0; i < 4; ++i)
     {
-        for (int j = 0; j < 4; ++j)
-        {
-            for (int r = 0; r < 2; ++r)
-            {
-                for (int c = 0; c < 2; ++c)
-                {
-                    int row = tz * 64 + i * 16 + r * 8 + tx / 4;
-                    int col = ty * 64 + j * 16 + c * 8 + tx % 4 * 2;
-                    float *dst = ptr + row / 16 * (8 * 16 * 16) + col / 16 * (16 * 16) + row % 16 * 16 + col % 16;
-                    dst[0] = frag[i * 32 + j * 8 + r * 4 + c * 2];
-                    dst[1] = frag[i * 32 + j * 8 + r * 4 + c * 2 + 1];
-                }
-            }
+        for (int j = 0; j < 4; ++j) {
+            dst[0] = frag[i * 32 + j * 8 + 0 * 4 + 0 * 2];
+            dst[1] = frag[i * 32 + j * 8 + 0 * 4 + 0 * 2 + 1];
+
+            dst[0 + 8] = frag[i * 32 + j * 8 + 0 * 4 + 1 * 2];
+            dst[1 + 8] = frag[i * 32 + j * 8 + 0 * 4 + 1 * 2 + 1];
+
+            dst[0 + 8 * 16] = frag[i * 32 + j * 8 + 1 * 4 + 0 * 2];
+            dst[1 + 8 * 16] = frag[i * 32 + j * 8 + 1 * 4 + 0 * 2 + 1];
+
+            dst[0 + 8 * 16 + 8] = frag[i * 32 + j * 8 + 1 * 4 + 1 * 2];
+            dst[1 + 8 * 16 + 8] = frag[i * 32 + j * 8 + 1 * 4 + 1 * 2 + 1];
+
+            dst += 16 * 16;
         }
+        dst += 4 * 16 * 16;
     }
 }

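storeAccum now walks a single dst pointer instead of recomputing the address per (i, j, r, c): within each 16x16 tile a thread's eight floats sit at the four 2x2 sub-tile corners (offsets 0, 8, 8*16, 8*16 + 8, two consecutive floats each), dst advances one tile per j step (16*16) and on to the next row of tiles per i step (the remaining 4*16*16, given the 8-tile-wide C buffer implied by the 8*16*16 row stride). A quick host-side check, with example warp coordinates ty = tz = 1, that the running pointer visits the same tile bases as the old per-iteration formula:

#include <cassert>
#include <cstdio>

int main()
{
    const int ty = 1, tz = 1; // example warp coordinates
    long running = (tz * 4) * (8 * 16 * 16) + (ty * 4) * (16 * 16);
    for (int i = 0; i < 4; ++i)
    {
        for (int j = 0; j < 4; ++j)
        {
            // Old code recomputed the tile base from (i, j) each time.
            long direct = (tz * 4 + i) * (8 * 16 * 16) + (ty * 4 + j) * (16 * 16);
            assert(running == direct);
            running += 16 * 16;      // advance one tile to the right
        }
        running += 4 * 16 * 16;      // skip to the next row of tiles
    }
    printf("tile walk matches\n");
    return 0;
}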
@@ -221,9 +227,9 @@ __global__ void matmul(half *A, half *B, half *C, int M, int N, int K)
     half *SB4 = SB3 + NI * KI;
     float *SC = reinterpret_cast<float *>(shared_storage);

-    unsigned int FragA[16];
-    unsigned int FragB[16];
-    float Accum[128] = {0.0};
+    unsigned int FragA[4 * 4];      // [4, 4]
+    unsigned int FragB[4 * 4];      // [4, 4]
+    float Accum[4 * 4 * 8] = {0.0}; // [4, 4, 8]

     // prologue
     loadSmemA(SA1, A, M, K, 0);
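The rewritten sizes only spell out the existing register budget: per warp, the 64x16 A and B slabs are held as 4 tiles of 16x16 halves (4 x 32-bit registers, i.e. 8 halves, per thread per tile) and the 64x64 accumulator as 4x4 tiles of 8 floats per thread. A trivial consistency check:

#include <cstdio>

int main()
{
    const int warp_size = 32;
    // One 16x16 half tile: 256 halves / 32 threads = 8 halves = 4 x 32-bit registers per thread.
    const int regs_per_half_tile = 16 * 16 / warp_size / 2;
    // One 16x16 float tile: 256 floats / 32 threads = 8 floats per thread.
    const int floats_per_accum_tile = 16 * 16 / warp_size;

    printf("FragA / FragB: 4 tiles x %d regs = %d unsigned ints\n",
           regs_per_half_tile, 4 * regs_per_half_tile);           // 4 * 4 = 16
    printf("Accum: 4 x 4 tiles x %d floats = %d floats\n",
           floats_per_accum_tile, 4 * 4 * floats_per_accum_tile); // 128
    return 0;
}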