#include <assert.h>
#include <math.h>
#include <stdlib.h>

#include <iostream>

#include <cuda.h>
#include <cuda_fp16.h>
#include <mma.h>

9 |
| -using namespace nvcuda; |
| 7 | +#define N 32 |
| 8 | +#define M 32 |
| 9 | +#define K 32 |
10 | 10 |
|
| 11 | +using namespace nvcuda; |
11 | 12 |
|
12 |
| -struct timeval tv0, tv1; |
| 13 | +__global__ void foo(half *a, half *b, float *c) { |
| 14 | + int block_x = blockIdx.x / 2; |
| 15 | + int block_y = blockIdx.x % 2; |
13 | 16 |
|
14 |
| -void begin_roi() { |
15 |
| - gettimeofday(&tv0, nullptr); |
16 |
| -} |
| 17 | + wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag; |
| 18 | + wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag; |
| 19 | + wmma::fragment<wmma::accumulator, 16, 16, 16, float, void> c_frag; |
| 20 | + wmma::fill_fragment(c_frag, 0.0f); |
17 | 21 |
|
18 |
| -#define TV_TO_SEC(tv) (tv.tv_sec * 1000000 + tv.tv_usec) |
| 22 | + for (int k = 0; k < M; k += 16) { |
| 23 | + wmma::load_matrix_sync(a_frag, a + M * block_x + k, M); |
| 24 | + wmma::load_matrix_sync(b_frag, b + K * k + block_y * 16, K); |
| 25 | + wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); |
| 26 | + } |
19 | 27 |
|
20 |
| -void end_roi() { |
21 |
| - gettimeofday(&tv1, nullptr); |
22 |
| - std::cout << TV_TO_SEC(tv1) - TV_TO_SEC(tv0) << std::endl; |
| 28 | + wmma::store_matrix_sync(c + K * block_x * 16 + block_y * 16, c_frag, K, wmma::mem_row_major); |
23 | 29 | }
|
24 | 30 |
|
25 |
| -extern "C" __global__ void default_function_kernel0( half* __restrict__ a, half* __restrict__ b, float* __restrict__ c) { |
26 |
| - |
27 |
| - for (int x_outer_inner = 0; x_outer_inner < 4; ++x_outer_inner) { |
28 |
| - for (int y_outer_inner = 0; y_outer_inner < 4; ++y_outer_inner) { |
29 |
| - |
30 |
| - wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag; |
31 |
| - |
32 |
| - wmma::fill_fragment(c_frag, 0.0f); |
33 |
| - |
34 |
| - wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag; |
35 |
| - wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag; |
36 |
| - |
37 |
| - |
38 |
| - for (int rv_outer = 0; rv_outer < 256; ++rv_outer) { |
39 |
| - |
40 |
| - half *ptr_a = &a[((((((int)blockIdx.x) * 262144) + (x_outer_inner * 65536)) + (rv_outer * 16)))]; |
41 |
| - wmma::load_matrix_sync(a_frag, ptr_a, 4096); |
42 |
| - half *ptr_b = &b[((((((int)threadIdx.x) * 262144) + (y_outer_inner * 65536)) + (rv_outer * 16)))]; |
43 |
| - wmma::load_matrix_sync(b_frag, ptr_b, 4096); |
44 |
| - wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); |
45 |
| - |
46 |
| - } |
47 |
| - __syncthreads(); |
48 |
| - |
49 |
| - float *ptr_c = &c[((((((((int)blockIdx.x) * 262144) + (x_outer_inner * 65536))) + (((int)threadIdx.x) * 64)) + (y_outer_inner * 16)))]; |
50 |
| - wmma::store_matrix_sync(ptr_c, c_frag, 4096, wmma::mem_row_major); |
| 31 | +half a[N * M], b[M * K]; |
| 32 | +float c[N * K], ref[N * K]; |
51 | 33 |
|
| 34 | +template<typename T> |
| 35 | +void print(int n, int m, const T* a) { |
| 36 | + for (int i = 0; i < n; ++i) { |
| 37 | + for (int j = 0; j < m; ++j) { |
| 38 | + if (j) std::cout << " "; |
| 39 | + std::cout << a[i * m + j]; |
52 | 40 | }
|
| 41 | + std::cout << std::endl; |
53 | 42 | }
|
54 | 43 | }
|
55 | 44 |
|
56 |
| -int main() { |
57 |
| - |
58 |
| - half *a, *b; |
59 |
| - float *c; |
60 |
| - |
61 |
| - cudaMalloc(&a, 4096 * 4096 * (sizeof (half))); |
62 |
| - cudaMalloc(&b, 4096 * 4096 * (sizeof (half))); |
63 |
| - cudaMalloc(&c, 4096 * 4096 * (sizeof (float))); |
64 |
| - |
65 |
| - begin_roi(); |
66 |
| - for (int i = 0; i < 10; ++i) { |
67 |
| - default_function_kernel0<<<64, 64>>>(a, b, c); |
| 45 | +template<> |
| 46 | +void print(int n, int m, const half* a) { |
| 47 | + for (int i = 0; i < n; ++i) { |
| 48 | + for (int j = 0; j < m; ++j) { |
| 49 | + if (j) std::cout << " "; |
| 50 | + std::cout << __half2float(a[i * m + j]); |
| 51 | + } |
| 52 | + std::cout << std::endl; |
68 | 53 | }
|
69 |
| - assert(cudaDeviceSynchronize() == cudaSuccess); |
70 |
| - end_roi(); |
| 54 | +} |
71 | 55 |
|
| 56 | +int main() { |
| 57 | + cudaDeviceProp prop; |
| 58 | + assert(cudaSuccess == cudaGetDeviceProperties(&prop, 0)); |
| 59 | + std::cout << "Warp size is: " << prop.warpSize << std::endl; |
| 60 | + |
| 61 | + for (int i = 0; i < N * M; ++i) |
| 62 | + a[i] = __float2half((float )rand() / RAND_MAX * 0.5); |
| 63 | + for (int i = 0; i < M * K; ++i) |
| 64 | + b[i] = __float2half((float) rand() / RAND_MAX * 0.5); |
| 65 | + for (int i = 0; i < N * K; ++i) |
| 66 | + c[i] = 0; |
| 67 | + for (int i = 0; i < N; ++i) |
| 68 | + for (int j = 0; j < K; ++j) { |
| 69 | + ref[i * K + j] = 0.0; |
| 70 | + for (int k = 0; k < M; ++k) |
| 71 | + ref[i * K + j] += __half2float(a[i * M + k]) * __half2float(b[k * K + j]); |
| 72 | + } |
| 73 | + half *dev_a, *dev_b; |
| 74 | + float *dev_c; |
| 75 | + cudaMalloc(&dev_a, N * M * sizeof(half)); |
| 76 | + cudaMalloc(&dev_b, M * K * sizeof(half)); |
| 77 | + cudaMalloc(&dev_c, N * K * sizeof(float)); |
| 78 | + cudaMemcpy(dev_a, a, sizeof a, cudaMemcpyHostToDevice); |
| 79 | + cudaMemcpy(dev_b, b, sizeof b, cudaMemcpyHostToDevice); |
| 80 | + cudaMemcpy(dev_c, c, sizeof c, cudaMemcpyHostToDevice); |
| 81 | + foo<<<4, 32>>>(dev_a, dev_b, dev_c); |
| 82 | + cudaDeviceSynchronize(); |
| 83 | + cudaMemcpy(c, dev_c, sizeof c, cudaMemcpyDeviceToHost); |
| 84 | + std::cout.precision(1); |
| 85 | + std::cout << std::fixed; |
| 86 | + //print(N, M, a); |
| 87 | + print(N, K, c); |
72 | 88 | return 0;
|
73 | 89 | }