were · Nov 16, 2020
diff --git a/‎apps/gpu/alone.py
+114 b/‎apps/gpu/alone.py
+114
diff --git a/‎apps/gpu/input
+1 b/‎apps/gpu/input
+1
diff --git a/‎apps/gpu/intersect
+37 b/‎apps/gpu/intersect
+37
diff --git a/‎apps/gpu/run.py
+13 b/‎apps/gpu/run.py
+13
diff --git a/‎apps/gpu/tune.py
+21 b/‎apps/gpu/tune.py
+21
diff --git a/‎gpu-tune.log
+288 b/‎gpu-tune.log
+288
diff --git a/‎python/tensorizer/intrinsics/cpu.py
+10-3 b/‎python/tensorizer/intrinsics/cpu.py
+10-3
diff --git a/‎python/tensorizer/intrinsics/gpu.py
+2-1 b/‎python/tensorizer/intrinsics/gpu.py
+2-1
diff --git a/‎python/tensorizer/ops/gpu.py
+36-10 b/‎python/tensorizer/ops/gpu.py
+36-10
diff --git a/‎python/tensorizer/tune.py
+30-2 b/‎python/tensorizer/tune.py
+30-2
@@ -0,0 +1,114 @@
+import tvm
+import tensorizer
+import logging
+import sys
+import numpy as np
+from tvm import relay
+from tvm import autotvm
+
+import topi
+from tvm.relay import op
+
+
+# Earlier input format, kept for reference: two tuples read with eval().
+#t0, t1 = eval(input())
+#n, c, h, w = map(int, t0)
+#oc, ic, kh, kw = map(int, t1)
+# One workload per run: ten whitespace-separated integers on stdin.
+# (n, c, h, w) is used as the input shape, (oc, ic, kh, kw) as the weight
+# shape and (sh, sw) as the strides of the conv2d built further down.
+n, c, h, w, oc, ic, kh, kw, sh, sw = map(int, input().split())
+
+# Number of positions a (kh, kw) window takes when stepped by (sh, sw) over
+# an (h, w) grid. Only ow is read by live code (to choose the padding modes).
+oh = (h - kh) // sh + 1
+ow = (w - kw) // sw + 1
+
+import time
+# Timestamp slot shared with tracer(); stays -1 until a pass has started.
+timing = -1
+
+def tracer(module, info, is_before):
+    """Pass-tracing hook that prints how long each pass took.
+
+    Given to ``tvm.transform.PassContext(trace=...)`` later in this script.
+    When ``is_before`` is truthy the current wall-clock time is stored in the
+    module-level ``timing``. Otherwise the time elapsed since that stamp is
+    printed, in milliseconds, next to ``info.name``. ``module`` is not used.
+    """
+    global timing
+    if not bool(is_before):
+        elapsed_ms = (time.time() - timing) * 1000
+        print('Executes: ', info.name, elapsed_ms)
+        return
+    timing = time.time()
+
+from tensorizer import tune
+# tune.enable gates the lookup of previously recorded split-K values in
+# tensorizer/ops/gpu.py; switch it off so the candidates set below are used.
+tune.enable = False
+
+# Best mean runtime seen so far and the (padding, splitk) pair that gave it.
+# 1e9 is only a "nothing measured yet" sentinel.
+# NOTE(review): nothing in this script prints or stores result/info after
+# the search; only the tracer output is emitted. Confirm that is intended.
+result = info = 1e9
+# Outer search: padding strategy. The 'fuse'/'pad' variants are only tried
+# when the output width is below 32.
+for i in [None, 'fuse', 'pad'] if ow < 32 else [None]:
+    # Inner search: split-K factor, doubled each round starting from 16.
+    j = 16
+    while True:
+        # All padding deltas are fixed at 0; the code that used to derive
+        # them is kept below, commented out.
+        diffc = diffoc = diffh = diffw = 0
+        #if c % 64:
+        #    diffc = 64 - c % 64
+
+        #if oc % 32:
+        #    diffoc = 32 - oc % 32
+
+        #can_fuse = can_pad = True
+        #if i == 'pad':
+        #    can_fuse = False
+        #if i == 'fuse':
+        #    can_pad = False
+        #if not ((oh * ow % 32 == 0 and 32 % ow == 0) or ow % 32 == 0):
+        #    first_h = sh - (h - kh) % sh
+        #    first_w = sw - (w - kw) % sw
+        #    max_diff_h = 32 - oh % 32
+        #    max_diff_w = 32 - ow % 32
+        #    diffh = diffw = 1e9
+        #    for i in range(max_diff_h + 1):
+        #        for j in range(max_diff_w + 1):
+        #            if (((oh + i) * (ow + j) % 32 == 0 and 32 % (ow + j) == 0 and can_fuse) or ((ow + j) % 32 == 0 and can_pad)) and i + j < diffh + diffw:
+        #                def to_pad(padding, first, stride):
+        #                    if padding == 0:
+        #                        return 0
+        #                    assert padding >= 1
+        #                    return (padding - 1) * stride + first
+        #                diffh, diffw = to_pad(i, first_h, sh), to_pad(j, first_w, sw)
+        #    #assert (height + diffh - kh + 1) * (width + diffw - kw + 1) % 32 == 0
+
+
+        #var_x = relay.var('x', shape=(n, (c + diffc) // 16, (h + diffh), (w + diffw), 16), dtype='float16')
+        #var_w = relay.const(tvm.nd.array((np.random.randn((oc + diffoc) // 16, (c + diffc) // 16, kh, kw, 16, 16) * 128).astype('float16')))
+        #conv2d = relay.nn.conv2d(var_x, var_w, out_dtype='float32', kernel_size=(kh, kw), channels=oc + diffoc, strides=(sh, sw), data_layout='NCHW16c', kernel_layout='OIHW16i16o')
+        #if diffc or diffoc or diffh or diffw:
+        #    y = relay.strided_slice(conv2d,
+        #                            begin=relay.const(tvm.nd.array([0, 0, 0, 0])),
+        #                            end=relay.const(tvm.nd.array([n, oc, oh, ow])))
+        #else:
+        #    y = conv2d
+        # Single-convolution graph: float32 input variable, random constant
+        # weights, no bias applied.
+        # NOTE(review): var_b is created but never used in the graph.
+        var_x = relay.var('x', shape=(n, c, h, w), dtype='float32')
+        var_w = relay.const(tvm.nd.array((np.random.randn(oc, ic, kh, kw) * 128).astype('float32')))
+        var_b = relay.const(tvm.nd.array((np.random.randn(1, oc, 1, 1) * 128).astype('float32')))
+        conv2d = relay.nn.conv2d(var_x, var_w, out_dtype='float32', kernel_size=(kh, kw), channels=oc, strides=(sh, sw), out_layout='NCHW16c')
+        y = conv2d
+
+        func = relay.Function([var_x], y)
+        module = tvm.IRModule()
+        module['main'] = func
+
+        # Publish the current candidate through the shared tune module.
+        # tune.splitk is read by the schedules in tensorizer/ops/gpu.py;
+        # presumably tune.padding is read elsewhere in tensorizer (not shown).
+        tune.padding = i
+        tune.splitk = j
+        # Hand tensorizer.rewrite to the build as an extra lowering pass.
+        passes = [(1, tensorizer.rewrite)]
+        with tvm.transform.PassContext(opt_level=0, trace=tracer, config={'tir.add_lower_pass': passes}):
+        #with tvm.transform.PassContext(opt_level=4, trace=tracer):
+            #graph, lib, params = tvm.relay.build(module, target='cuda -libs=cublas,cudnn')
+            graph, lib, params = tvm.relay.build(module, target='nvptx -libs=cublas,cudnn')
+            # The second import rebinds `runtime`, so the debug runtime is
+            # the one actually used; the first import has no effect.
+            from tvm.contrib import graph_runtime as runtime
+            from tvm.contrib.debugger import debug_runtime as runtime
+            # `func` is reused here: from this point it is the runtime module,
+            # no longer the relay.Function built above.
+            func = runtime.create(graph, lib, tvm.gpu())
+
+            x_ =(np.random.randn(n, c, h, w) * 128).astype('float32')
+            func.set_input('x', x_)
+            timer = func.module.time_evaluator('run', ctx=tvm.gpu(), number=2, repeat=10)
+
+            # Re-measure until the variance of the repeat results is at most
+            # 1e-5.
+            # NOTE(review): there is no retry cap, so a noisy machine can
+            # keep this loop spinning indefinitely.
+            timed = timer()
+            while np.var(timed.results) > 1e-5:
+                timed = timer()
+
+            # Keep the fastest candidate.
+            if timed.mean < result:
+                result = timed.mean
+                info = (i, j)
+
+
+        # Presumably drops cached compilation results so the next candidate
+        # is scheduled and built again rather than reused - TODO confirm.
+        relay.backend.compile_engine.get().clear()
+        # Next split-K candidate. tune.total_idx is assigned by the schedule
+        # code in tensorizer/ops/gpu.py (the largest of 16/32/64 dividing the
+        # reduce extent) whenever tune.splitk is set.
+        # NOTE(review): if no schedule ran, total_idx is still None and this
+        # comparison raises TypeError.
+        j <<= 1
+        if j > tune.total_idx:
+            break
@@ -0,0 +1 @@
+1 608 14 14 192 608 1 1 1 1
@@ -0,0 +1,37 @@
+(1, 576, 14, 14, 192, 576, 1, 1, 1, 1, 8.4071, 15.396700000000001)
+(1, 160, 9, 9, 224, 160, 3, 3, 1, 1, 9.589333333333332, 13.485766666666668)
+(1, 2048, 8, 8, 384, 2048, 1, 1, 1, 1, 13.051733333333335, 17.1916)
+(1, 64, 58, 58, 128, 64, 3, 3, 2, 2, 15.326533333333337, 18.845066666666664)
+(1, 64, 56, 56, 128, 64, 1, 1, 2, 2, 5.128066666666667, 6.184633333333333)
+(1, 1056, 7, 7, 192, 1056, 1, 1, 1, 1, 6.848499999999999, 8.254633333333333)
+(1, 64, 29, 29, 96, 64, 3, 3, 1, 1, 19.699066666666667, 22.661566666666666)
+(1, 576, 14, 14, 128, 576, 1, 1, 1, 1, 7.0441, 8.0889)
+(1, 1024, 14, 14, 512, 1024, 1, 1, 1, 1, 28.08936666666667, 32.23866666666667)
+(1, 160, 17, 23, 192, 160, 1, 7, 1, 1, 21.459366666666668, 24.52113333333334)
+(1, 288, 35, 35, 384, 288, 3, 3, 2, 2, 86.23503333333333, 98.33573333333334)
+(1, 128, 16, 16, 128, 128, 3, 3, 1, 1, 10.165566666666667, 11.5552)
+(1, 192, 23, 17, 192, 192, 7, 1, 1, 1, 26.066133333333333, 29.171766666666667)
+(1, 768, 17, 17, 128, 768, 1, 1, 1, 1, 12.296766666666663, 13.711066666666666)
+(1, 768, 17, 17, 160, 768, 1, 1, 1, 1, 14.967966666666669, 16.653766666666666)
+(1, 256, 56, 56, 128, 256, 1, 1, 2, 2, 9.581933333333334, 10.642633333333334)
+(1, 192, 17, 17, 320, 192, 3, 3, 2, 2, 11.066666666666668, 12.182533333333334)
+(1, 96, 16, 16, 128, 96, 3, 3, 1, 1, 8.426166666666667, 9.266100000000003)
+(1, 128, 16, 16, 160, 128, 3, 3, 1, 1, 11.457933333333335, 12.278133333333335)
+(1, 1280, 8, 8, 384, 1280, 1, 1, 1, 1, 11.535666666666666, 12.33676666666667)
+(1, 64, 16, 16, 96, 64, 3, 3, 1, 1, 6.273666666666665, 6.7090666666666685)
+(1, 192, 16, 16, 192, 192, 3, 3, 1, 1, 17.286700000000003, 18.317766666666667)
+(1, 160, 23, 17, 192, 160, 7, 1, 1, 1, 25.39276666666667, 26.673099999999998)
+(1, 768, 17, 17, 192, 768, 1, 1, 1, 1, 17.378100000000003, 18.132166666666663)
+(1, 1024, 14, 14, 256, 1024, 1, 1, 1, 1, 17.09733333333333, 17.7697)
+(1, 256, 16, 16, 256, 256, 3, 3, 1, 1, 29.66086666666666, 30.810766666666673)
+(1, 576, 14, 14, 64, 576, 1, 1, 1, 1, 5.460666666666666, 5.6329666666666665)
+(1, 608, 14, 14, 192, 608, 1, 1, 1, 1, 8.621333333333334, 8.864866666666666)
+(1, 192, 27, 27, 64, 192, 1, 1, 1, 1, 6.770499999999999, 6.957433333333333)
+(1, 192, 16, 16, 256, 192, 3, 3, 1, 1, 23.1803, 23.818533333333335)
+(1, 160, 17, 23, 160, 160, 1, 7, 1, 1, 19.4532, 19.969299999999997)
+(1, 256, 27, 27, 64, 256, 1, 1, 1, 1, 8.121599999999999, 8.312833333333332)
+(1, 448, 10, 10, 384, 448, 3, 3, 1, 1, 21.378566666666664, 21.486066666666662)
+(1, 2048, 8, 8, 448, 2048, 1, 1, 1, 1, 20.103566666666666, 20.196166666666663)
+(1, 128, 28, 28, 512, 128, 1, 1, 1, 1, 14.981066666666665, 15.042333333333334)
+(1, 32, 149, 149, 32, 32, 3, 3, 1, 1, 42.16560000000001, 42.2847)
+(1, 80, 73, 73, 192, 80, 3, 3, 1, 1, 362.71533333333326, 363.31000000000006)
@@ -0,0 +1,13 @@
+"""Run the tuner once for every distinct workload listed in ./intersect."""
+import ast
+import subprocess
+
+# Each non-blank line of ./intersect is a tuple literal. Everything except
+# its last two (float) fields is the workload, re-serialised as the
+# space-separated integers the tuner reads from stdin. The records are plain
+# numeric tuples, so ast.literal_eval parses them without executing code.
+with open('./intersect', 'r') as records:
+    shapes = {
+        ' '.join(map(str, ast.literal_eval(line)[:-2]))
+        for line in records
+        if line.strip()
+    }
+
+# The record file is closed before any workload is written. Sorting makes
+# the tuning order reproducible between runs.
+for shape in sorted(shapes):
+    with open('input', 'w') as workload:
+        workload.write(shape)
+    print('tuning:', shape)
+    # Equivalent to `python relay.py < input`, without going through a shell.
+    with open('input', 'r') as workload:
+        subprocess.check_output(['python', 'relay.py'], stdin=workload)
@@ -0,0 +1,21 @@
+"""Run the tuner once for each workload in the fixed list below."""
+import subprocess
+
+# An earlier variant (previously kept here as commented-out code) read the
+# workloads from /home/ubuntu/shapes.raw; the list is now fixed in-source.
+# Each tuple is one workload: ten integers that are written to ./input and
+# fed to relay.py on stdin.
+shapes = [
+    (1, 288, 35, 35, 384, 288, 3, 3, 2, 2),
+    (1, 160, 9, 9, 224, 160, 3, 3, 1, 1),
+    (1, 1056, 7, 7, 192, 1056, 1, 1, 1, 1),
+    (1, 80, 73, 73, 192, 80, 3, 3, 1, 1),
+    (1, 128, 16, 16, 128, 128, 3, 3, 1, 1),
+    (1, 192, 16, 16, 192, 192, 3, 3, 1, 1),
+    (1, 256, 16, 16, 256, 256, 3, 3, 1, 1),
+    (1, 1024, 14, 14, 512, 1024, 1, 1, 1, 1),
+    (1, 128, 16, 16, 160, 128, 3, 3, 1, 1),
+    (1, 576, 14, 14, 192, 576, 1, 1, 1, 1),
+    (1, 96, 16, 16, 128, 96, 3, 3, 1, 1),
+    (1, 1024, 14, 14, 256, 1024, 1, 1, 1, 1),
+    (1, 576, 14, 14, 128, 576, 1, 1, 1, 1),
+    (1, 64, 29, 29, 96, 64, 3, 3, 1, 1),
+    (1, 64, 56, 56, 128, 64, 1, 1, 2, 2),
+    (1, 608, 14, 14, 192, 608, 1, 1, 1, 1),
+]
+
+for shape in shapes:
+    with open('input', 'w') as workload:
+        workload.write(' '.join(map(str, shape)))
+    print('tuning:', shape)
+    # Equivalent to `python relay.py < input`, without going through a shell.
+    with open('input', 'r') as workload:
+        subprocess.check_output(['python', 'relay.py'], stdin=workload)
@@ -75,8 +75,15 @@ def callback(op):
 
             if tune.cpu_idx is None:
                 to_apply = points[0][-1]
-                #with open('/home/ubuntu/Tensorization-PoC/cpu-shapes.log', 'a') as f:
-                #    f.write(f'{tune.ashape} {tune.bshape} {tune.strides}\n')
+                import os
+                HOME = os.getenv("HOME")
+                try:
+                    f = open(HOME + '/Tensorization-PoC/cpu-shapes.log', 'a')
+                except:
+                    f = open(HOME + '/UNIT/cpu-shapes.log', 'a')
+                except:
+                    assert False
+                f.write(f'{tune.ashape} {tune.bshape} {tune.strides}\n')
                 if (tune.ashape, tune.bshape, tune.strides) in tune.x86.keys():
                     to_apply = points[tune.x86[(tune.ashape, tune.bshape, tune.strides)]][-1]
             else:
@@ -182,4 +189,4 @@ def callback(op):
 arm_operand = functools.partial(loader, cast_type='int8x16')
 arm_writeback = functools.partial(writer, llvm_intrin='llvm.aarch64.neon.sdot.v4i32.v16i8', dtype='int32x4')
 from .pattern import arm_sdot128_i8i16
-arm_schedule = functools.partial(schedule, pattern=arm_sdot128_i8i16, pragma='vdot', max_threads=10000)
+arm_schedule = functools.partial(schedule, pattern=arm_sdot128_i8i16, pragma='vdot', max_threads=10000)
@@ -213,4 +213,5 @@ def cleanup(store, axis, operands):
     res = [res, tvm.tir.Evaluate(tvm.tir.call_llvm_intrin('handle', 'llvm.nvvm.barrier0', tvm.tir.const(0, 'int32')))]
 
     res = tvm.tir.SeqStmt(res)
-    return res
+    return res
+
@@ -3,6 +3,7 @@
 from tvm import te
 from tvm import autotvm
 import tvm
+from tensorizer import tune
 
 @autotvm.register_topi_compute('conv2d_NCHW16c_OHWI16o.nvptx')
 def _conv2d_NCHW16c_OHWI16o_impl(cfg, a, b, stride_h, stride_w, out_type):
@@ -70,7 +71,17 @@ def schedule_fetcher(sch, buffer, y, x):
         sch[buffer].vectorize(xi)
 
     rc = sch[conv].op.reduce_axis[0]
-    rco, rci = sch[conv].split(rc, 64)
+    for i in [16, 32, 64]:
+        if rc.dom.extent.value % i == 0:
+            split_k = i
+    print('!!!!!!!!!!!!!!!')
+    print(tune.splitk)
+    if tune.splitk is not None:
+        tune.total_idx = split_k
+        split_k = tune.splitk
+
+    rc = sch[conv].op.reduce_axis[0]
+    rco, rci = sch[conv].split(rc, split_k)
     rcio, rcii = sch[conv].split(rci, 16)
     rf = sch.rfactor(conv, rcio)
     cc = sch.cache_write(rf, 'wmma.accumulator')
@@ -83,7 +94,9 @@ def schedule_fetcher(sch, buffer, y, x):
     xyio, xyii = sch[conv].split(xyi, 16)
     obo, obi = sch[conv].split(ob, 8)
     sch[conv].reorder(batch, oco, xyo, oci, xyio, xyii, obo, obi)
-    sch[conv].bind(sch[conv].fuse(oci, xyio), te.thread_axis('threadIdx.y'))
+    fused = sch[conv].fuse(oci, xyio)
+    fo, fi = sch[conv].split(fused, split_k // 16)
+    sch[conv].bind(fi, te.thread_axis('threadIdx.y'))
     sch[conv].bind(sch[conv].fuse(xyii, obo), te.thread_axis('threadIdx.x'))
     sch[conv].vectorize(obi)
     sch[rf].compute_at(sch[conv], xyo)
@@ -99,7 +112,9 @@ def schedule_fetcher(sch, buffer, y, x):
         xyio, xyii = sch[output].split(xyi, 16)
         obo, obi = sch[output].split(ob, 8)
         sch[output].reorder(batch, oco, xyo, oci, xyio, xyii, obo, obi)
-        sch[output].bind(sch[output].fuse(oci, xyio), te.thread_axis('threadIdx.y'))
+        fused = sch[output].fuse(oci, xyio)
+        fo, fi = sch[output].split(fused, split_k // 16)
+        sch[output].bind(fi, te.thread_axis('threadIdx.y'))
         sch[output].bind(sch[output].fuse(xyii, obo), te.thread_axis('threadIdx.x'))
         sch[output].vectorize(obi)
         sch[output].bind(oco, te.thread_axis('blockIdx.y'))
@@ -134,7 +149,7 @@ def schedule_fetcher(sch, buffer, y, x):
         sch[aaii].compute_at(sch[cc], crw)
         sch[a_icol].compute_inline()
         fused = sch[aaii].fuse(sch[aaii].op.axis[1], sch[aaii].op.axis[2], sch[aaii].op.axis[3])
-        fo, fi = sch[aaii].split(fused, nparts=4)
+        fo, fi = sch[aaii].split(fused, nparts=split_k // 16)
         fio, fii = sch[aaii].split(fi, nparts=32)
         sch[aaii].bind(fo, te.thread_axis('threadIdx.y'))
         sch[aaii].bind(fio, te.thread_axis('threadIdx.x'))
@@ -143,10 +158,10 @@ def schedule_fetcher(sch, buffer, y, x):
     else:
         a_reuse = sch.cache_read(a, 'shared', [cc])
         sch[a_reuse].compute_at(sch[cc], crcio)
-        schedule_fetcher(sch, a_reuse, 4, 32)
+        schedule_fetcher(sch, a_reuse, split_k // 16, 32)
         a_shared = sch.cache_read(a_reuse, 'shared', [cc])
         sch[a_shared].compute_at(sch[cc], crw)
-        schedule_fetcher(sch, a_shared, 4, 32)
+        schedule_fetcher(sch, a_shared, split_k // 16, 32)
 
     aa = sch.cache_read(a_shared, 'wmma.matrix_a', [cc])
     #aa = sch.cache_read(a, 'wmma.matrix_a', [cc])
@@ -168,9 +183,11 @@ def _conv2d_schedule_wdim(sch, conv, output, stride_h, stride_w):
         if rc.dom.extent.value % i == 0:
             split_k = i
 
-    if stride_h != 1 or stride_w != 1:
-        split_k = 16
-
+    print('!!!!!!!!!!!!!!!!!')
+    print(tune.splitk)
+    if tune.splitk is not None:
+        tune.total_idx = split_k
+        split_k = tune.splitk
 
     rc = sch[conv].op.reduce_axis[0]
     rco, rci = sch[conv].split(rc, split_k)
@@ -186,7 +203,8 @@ def _conv2d_schedule_wdim(sch, conv, output, stride_h, stride_w):
     sch[conv].reorder(batch, x, yo, oco, oo, oci, yio, oio, yii, oii)
     sch[rf].compute_at(sch[conv], oo)
     fused = sch[conv].fuse(oci, yio, oio)
-    sch[conv].bind(fused, te.thread_axis('threadIdx.y'))
+    fo, fi = sch[conv].split(fused, split_k // 16)
+    sch[conv].bind(fi, te.thread_axis('threadIdx.y'))
     vo, vi = sch[conv].split(oii, 8)
     sch[conv].vectorize(vi)
     fused = sch[conv].fuse(yii, vo)
@@ -269,10 +287,16 @@ def callback(op):
         nonlocal sch
         if len(list(op.reduce_axis)):
             a, b = op.input_tensors
+            tune.ashape = get_const_tuple(a.shape)
+            tune.bshape = get_const_tuple(b.shape)
 
             conv = op.output(0)
             n, c, h, w, _ = get_const_tuple(conv.shape)
             stride_h, stride_w = attrs.get_int_tuple('strides')
+            tune.strides = (stride_h, stride_w)
+            ky = tune.ashape, tune.bshape, (stride_h, stride_w)
+            if tune.enable and ky in tune.cuda_kernel.keys():
+                tune.splitk = int(tune.cuda_kernel[ky])
             if w % 32 == 0:
                 _conv2d_schedule_wdim(sch, conv, output, stride_h, stride_w)
             else:
@@ -281,4 +305,6 @@ def callback(op):
 
     traverse_inline(sch, output, callback)
 
+    tune.splitk = None
+
     return sch
@@ -1,3 +1,7 @@
+import os
+
+enable = False
+
 cpu_idx = None
 total_idx = None
 ashape = None
@@ -8,13 +12,37 @@
 splitk = None
 x86 = {}
 
+HOME = os.getenv("HOME")
+
 def load_x86():
+    # Fill the module-level x86 table from cpu-tune.log. The log is looked up
+    # under $HOME/Tensorization-PoC first, then under $HOME/UNIT.
+    # NOTE(review): the bare except also hides errors other than a missing
+    # file, and the opened file is never closed.
-    for i in open('/home/ubuntu/Tensorization-PoC/cpu-tune.log').readlines():
+    try:
+        f = open(HOME + '/Tensorization-PoC/cpu-tune.log')
+    except:
+        f = open(HOME + '/UNIT/cpu-tune.log')
+    for i in f.readlines():
+        # Turn the space-separated tuples on the line into one comma-separated
+        # tuple expression that eval() can parse.
+        # NOTE(review): eval() executes whatever the log contains; the file
+        # must be trusted.
         i = i.replace(') ', '), ')
+        # A record carries either four fields or six; the two extra trailing
+        # fields of the long form are ignored.
         try:
             a, b, s, v = eval(i)
         except:
             a, b, s, v, _, _ = eval(i)
+        # Keyed the same way the CPU intrinsic looks it up:
+        # (tune.ashape, tune.bshape, tune.strides).
         x86[(a, b, s)] = v
 
-load_x86()
+cuda_kernel = {}
+cuda_relay = {}
+def load_cuda():
+    """Fill ``cuda_kernel`` and ``cuda_relay`` from ``gpu-tune.log``.
+
+    The log is looked up under ``$HOME/Tensorization-PoC`` first and under
+    ``$HOME/UNIT`` if that cannot be opened. Records alternate line by line:
+
+    * even lines (0, 2, ...) hold five space-separated Python literals
+      ``a b s v _``; ``cuda_kernel[(a, b, s)]`` is set to ``v``.
+    * odd lines hold twelve whitespace-separated fields: ten integers
+      ``N C H W O I KH KW SH SW``, a value ``v`` and one ignored field.
+      ``cuda_relay`` maps the integer tuple to ``v`` kept as a string with
+      any surrounding commas stripped.
+
+    Raises:
+        OSError: if the log exists in neither location.
+        ValueError: if a line does not have the expected number of fields.
+    """
+    # Only a failure to open the first path should trigger the fallback.
+    try:
+        f = open(HOME + '/Tensorization-PoC/gpu-tune.log')
+    except OSError:
+        f = open(HOME + '/UNIT/gpu-tune.log')
+    # Read everything up front so the handle is closed before parsing.
+    with f:
+        raw = f.readlines()
+    for line in raw[::2]:
+        # Turn the space-separated tuples into one comma-separated tuple
+        # expression that eval() can parse.
+        line = line.replace(') ', '), ')
+        # NOTE(review): eval() executes whatever the log contains; the file
+        # must be trusted. Kept because the exact log grammar is not visible
+        # from this module.
+        a, b, s, v, _ = eval(line)
+        cuda_kernel[(a, b, s)] = v
+    for line in raw[1::2]:
+        N, C, H, W, O, I, KH, KW, SH, SW, v, _ = line.split()
+        v = v.strip(',')
+        cuda_relay[tuple(map(int, (N, C, H, W, O, I, KH, KW, SH, SW)))] = v
+
+load_x86()
+load_cuda()