Skip to content

Commit 31f86dc

Browse files
author
Jian Weng
committed
hmmmmm
1 parent d4ccf23 commit 31f86dc

File tree

5 files changed

+26
-33
lines changed

5 files changed

+26
-33
lines changed

models/arm/compile.py

+2
Original file line number | Diff line number | Diff line change
@@ -95,6 +95,8 @@ def tracer(module, info, is_before):
9595

9696
print('Model Load!')
9797
import tensorizer
98+
from tensorizer import tune
99+
tune.enable = True
98100
with tvm.transform.PassContext(config={'tir.add_lower_pass': [(1, tensorizer.rewrite)]},
99101
trace=tracer, opt_level=3):
100102
graph, lib, params = relay.build_module.build(

models/cuda/compile.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,6 @@ def load_model(symbol_file, param_file, logger=None):
6262

6363

6464
def compile_via_tvm(sym, arg_params, aux_params, symbol_file, data_shape):
65-
tune = False
6665

6766
input_shape = [1] + list(data_shape)
6867
input_dict = {'data': input_shape}
@@ -91,6 +90,8 @@ def tracer(module, info, is_before):
9190
print('Executes: ', info.name, (time.time() - timing) * 1000)
9291

9392
import tensorizer
93+
from tensorizer import tune
94+
tune.enable = True
9495
with tvm.transform.PassContext(config={'tir.add_lower_pass': [(1, tensorizer.rewrite)]},
9596
trace=tracer, opt_level=3, disabled_pass=['FoldScaleAxis']):
9697
#with tvm.transform.PassContext(trace=tracer, opt_level=4):

poc/tensorcore/conv-tc.cu

+12-9
Original file line number | Diff line number | Diff line change
@@ -118,17 +118,20 @@ int main() {
118118
yDesc, y) );
119119
cudaDeviceSynchronize();
120120
begin_roi();
121-
checkCudnnErr( cudnnConvolutionForward(handle, (void*)(&alpha),
122-
xDesc, x,
123-
wDesc, w, convDesc, algo,
124-
workSpace, workSpaceSize,
125-
(void*)(&beta),
126-
yDesc, y) );
127-
checkCudaErr(cudaDeviceSynchronize());
128-
auto elps = end_roi();
121+
for (int i = 0; i < 10; ++i) {
122+
checkCudnnErr( cudnnConvolutionForward(handle, (void*)(&alpha),
123+
xDesc, x,
124+
wDesc, w, convDesc, algo,
125+
workSpace, workSpaceSize,
126+
(void*)(&beta),
127+
yDesc, y) );
128+
checkCudaErr(cudaDeviceSynchronize());
129+
}
130+
double elps = end_roi();
131+
elps /= 10.;
129132
std::cout << "Exec: " << elps << "us" << std::endl;
130133
std::cout << ((double) dimY[0] * dimY[1] * dimY[2] * dimY[3] * dimB[1] * dimB[2] * dimB[3]) / elps / 1000.
131134
<< " GFLOP/s" << std::endl;
132135

133136
return 0;
134-
}
137+
}

poc/tensorcore/input

-13
This file was deleted.

poc/tensorcore/run.py

+10-10
Original file line number | Diff line number | Diff line change
@@ -14,26 +14,26 @@
1414
(1024,15,15,2048,1024,1,1,2,2),
1515
(128,65,65,256,128,3,3,2,2)]
1616

17+
workloads = [(1, 288, 35, 35, 384, 288, 3, 3, 2, 2), (1, 160, 9, 9, 224, 160, 3, 3, 1, 1), (1, 1056, 7, 7, 192, 1056, 1, 1, 1, 1), (1, 80, 73, 73, 192, 80, 3, 3, 1, 1), (1, 128, 16, 16, 128, 128, 3, 3, 1, 1), (1, 192, 16, 16, 192, 192, 3, 3, 1, 1), (1, 256, 16, 16, 256, 256, 3, 3, 1, 1), (1, 1024, 14, 14, 512, 1024, 1, 1, 1, 1), (1, 128, 16, 16, 160, 128, 3, 3, 1, 1), (1, 576, 14, 14, 192, 576, 1, 1, 1, 1), (1, 96, 16, 16, 128, 96, 3, 3, 1, 1), (1, 1024, 14, 14, 256, 1024, 1, 1, 1, 1), (1, 576, 14, 14, 128, 576, 1, 1, 1, 1), (1, 64, 29, 29, 96, 64, 3, 3, 1, 1), (1, 64, 56, 56, 128, 64, 1, 1, 2, 2), (1, 608, 14, 14, 192, 608, 1, 1, 1, 1)]
18+
1719
for i in workloads:
20+
res = []
1821
for dtype in [0, 2]:
19-
for tc in range(2):
22+
for tc in range(2 if dtype == 2 else 1):
2023
exec_time = []
2124
for algo in range(8):
2225
with open('input', 'w') as f:
23-
f.write('1\n')
26+
#f.write('1\n')
2427
for j in i:
2528
f.write(str(j) + '\n')
2629
f.write(str(dtype) + '\n')
2730
f.write(str(algo) + '\n')
2831
f.write(str(tc) + '\n')
2932
try:
30-
avg = []
31-
for j in range(10):
32-
res = subprocess.check_output('./tensorcore.nvcc < input', shell=True).decode('utf-8')
33-
res = int(res.split('\n')[0].lstrip('Exec: ').rstrip('us'))
34-
avg.append(res)
35-
exec_time.append(sum(avg) / len(avg))
33+
proc = subprocess.check_output('./tensorcore.nvcc < input', shell=True).decode('utf-8')
34+
proc = float(proc.split('\n')[0].lstrip('Exec: ').rstrip('us'))
35+
exec_time.append(proc)
3636
except:
3737
pass
38-
#print(i, algo, tc, 'fails')
39-
print(i, ['fp32', None, 'fp16'][dtype], tc, min(exec_time))
38+
res.append(min(exec_time))
39+
print(i, ['fp32', None, 'fp16'][dtype], tc, res)

0 commit comments

Comments
 (0)