diff --git a/test/microbench/batch_norm_1d.py b/test/microbench/batch_norm_1d.py index 1a9bed77e..1d837ccd2 100644 --- a/test/microbench/batch_norm_1d.py +++ b/test/microbench/batch_norm_1d.py @@ -1,7 +1,15 @@ import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" +if torch.cuda.is_available(): + device = "cuda" + activity = ProfilerActivity.CUDA + table_key = "cuda_time_total" +else: + device = "xpu" + activity = ProfilerActivity.XPU + table_key = "xpu_time_total" + shape_list = [((64, 8), (8)), ((4, 128, 15000), (128)), ((4, 256, 512), (256))] @@ -29,7 +37,7 @@ backward, ) with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True + activities=[ProfilerActivity.CPU, activity], record_shapes=True ) as prof: for i in range(20): m = torch.nn.BatchNorm1d(shape[1], device=device) @@ -37,4 +45,4 @@ if backward: gy = torch.empty_like(output) output.backward(gy) - print(prof.key_averages().table(sort_by="xpu_time_total")) + print(prof.key_averages().table(sort_by=table_key)) diff --git a/test/microbench/batch_norm_2d.py b/test/microbench/batch_norm_2d.py index 1130e6209..aee88507c 100644 --- a/test/microbench/batch_norm_2d.py +++ b/test/microbench/batch_norm_2d.py @@ -1,7 +1,15 @@ import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" +if torch.cuda.is_available(): + device = "cuda" + activity = ProfilerActivity.CUDA + table_key = "cuda_time_total" +else: + device = "xpu" + activity = ProfilerActivity.XPU + table_key = "xpu_time_total" + shape_list = [ (256, 256, 56, 56, 256), @@ -20,14 +28,14 @@ def BTN2d(shape, dtype, channels_last, backward): input = ( torch.randn(N, C, H, W) .to(memory_format=torch.channels_last) - .to(device="xpu", dtype=dtype) + .to(device=device, dtype=dtype) ) else: - input = torch.randn(N, C, H, W).to(device="xpu", dtype=dtype) + input = torch.randn(N, C, H, W).to(device=device, dtype=dtype) if backward: input.requires_grad_(True) - grad = torch.randn([C, H, W]).to(device="xpu", dtype=dtype) + grad = torch.randn([C, H, W]).to(device=device, dtype=dtype) BTN = torch.nn.BatchNorm2d(shape[4], device=device) @@ -59,9 +67,9 @@ def BTN2d(shape, dtype, channels_last, backward): backward, ) with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], + activities=[ProfilerActivity.CPU, activity], record_shapes=True, ) as prof: for i in range(20): BTN2d(shape, dtype, channels_last, backward=True) - print(prof.key_averages().table(sort_by="xpu_time_total")) + print(prof.key_averages().table(sort_by=table_key)) diff --git a/test/microbench/batch_norm_3d.py b/test/microbench/batch_norm_3d.py index 5bf376574..a7fbb0769 100644 --- a/test/microbench/batch_norm_3d.py +++ b/test/microbench/batch_norm_3d.py @@ -1,7 +1,15 @@ import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" +if torch.cuda.is_available(): + device = "cuda" + activity = ProfilerActivity.CUDA + table_key = "cuda_time_total" +else: + device = "xpu" + activity = ProfilerActivity.XPU + table_key = "xpu_time_total" + shape_list = [(2, 5, 6, 3, 5, 5), (2, 8, 64, 64, 64, 8), (16, 16, 128, 128, 256, 16)] @@ -20,14 +28,14 @@ def BTN3d(shape, dtype, channels_last, backward): input = ( torch.randn(N, C, D, H, W) .to(memory_format=torch.channels_last_3d) - .to(device="xpu", dtype=dtype) + .to(device=device, dtype=dtype) ) else: - input = torch.randn(N, C, D, H, W).to(device="xpu", dtype=dtype) + input = torch.randn(N, C, D, H, W).to(device=device, dtype=dtype) if backward: input.requires_grad_(True) - grad = torch.randn([C, D, H, W]).to(device="xpu", dtype=dtype) + grad = torch.randn([C, D, H, W]).to(device=device, dtype=dtype) BTN = torch.nn.BatchNorm3d(shape[5], device=device) @@ -59,9 +67,9 @@ def BTN3d(shape, dtype, channels_last, backward): backward, ) with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], + activities=[ProfilerActivity.CPU, activity], record_shapes=True, ) as prof: for i in range(20): BTN3d(shape, dtype, channels_last, backward=True) - print(prof.key_averages().table(sort_by="xpu_time_total")) + print(prof.key_averages().table(sort_by=table_key)) diff --git a/test/microbench/group_norm.py b/test/microbench/group_norm.py index 4a6b471a6..f61795ac5 100644 --- a/test/microbench/group_norm.py +++ b/test/microbench/group_norm.py @@ -1,9 +1,19 @@ import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" +if torch.cuda.is_available(): + device = "cuda" + activity = ProfilerActivity.CUDA + table_key = "cuda_time_total" +else: + device = "xpu" + activity = ProfilerActivity.XPU + table_key = "xpu_time_total" + + backward = True + shape_list = [ (1, 32, 128, 32, 32), # all channel for 1 group (16, 1024, 128, 32, 32), # normal shape, big memory @@ -64,7 +74,7 @@ backward, ) with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], + activities=[ProfilerActivity.CPU, activity], record_shapes=True, ) as prof: for i in range(20): @@ -73,4 +83,4 @@ if backward: grad_out = torch.randn_like(output).to(device) (grad_dpcpp,) = torch.autograd.grad(output, input, grad_out) - print(prof.key_averages().table(sort_by="xpu_time_total")) + print(prof.key_averages().table(sort_by=table_key)) diff --git a/test/microbench/layer_norm.py b/test/microbench/layer_norm.py index 9262a8a8c..c597a7cd1 100644 --- a/test/microbench/layer_norm.py +++ b/test/microbench/layer_norm.py @@ -1,9 +1,19 @@ import torch from torch.profiler import profile, ProfilerActivity -device = "xpu" +if torch.cuda.is_available(): + device = "cuda" + activity = ProfilerActivity.CUDA + table_key = "cuda_time_total" +else: + device = "xpu" + activity = ProfilerActivity.XPU + table_key = "xpu_time_total" + + backward = True + shape_list = [ ((1, 1024), (1024)), ((2, 4096, 320), (4096, 320)), @@ -38,7 +48,7 @@ backward, ) with profile( - activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True + activities=[ProfilerActivity.CPU, activity], record_shapes=True ) as prof: for i in range(20): m = torch.nn.LayerNorm(shape[1], device=device, dtype=dtype) @@ -46,4 +56,4 @@ if backward: gy = torch.empty_like(output) output.backward(gy) - print(prof.key_averages().table(sort_by="xpu_time_total")) + print(prof.key_averages().table(sort_by=table_key)) diff --git a/tools/check_op_perf.py b/tools/check_op_perf.py new file mode 100644 index 000000000..61a03fbc9 --- /dev/null +++ b/tools/check_op_perf.py @@ -0,0 +1,87 @@ +import os +import subprocess +from pathlib import Path + + +def find_pytorch_dir(): + path = Path(__file__).resolve() + while path != path.root: + if path.name == "torch-xpu-ops": + return str(path) + path = path.parent + return '' + + +OP_LIST = { + 'layer_norm.py': ['aten::native_layer_norm', 'aten::native_layer_norm_backward'], + 'group_norm.py': ['aten::native_group_norm', 'aten::native_group_norm_backward'], + 'batch_norm_1d.py': [('aten::native_batch_norm', 'aten::cudnn_batch_norm'), + ('aten::native_batch_norm_backward', 'aten::cudnn_batch_norm_backward')], + 'batch_norm_2d.py': [('aten::native_batch_norm', 'aten::cudnn_batch_norm'), + ('aten::native_batch_norm_backward', 'aten::cudnn_batch_norm_backward')], + # 'batch_norm_3d.py': ['aten::native_batch_norm', 'aten::native_batch_norm_backward'], +} + + +def find_op_time(text, ops): + res = [] + + def transform_to_us(time): + if time.endswith('us'): + return float(time[:-2]) + elif time.endswith('ms'): + return float(time[:-2]) * 1000.0 + elif time.endswith('s'): + return float(time[:-1]) * 1000000.0 + else: + raise Exception("time format not support") + flag = "None" + print(text) + for line in text.split('\n'): + line = line.strip() + if line.startswith('shape:'): + flag = line + for op in ops: + if not isinstance(op, tuple): + op = (op,) + op_base_name = op[0] + for op_alias in op: + if op_alias in line: + items = [] + for item in line.strip().split(' '): + if len(item) > 1: + items.append(item.strip()) + if items[0].strip() == op_alias: + op_time = transform_to_us(items[-2]) + res.append([op_base_name, flag, str(op_time)]) + res_ = ["@@".join(item) for item in res] + res_ = list(set(res_)) + res = [item.split("@@") for item in res_] + res = sorted(res, key=lambda x: x[1]) + res = sorted(res, key=lambda x: x[0]) + return res + + +if __name__ == '__main__': + root_folder = find_pytorch_dir().strip() + perf_suit = os.path.join(root_folder, 'test/microbench/') + import csv + csv_data = [ + ["Operator", "Tag", "Latency(us)"], + ] + for item, ops in OP_LIST.items(): + print(item) + f = os.path.join(perf_suit, item) + result = subprocess.run( + ["python", f], + capture_output=True, + text=True + ) + output = result.stdout + res = find_op_time(output, ops) + csv_data += res + for item in res: + print(item) + with open("check_op_perf.csv", mode="w", newline="", encoding="utf-8") as file: + writer = csv.writer(file) + writer.writerows(csv_data)