Skip to content

Commit b960bdb

Browse files
authored
Add RocHPL tests (#458)
1 parent d8a78e5 commit b960bdb

File tree

3 files changed

+238
-0
lines changed

3 files changed

+238
-0
lines changed
Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
# Copyright Swiss National Supercomputing Centre (CSCS/ETH Zurich)
2+
# ReFrame Project Developers. See the top-level LICENSE file for details.
3+
#
4+
# SPDX-License-Identifier: BSD-3-Clause
5+
6+
import os
7+
import reframe as rfm
8+
import reframe.utility.sanity as sn
9+
from uenv import uarch
10+
11+
# Reference performance in Gflop/s, keyed by GPU micro-architecture and then
# by HPL matrix size N. Looked up when the per-size performance references
# of a test are built.
rochpl_references = {
    'mi200': {38400: 2.65e+04, 192000: 1.49e+05, 218880: 1.55e+05},
    'mi300': {38400: 2.53e+04, 192000: 1.57e+05, 218880: 1.62e+05},
}
15+
16+
# Per-architecture Slurm task layout for a single node: how many tasks are
# started per node and how many CPUs each task gets.
slurm_config = {
    'mi200': {
        'ntasks-per-node': 8,
        'cpus-per-task': 16,
    },
    'mi300': {
        'ntasks-per-node': 4,
        'cpus-per-task': 48,
    },
}
26+
27+
# Template of the HPL.dat input file. HPL reads this file positionally (one
# parameter per line, first token(s) on the line), so the template must
# contain exactly these lines and nothing in between. Placeholders:
#   {count} - number of problem sizes
#   {sizes} - space-separated problem sizes (N)
#   {p}/{q} - process grid rows / columns
# The block size (NBs) is fixed to 384.
HPLdat = """HPLinpack benchmark input file
Innovative Computing Laboratory, University of Tennessee
HPL.out output file name (if any)
0 device out (6=stdout,7=stderr,file)
{count} # of problems sizes (N)
{sizes} Ns
1 # of NBs
384 NBs
1 PMAP process mapping (0=Row-,1=Column-major)
1 # of process grids (P x Q)
{p} Ps
{q} Qs
16.0 threshold
1 # of panel fact
2 PFACTs (0=left, 1=Crout, 2=Right)
1 # of recursive stopping criterium
8 NBMINs (>= 1)
1 # of panels in recursion
2 NDIVs
1 # of recursive panel fact.
2 RFACTs (0=left, 1=Crout, 2=Right)
1 # of broadcast
6 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
1 # of lookahead depth
1 DEPTHs (>=0)
1 SWAP (0=bin-exch,1=long,2=mix)
64 swapping threshold
1 L1 in (0=transposed,1=no-transposed) form
0 U in (0=transposed,1=no-transposed) form
0 Equilibration (0=no,1=yes)
8 memory alignment in double (> 0)
"""
59+
60+
61+
class RocHPL(rfm.RegressionTest):
    """Build rocHPL from source and run it on a single node.

    This is a base class: concrete tests derive from it and define
    ``matrix_sizes``, the list of HPL problem sizes (N) to run. The test
    generates an ``HPL.dat`` input file in the stage directory, checks that
    every requested size produced a result and a PASSED residual check, and
    reports one performance variable (Gflop/s) per size.
    """

    descr = 'AMD HPL (rocHPL) test'
    valid_systems = ['+amdgpu +uenv']
    valid_prog_environs = ['+uenv +prgenv +rocm']
    maintainers = ['rasolca', 'SSA']
    sourcesdir = 'scripts'
    build_system = 'CMake'
    # This branch contains fixes for cmake.
    # https://github.com/ROCm/rocHPL/pull/28
    prebuild_cmds = [
        'git clone --depth 1 --branch cmake_hip '
        'https://github.com/rasolca/rocHPL.git'
    ]
    time_limit = '10m'
    build_locally = False

    @run_before('compile')
    def set_build_options(self):
        """Configure the CMake build of the cloned rocHPL sources."""
        self.build_system.configuredir = 'rocHPL'
        self.build_system.builddir = 'build'
        self.build_system.max_concurrency = 10

        # Build only for the GPU architecture of the current partition.
        gpu_arch = self.current_partition.select_devices('gpu')[0].arch
        self.build_system.config_opts = [
            '-DHPL_VERBOSE_PRINT=ON',
            '-DHPL_PROGRESS_REPORT=ON',
            '-DHPL_DETAILED_TIMING=ON',
            '-DCMAKE_BUILD_TYPE=Release',
            f'-DCMAKE_HIP_ARCHITECTURES="{gpu_arch}"'
        ]

    @run_after('setup')
    def set_num_gpus(self):
        """Record the number of GPUs of the current partition."""
        curr_part = self.current_partition
        self.num_gpus = curr_part.select_devices('gpu')[0].num_devices

    @run_before('run')
    def set_executable(self):
        """Set up launch command, job layout, HPL input and references."""
        self.uarch = uarch(self.current_partition)

        # The binary is started through an architecture-specific wrapper
        # script named '<uarch>-wrapper.sh'.
        pre_script = f'./{self.uarch}-wrapper.sh'
        binary = os.path.join(self.build_system.builddir, 'bin', 'rochpl')
        self.executable = f'{pre_script} {binary}'

        # slurm configuration
        config = slurm_config[self.uarch]
        self.job.options = ['--nodes=1']
        self.num_tasks_per_node = config['ntasks-per-node']
        self.num_tasks = self.num_tasks_per_node
        self.num_cpus_per_task = config['cpus-per-task']
        self.ntasks_per_core = 2
        if self.uarch == 'mi200':
            # One explicit CPU mask per task, in task order.
            self.job.launcher.options = [(
                '--cpu-bind=mask_cpu:'
                'ff00000000000000ff000000000000,'
                'ff00000000000000ff00000000000000,'
                'ff00000000000000ff0000,'
                'ff00000000000000ff000000,'
                'ff00000000000000ff,'
                'ff00000000000000ff00,'
                'ff00000000000000ff00000000,'
                'ff00000000000000ff0000000000')]
        else:
            self.job.launcher.options = ['--cpu-bind=cores']

        # env variables
        self.env_vars['MPICH_GPU_SUPPORT_ENABLED'] = '1'
        self.env_vars['OMP_PROC_BIND'] = 'true'
        # Integer division: OMP_NUM_THREADS has to be an integer, while
        # true division would render a float such as '8.0'.
        self.env_vars['OMP_NUM_THREADS'] = str(
            self.num_cpus_per_task // self.ntasks_per_core
        )

        # executable options: process grid (rows x columns); the product
        # equals the number of tasks per node of the architecture.
        if self.uarch == 'mi200':
            prows = 2
            pcols = 4
        elif self.uarch == 'mi300':
            prows = 2
            pcols = 2

        # Generate the HPL input file for the requested problem sizes.
        input_file = os.path.join(self.stagedir, 'HPL.dat')
        hpl_input = HPLdat.format(
            count=len(self.matrix_sizes),
            sizes=' '.join(str(n) for n in self.matrix_sizes),
            p=prows,
            q=pcols,
        )
        with open(input_file, 'w') as file:
            file.write(hpl_input)

        self.executable_opts += [
            f'-p {prows}',
            f'-q {pcols}',
            f'-P {prows}',
            f'-Q {pcols}',
            f'-i {input_file}'
        ]

        # set performance reference
        if self.uarch in rochpl_references:
            reference = {}

            for n in self.matrix_sizes:
                if n in rochpl_references[self.uarch]:
                    # Note: Permissive threshold for mi300 as sles15sp5 shows
                    # performance drops with large matrices. Should be removed
                    # when all the nodes run the sles15sp6 image.
                    lower_bound = -0.1
                    if self.uarch == 'mi300':
                        if n > 200000:
                            lower_bound = -0.90
                        elif n > 150000:
                            lower_bound = -0.33

                    reference[f'size {n}'] = (
                        rochpl_references[self.uarch][n],
                        lower_bound, 0.05, 'Gflop/s'
                    )

            self.reference = {self.current_partition.fullname: reference}

    @sanity_function
    def assert_results(self):
        """Check that every requested problem size ran and passed.

        The two lines extracted from ``HPL.out`` for each run look like:

        WC15R2R8 218880 384 2 2 102.52 6.819e+04
        ||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)= 0.0000524 ...... PASSED

        The extracted (size, Gflop/s) pairs are kept in ``self.perf_`` for
        the performance stage.
        """
        out_file = os.path.join(self.stagedir, 'HPL.out')

        # Result line: captures the matrix size and the Gflop/s figure.
        regex1 = (r'^WC15R2R8\s+([0-9]+)\s+384\s+[0-9]+\s+[0-9]+\s+[0-9\.]+'
                  r'\s+([0-9\.]+e\+[0-9]+)$')
        # Residual line: only matches when the check is reported as PASSED.
        regex2 = (r'^\|\|Ax-b\|\|_oo\/\(eps\*\(\|\|A\|\|_oo\*\|\|x\|\|_oo\+'
                  r'\|\|b\|\|_oo\)\*N\)=\s+([\.0-9]+)\s+\.+\s+PASSED$')
        self.perf_ = sn.extractall(regex1, out_file, tag=(1, 2),
                                   conv=(int, float))
        self.accuracy_ = sn.extractall(regex2, out_file, tag=1, conv=float)

        sanity_patterns = [
            sn.assert_eq(
                sn.len(self.perf_), sn.len(self.matrix_sizes),
                'Number of results do not match with number of runs'
            ),
            sn.assert_eq(
                sn.len(self.accuracy_), sn.len(self.matrix_sizes),
                'Number of PASSED accuracy results do not match with '
                'number of runs'
            )
        ]

        # Results must appear in the same order as the requested sizes.
        for (perf, n) in sn.zip(self.perf_, self.matrix_sizes):
            sanity_patterns.append(
                sn.assert_eq(perf[0], n, 'Matrix size does not match')
            )

        self.sanity_patterns = sn.all(sanity_patterns)

        return self.sanity_patterns

    @run_before('performance')
    def set_perf_vars(self):
        """Expose one 'size <N>' performance variable per extracted result."""
        make_perf = sn.make_performance_function

        self.perf_variables = {}
        for perf in self.perf_:
            self.perf_variables[f'size {perf[0]}'] = make_perf(
                sn.getitem(perf, 1), 'Gflop/s'
            )
207+
208+
@rfm.simple_test
class RocHPL_small(RocHPL):
    """rocHPL run with a single problem size of N=38400."""

    matrix_sizes = [38400]
211+
212+
213+
@rfm.simple_test
class RocHPL_medium(RocHPL):
    """rocHPL run with a single problem size of N=192000."""

    matrix_sizes = [192000]
    tags = {'production', 'uenv', 'bencher'}
217+
218+
219+
@rfm.simple_test
class RocHPL_large(RocHPL):
    """rocHPL run with a single problem size of N=218880."""

    matrix_sizes = [218880]
    tags = {'production', 'uenv', 'bencher'}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#! /usr/bin/env bash

# Per-task launch wrapper (presumably the mi200 variant, given the modulo 8):
# exposes a single GPU to each task and then runs the given command.

# Pick the GPU from the node-local Slurm task id.
export GPUID=$(( SLURM_LOCALID % 8 ))

export ROCR_VISIBLE_DEVICES=$GPUID

# exec replaces this shell with the command, so signals and the exit status
# reach/come from the command directly instead of going through bash.
exec "$@"
8+
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#! /usr/bin/env bash

# Per-task launch wrapper (presumably the mi300 variant, given the modulo 4):
# exposes a single GPU to each task and pins the command's CPUs and memory
# to the NUMA node with the same index as that GPU.

# Pick the GPU from the node-local Slurm task id.
export GPUID=$(( SLURM_LOCALID % 4 ))
export NUMAID=$GPUID

export ROCR_VISIBLE_DEVICES=$GPUID

# exec replaces this shell with numactl, so signals and the exit status
# reach/come from the command directly instead of going through bash.
exec numactl --cpunodebind="$NUMAID" --membind="$NUMAID" "$@"

0 commit comments

Comments
 (0)