|
| 1 | +# Copyright Swiss National Supercomputing Centre (CSCS/ETH Zurich) |
| 2 | +# ReFrame Project Developers. See the top-level LICENSE file for details. |
| 3 | +# |
| 4 | +# SPDX-License-Identifier: BSD-3-Clause |
| 5 | + |
| 6 | +import os |
| 7 | +import reframe as rfm |
| 8 | +import reframe.utility.sanity as sn |
| 9 | +from uenv import uarch |
| 10 | + |
# Reference performance per GPU micro-architecture, keyed by HPL matrix
# size N. The values are used as 'Gflop/s' references by RocHPL.
rochpl_references = {
    'mi200': {
        38400: 2.65e+04,
        192000: 1.49e+05,
        218880: 1.55e+05,
    },
    'mi300': {
        38400: 2.53e+04,
        192000: 1.57e+05,
        218880: 1.62e+05,
    },
}
| 15 | + |
# Per-micro-architecture job geometry. RocHPL reads 'ntasks-per-node' and
# 'cpus-per-task' from the entry matching the current partition's uarch.
slurm_config = {
    'mi200': {'ntasks-per-node': 8, 'cpus-per-task': 16},
    'mi300': {'ntasks-per-node': 4, 'cpus-per-task': 48},
}
| 26 | + |
# Template for the HPL.dat input file, filled in with str.format().
# Placeholders:
#   {count} - number of problem sizes
#   {sizes} - the problem sizes (N) on a single line
#   {p}     - process grid rows (Ps)
#   {q}     - process grid columns (Qs)
# "device out" is 0, so results go to the file named on the line above it
# (HPL.out), which is the file the sanity check parses.
HPLdat = """HPLinpack benchmark input file
Innovative Computing Laboratory, University of Tennessee
HPL.out output file name (if any)
0 device out (6=stdout,7=stderr,file)
{count} # of problems sizes (N)
{sizes} Ns
1 # of NBs
384 NBs
1 PMAP process mapping (0=Row-,1=Column-major)
1 # of process grids (P x Q)
{p} Ps
{q} Qs
16.0 threshold
1 # of panel fact
2 PFACTs (0=left, 1=Crout, 2=Right)
1 # of recursive stopping criterium
8 NBMINs (>= 1)
1 # of panels in recursion
2 NDIVs
1 # of recursive panel fact.
2 RFACTs (0=left, 1=Crout, 2=Right)
1 # of broadcast
6 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
1 # of lookahead depth
1 DEPTHs (>=0)
1 SWAP (0=bin-exch,1=long,2=mix)
64 swapping threshold
1 L1 in (0=transposed,1=no-transposed) form
0 U in (0=transposed,1=no-transposed) form
0 Equilibration (0=no,1=yes)
8 memory alignment in double (> 0)
"""
| 59 | + |
| 60 | + |
class RocHPL(rfm.RegressionTest):
    """Build rocHPL from source and run it on a single node.

    Concrete tests derive from this class and define ``matrix_sizes``, the
    list of problem sizes (N) that is written to the generated ``HPL.dat``
    and checked against the benchmark output.
    """

    descr = 'AMD HPL (rocHPL) test'
    valid_systems = ['+amdgpu +uenv']
    valid_prog_environs = ['+uenv +prgenv +rocm']
    maintainers = ['rasolca', 'SSA']
    sourcesdir = 'scripts'
    build_system = 'CMake'
    # This branch contains fixes for cmake.
    # https://github.com/ROCm/rocHPL/pull/28
    prebuild_cmds = [
        'git clone --depth 1 --branch cmake_hip '
        'https://github.com/rasolca/rocHPL.git'
    ]
    time_limit = '10m'
    build_locally = False

    @run_before('compile')
    def set_build_options(self):
        """Point CMake at the cloned sources and set the configure options."""
        self.build_system.configuredir = 'rocHPL'
        self.build_system.builddir = 'build'
        self.build_system.max_concurrency = 10

        # Build only for the GPU architecture of the current partition.
        gpu_arch = self.current_partition.select_devices('gpu')[0].arch
        self.build_system.config_opts = [
            '-DHPL_VERBOSE_PRINT=ON',
            '-DHPL_PROGRESS_REPORT=ON',
            '-DHPL_DETAILED_TIMING=ON',
            '-DCMAKE_BUILD_TYPE=Release',
            f'-DCMAKE_HIP_ARCHITECTURES="{gpu_arch}"'
        ]

    @run_after('setup')
    def set_num_gpus(self):
        """Record the number of GPUs of the current partition."""
        curr_part = self.current_partition
        self.num_gpus = curr_part.select_devices('gpu')[0].num_devices

    @run_before('run')
    def set_executable(self):
        """Set up the launch command, job geometry, input file and references.

        The test is skipped when the partition's micro-architecture has no
        entry in ``slurm_config`` or no known process grid.
        """
        # Process grid (rows, columns) per micro-architecture.
        process_grids = {'mi200': (2, 4), 'mi300': (2, 2)}

        self.uarch = uarch(self.current_partition)
        self.skip_if(
            self.uarch not in slurm_config or self.uarch not in process_grids,
            f'rocHPL is not configured for uarch {self.uarch!r}'
        )
        prows, pcols = process_grids[self.uarch]

        pre_script = f'./{self.uarch}-wrapper.sh'
        binary = os.path.join(self.build_system.builddir, 'bin', 'rochpl')
        self.executable = f'{pre_script} {binary}'

        self._configure_job()

        # env variables
        self.env_vars['MPICH_GPU_SUPPORT_ENABLED'] = '1'
        self.env_vars['OMP_PROC_BIND'] = 'true'
        # Integer division: OMP_NUM_THREADS must be an integer, and true
        # division would produce a value such as '8.0'.
        self.env_vars['OMP_NUM_THREADS'] = str(
            self.num_cpus_per_task // self.ntasks_per_core
        )

        # executable options
        input_file = self._write_input_file(prows, pcols)
        self.executable_opts += [
            f'-p {prows}',
            f'-q {pcols}',
            f'-P {prows}',
            f'-Q {pcols}',
            f'-i {input_file}'
        ]

        self._set_reference()

    def _configure_job(self):
        """Apply the single-node Slurm geometry and CPU binding."""
        config = slurm_config[self.uarch]
        self.job.options = ['--nodes=1']
        self.num_tasks_per_node = config['ntasks-per-node']
        self.num_tasks = self.num_tasks_per_node
        self.num_cpus_per_task = config['cpus-per-task']
        # NOTE(review): used only as the divisor for OMP_NUM_THREADS;
        # presumably the number of hardware threads per core - confirm.
        self.ntasks_per_core = 2

        if self.uarch == 'mi200':
            # mi200 gets one explicit CPU mask per task.
            self.job.launcher.options = [(
                '--cpu-bind=mask_cpu:'
                'ff00000000000000ff000000000000,'
                'ff00000000000000ff00000000000000,'
                'ff00000000000000ff0000,'
                'ff00000000000000ff000000,'
                'ff00000000000000ff,'
                'ff00000000000000ff00,'
                'ff00000000000000ff00000000,'
                'ff00000000000000ff0000000000')]
        else:
            self.job.launcher.options = ['--cpu-bind=cores']

    def _write_input_file(self, prows, pcols):
        """Write HPL.dat into the stage directory and return its path."""
        input_file = os.path.join(self.stagedir, 'HPL.dat')
        contents = HPLdat.format(
            count=len(self.matrix_sizes),
            sizes=' '.join(str(n) for n in self.matrix_sizes),
            p=prows,
            q=pcols,
        )
        with open(input_file, 'w') as file:
            file.write(contents)

        return input_file

    def _set_reference(self):
        """Set the performance references for the sizes that have one."""
        known = rochpl_references.get(self.uarch)
        if known is None:
            return

        reference = {}
        for n in self.matrix_sizes:
            if n not in known:
                continue

            # Note: Permissive threshold for mi300 as sles15sp5 shows
            # performance drops with large matrices. Should be removed
            # when all the nodes run the sles15sp6 image.
            lower_bound = -0.1
            if self.uarch == 'mi300':
                if n > 200000:
                    lower_bound = -0.90
                elif n > 150000:
                    lower_bound = -0.33

            reference[f'size {n}'] = (known[n], lower_bound, 0.05, 'Gflop/s')

        self.reference = {self.current_partition.fullname: reference}

    @sanity_function
    def assert_results(self):
        """Check that every requested size was run and passed the residual.

        Lines of the following form are extracted from ``HPL.out``:

        WC15R2R8 218880 384 2 2 102.52 6.819e+04
        ||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)= 0.0000524 ...... PASSED
        """
        out_file = os.path.join(self.stagedir, 'HPL.out')

        # The literal 384 is the NBs value set in the HPLdat template.
        regex1 = (r'^WC15R2R8\s+([0-9]+)\s+384\s+[0-9]+\s+[0-9]+\s+[0-9\.]+'
                  r'\s+([0-9\.]+e\+[0-9]+)$')
        regex2 = (r'^\|\|Ax-b\|\|_oo\/\(eps\*\(\|\|A\|\|_oo\*\|\|x\|\|_oo'
                  r'\+\|\|b\|\|_oo\)\*N\)=\s+([\.0-9]+)\s+\.+\s+PASSED$')
        # perf_ holds (matrix size, Gflop/s) pairs, one per result line.
        self.perf_ = sn.extractall(regex1, out_file, tag=(1, 2),
                                   conv=(int, float))
        self.accuracy_ = sn.extractall(regex2, out_file, tag=1, conv=float)

        sanity_patterns = [
            sn.assert_eq(
                sn.len(self.perf_), sn.len(self.matrix_sizes),
                'Number of results do not match with number of runs'
            ),
            sn.assert_eq(
                sn.len(self.accuracy_), sn.len(self.matrix_sizes),
                'Number of PASSED accuracy results do not match with '
                'number of runs'
            )
        ]

        # Results must be reported in the same order as the requested sizes.
        for (perf, n) in sn.zip(self.perf_, self.matrix_sizes):
            sanity_patterns.append(
                sn.assert_eq(perf[0], n, 'Matrix size does not match')
            )

        self.sanity_patterns = sn.all(sanity_patterns)

        return self.sanity_patterns

    @run_before('performance')
    def set_perf_vars(self):
        """Expose one 'size <N>' performance variable per extracted result.

        Relies on ``self.perf_`` having been set by ``assert_results``.
        """
        make_perf = sn.make_performance_function

        self.perf_variables = {
            f'size {perf[0]}': make_perf(sn.getitem(perf, 1), 'Gflop/s')
            for perf in self.perf_
        }
| 206 | + |
| 207 | + |
@rfm.simple_test
class RocHPL_small(RocHPL):
    """rocHPL run with a single problem size, N = 38400."""

    matrix_sizes = [38400]
| 211 | + |
| 212 | + |
@rfm.simple_test
class RocHPL_medium(RocHPL):
    """rocHPL run with a single problem size, N = 192000."""

    tags = {'bencher', 'production', 'uenv'}
    matrix_sizes = [192000]
| 217 | + |
| 218 | + |
@rfm.simple_test
class RocHPL_large(RocHPL):
    """rocHPL run with a single problem size, N = 218880."""

    tags = {'bencher', 'production', 'uenv'}
    matrix_sizes = [218880]
0 commit comments