From 39282337361f01bf9a14bdee4c5b6fa7a76da068 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Thu, 25 Apr 2024 23:36:13 +0300 Subject: [PATCH 01/10] remove intel gpu pipeline; add cuda support - setting SHARPY_DEVICE implies we are targeting nvidia gpu - IMEXROOT env variable is not used anymore --- src/jit/mlir.cpp | 69 ++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 38 deletions(-) diff --git a/src/jit/mlir.cpp b/src/jit/mlir.cpp index d84186e..1ebec8c 100644 --- a/src/jit/mlir.cpp +++ b/src/jit/mlir.cpp @@ -688,6 +688,8 @@ static const std::string cpu_pipeline = "arith-expand," "memref-expand," "func.func(empty-tensor-to-alloc-tensor)," + "cse," + "canonicalize," "one-shot-bufferize," "canonicalize," "imex-remove-temporaries," @@ -728,53 +730,46 @@ static const std::string gpu_pipeline = "linalg-fuse-elementwise-ops," "arith-expand," "memref-expand," - "arith-bufferize," - "func-bufferize," "func.func(empty-tensor-to-alloc-tensor)," - "func.func(scf-bufferize)," - "func.func(tensor-bufferize)," - "func.func(bufferization-bufferize)," - "func.func(linalg-bufferize)," - "func.func(linalg-detensorize)," - "func.func(tensor-bufferize)," + "func.func(tile-loops{tile-sizes=128 in-regions})," + "func.func(tile-loops{tile-sizes=1 in-regions})," "region-bufferize," "canonicalize," - "func.func(finalizing-bufferize)," + "one-shot-bufferize," + "cse," + "canonicalize," + "scf-forall-to-parallel," + "cse," + "canonicalize," "imex-remove-temporaries," - "func.func(convert-linalg-to-parallel-loops)," - "func.func(scf-parallel-loop-fusion)," - // GPU - "func.func(imex-add-outer-parallel-loop)," + "buffer-deallocation-pipeline," + "func.func(convert-linalg-to-loops)," "func.func(gpu-map-parallel-loops)," - "func.func(convert-parallel-loops-to-gpu)," - // insert-gpu-allocs pass can have client-api = opencl or vulkan args - "func.func(insert-gpu-allocs{in-regions=1})," + "convert-parallel-loops-to-gpu," + "canonicalize," + "cse," + "func.func(insert-gpu-allocs{in-regions=1 host-shared=0})," + "func.func(insert-gpu-copy)," "drop-regions," "canonicalize," - // "normalize-memrefs," - // "gpu-decompose-memrefs," - "func.func(lower-affine)," "gpu-kernel-outlining," + "convert-scf-to-cf," + "convert-cf-to-llvm," "canonicalize," "cse," - // The following set-spirv-* passes can have client-api = opencl or vulkan - // args - "set-spirv-capabilities{client-api=opencl}," - "gpu.module(set-spirv-abi-attrs{client-api=opencl})," - "canonicalize," - "fold-memref-alias-ops," - "imex-convert-gpu-to-spirv{enable-vc-intrinsic=1}," - "spirv.module(spirv-lower-abi-attrs)," - "spirv.module(spirv-update-vce)," - // "func.func(llvm-request-c-wrappers)," - "serialize-spirv," + "gpu.module(strip-debuginfo," + "convert-gpu-to-nvvm)," + "nvvm-attach-target{chip=sm_80 O=3}," + "func.func(gpu-async-region)," "expand-strided-metadata," "lower-affine," - "convert-gpu-to-gpux," + "gpu-to-llvm," "convert-func-to-llvm," "convert-math-to-llvm," - "convert-gpux-to-llvm," "finalize-memref-to-llvm," + "canonicalize," + "cse," + "gpu-module-to-binary{format=fatbin}," "reconcile-unrealized-casts"; const std::string _passes(get_text_env("SHARPY_PASSES")); @@ -831,10 +826,10 @@ JIT::JIT(const std::string &libidtr) _crunnerlib = mlirRoot + "/lib/libmlir_c_runner_utils.so"; _runnerlib = mlirRoot + "/lib/libmlir_runner_utils.so"; if (!std::ifstream(_crunnerlib)) { - throw std::runtime_error("Cannot find libmlir_c_runner_utils.so"); + throw std::runtime_error("Cannot find lib: " + _crunnerlib); } if 
(!std::ifstream(_runnerlib)) { - throw std::runtime_error("Cannot find libmlir_runner_utils.so"); + throw std::runtime_error("Cannot find lib: " + _runnerlib); } if (useGPU()) { @@ -842,11 +837,9 @@ JIT::JIT(const std::string &libidtr) if (!gpuxlibstr.empty()) { _gpulib = std::string(gpuxlibstr); } else { - auto imexRoot = get_text_env("IMEXROOT"); - imexRoot = !imexRoot.empty() ? imexRoot : std::string(CMAKE_IMEX_ROOT); - _gpulib = imexRoot + "/lib/liblevel-zero-runtime.so"; + _gpulib = mlirRoot + "/lib/libmlir_cuda_runtime.so"; if (!std::ifstream(_gpulib)) { - throw std::runtime_error("Cannot find liblevel-zero-runtime.so"); + throw std::runtime_error("Cannot find lib: " + _gpulib); } } _sharedLibPaths = {_crunnerlib.c_str(), _runnerlib.c_str(), From 763ee574101f67c850f209bfc1d4e7ee201137e3 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Fri, 26 Apr 2024 14:58:29 +0300 Subject: [PATCH 02/10] black-scholes: add partial gpu support, verification is skipped --- examples/black_scholes.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/examples/black_scholes.py b/examples/black_scholes.py index ba5f1f7..dfea6c8 100644 --- a/examples/black_scholes.py +++ b/examples/black_scholes.py @@ -246,15 +246,23 @@ def eval(): info(f"Median rate: {perf_rate:.5f} Mopts/s") # verify - call, put = args[-2], args[-1] - expected_call = 16.976097804669887 - expected_put = 0.34645174725098116 - call_value = float(call[0]) - put_value = float(put[0]) - assert numpy.allclose(call_value, expected_call) - assert numpy.allclose(put_value, expected_put) - - info("SUCCESS") + if device: + # FIXME gpu.memcpy to host requires identity layout + # FIXME reduction on gpu + # call = args[-2].to_device() + # put = args[-1].to_device() + pass + else: + call = args[-2] + put = args[-1] + expected_call = 16.976097804669887 + expected_put = 0.34645174725098116 + call_value = float(call[0]) + put_value = float(put[0]) + assert numpy.allclose(call_value, expected_call) + assert numpy.allclose(put_value, expected_put) + info("SUCCESS") + fini() From a9351324af767838b6473750f5259ec123dfa182 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Thu, 8 Aug 2024 10:59:11 +0300 Subject: [PATCH 03/10] wave equation: add partial gpu support, warm up jit cache --- examples/wave_equation.py | 40 +++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/examples/wave_equation.py b/examples/wave_equation.py index defd2c6..13adaa4 100644 --- a/examples/wave_equation.py +++ b/examples/wave_equation.py @@ -126,8 +126,10 @@ def ind_arr(shape, columns=False): T_shape = (nx, ny) U_shape = (nx + 1, ny) V_shape = (nx, ny + 1) + sync() x_t_2d = xmin + ind_arr(T_shape, True) * dx + dx / 2 y_t_2d = ymin + ind_arr(T_shape) * dy + dy / 2 + sync() dofs_T = int(numpy.prod(numpy.asarray(T_shape))) dofs_U = int(numpy.prod(numpy.asarray(U_shape))) @@ -151,6 +153,8 @@ def ind_arr(shape, columns=False): u2 = create_full(U_shape, 0.0, dtype) v2 = create_full(V_shape, 0.0, dtype) + sync() + def exact_elev(t, x_t_2d, y_t_2d, lx, ly): """ Exact solution for elevation field. 
@@ -224,7 +228,7 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2): sync() # initial solution - e[:, :] = exact_elev(0.0, x_t_2d, y_t_2d, lx, ly) + e[:, :] = exact_elev(0.0, x_t_2d, y_t_2d, lx, ly).to_device(device) u[:, :] = create_full(U_shape, 0.0, dtype) v[:, :] = create_full(V_shape, 0.0, dtype) sync() @@ -240,9 +244,22 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2): t = i * dt if t >= next_t_export - 1e-8: - _elev_max = np.max(e, all_axes) - _u_max = np.max(u, all_axes) - _total_v = np.sum(e + h, all_axes) + if device: + # FIXME gpu.memcpy to host requires identity layout + # FIXME reduction on gpu + # e_host = e.to_device() + # u_host = u.to_device() + # h_host = h.to_device() + # _elev_max = np.max(e_host, all_axes) + # _u_max = np.max(u_host, all_axes) + # _total_v = np.sum(e_host + h, all_axes) + _elev_max = 0 + _u_max = 0 + _total_v = 0 + else: + _elev_max = np.max(e, all_axes) + _u_max = np.max(u, all_axes) + _total_v = np.sum(e + h, all_axes) elev_max = float(_elev_max) u_max = float(_u_max) @@ -277,12 +294,19 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2): duration = time_mod.perf_counter() - tic info(f"Duration: {duration:.2f} s") - e_exact = exact_elev(t, x_t_2d, y_t_2d, lx, ly) - err2 = (e_exact - e) * (e_exact - e) * dx * dy / lx / ly - err_L2 = math.sqrt(float(np.sum(err2, all_axes))) + if device: + # FIXME gpu.memcpy to host requires identity layout + # FIXME reduction on gpu + # err2_host = err2.to_device() + # err_L2 = math.sqrt(float(np.sum(err2_host, all_axes))) + err_L2 = 0 + else: + e_exact = exact_elev(t, x_t_2d, y_t_2d, lx, ly) + err2 = (e_exact - e) * (e_exact - e) * dx * dy / lx / ly + err_L2 = math.sqrt(float(np.sum(err2, all_axes))) info(f"L2 error: {err_L2:7.5e}") - if nx == 128 and ny == 128 and not benchmark_mode: + if nx == 128 and ny == 128 and not benchmark_mode and not device: if datatype == "f32": assert numpy.allclose(err_L2, 7.2235471e-03, rtol=1e-4) else: From 7dfaaa3db0408730059897c0755b4210f162ed70 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Thu, 22 Aug 2024 12:51:49 +0300 Subject: [PATCH 04/10] shallow water: add partial gpu support, warm up jit cache --- examples/shallow_water.py | 136 +++++++++++++++++++++----------------- 1 file changed, 77 insertions(+), 59 deletions(-) diff --git a/examples/shallow_water.py b/examples/shallow_water.py index a5798a4..39b23cf 100644 --- a/examples/shallow_water.py +++ b/examples/shallow_water.py @@ -156,7 +156,7 @@ def ind_arr(shape, columns=False): q = create_full(F_shape, 0.0, dtype) # bathymetry - h = create_full(T_shape, 0.0, dtype) + h = create_full(T_shape, 1.0, dtype) # HACK init with 1 hu = create_full(U_shape, 0.0, dtype) hv = create_full(V_shape, 0.0, dtype) @@ -205,13 +205,15 @@ def bathymetry(x_t_2d, y_t_2d, lx, ly): return bath * create_full(T_shape, 1.0, dtype) # set bathymetry - h[:, :] = bathymetry(x_t_2d, y_t_2d, lx, ly) + # h[:, :] = bathymetry(x_t_2d, y_t_2d, lx, ly).to_device(device) # steady state potential energy - pe_offset = 0.5 * g * float(np.sum(h**2.0, all_axes)) / nx / ny + # pe_offset = 0.5 * g * float(np.sum(h**2.0, all_axes)) / nx / ny + pe_offset = 0.5 * g * float(1.0) / nx / ny # compute time step alpha = 0.5 - h_max = float(np.max(h, all_axes)) + # h_max = float(np.max(h, all_axes)) + h_max = float(1.0) c = (g * h_max) ** 0.5 dt = alpha * dx / c dt = t_export / int(math.ceil(t_export / dt)) @@ -328,9 +330,9 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2): u0, v0, e0 = exact_solution( 0, x_t_2d, y_t_2d, x_u_2d, y_u_2d, x_v_2d, y_v_2d ) - e[:, :] = e0 - u[:, :] = u0 - v[:, :] = v0 + 
e[:, :] = e0.to_device(device) + u[:, :] = u0.to_device(device) + v[:, :] = v0.to_device(device) t = 0 i_export = 0 @@ -344,30 +346,41 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2): t = i * dt if t >= next_t_export - 1e-8: - _elev_max = np.max(e, all_axes) - _u_max = np.max(u, all_axes) - _q_max = np.max(q, all_axes) - _total_v = np.sum(e + h, all_axes) - - # potential energy - _pe = 0.5 * g * (e + h) * (e - h) + pe_offset - _total_pe = np.sum(_pe, all_axes) - - # kinetic energy - u2 = u * u - v2 = v * v - u2_at_t = 0.5 * (u2[1:, :] + u2[:-1, :]) - v2_at_t = 0.5 * (v2[:, 1:] + v2[:, :-1]) - _ke = 0.5 * (u2_at_t + v2_at_t) * (e + h) - _total_ke = np.sum(_ke, all_axes) - - total_pe = float(_total_pe) * dx * dy - total_ke = float(_total_ke) * dx * dy - total_e = total_ke + total_pe - elev_max = float(_elev_max) - u_max = float(_u_max) - q_max = float(_q_max) - total_v = float(_total_v) * dx * dy + if device: + # FIXME gpu.memcpy to host requires identity layout + # FIXME reduction on gpu + elev_max = 0 + u_max = 0 + q_max = 0 + diff_e = 0 + diff_v = 0 + total_pe = 0 + total_ke = 0 + else: + _elev_max = np.max(e, all_axes) + _u_max = np.max(u, all_axes) + _q_max = np.max(q, all_axes) + _total_v = np.sum(e + h, all_axes) + + # potential energy + _pe = 0.5 * g * (e + h) * (e - h) + pe_offset + _total_pe = np.sum(_pe, all_axes) + + # kinetic energy + u2 = u * u + v2 = v * v + u2_at_t = 0.5 * (u2[1:, :] + u2[:-1, :]) + v2_at_t = 0.5 * (v2[:, 1:] + v2[:, :-1]) + _ke = 0.5 * (u2_at_t + v2_at_t) * (e + h) + _total_ke = np.sum(_ke, all_axes) + + total_pe = float(_total_pe) * dx * dy + total_ke = float(_total_ke) * dx * dy + total_e = total_ke + total_pe + elev_max = float(_elev_max) + u_max = float(_u_max) + q_max = float(_q_max) + total_v = float(_total_v) * dx * dy if i_export == 0: initial_v = total_v @@ -402,35 +415,40 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2): duration = time_mod.perf_counter() - tic info(f"Duration: {duration:.2f} s") - e_exact = exact_solution(t, x_t_2d, y_t_2d, x_u_2d, y_u_2d, x_v_2d, y_v_2d)[ - 2 - ] - err2 = (e_exact - e) * (e_exact - e) * dx * dy / lx / ly - err_L2 = math.sqrt(float(np.sum(err2, all_axes))) - info(f"L2 error: {err_L2:7.15e}") - - if nx < 128 or ny < 128: - info("Skipping correctness test due to small problem size.") - elif not benchmark_mode: - tolerance_ene = 1e-7 if datatype == "f32" else 1e-9 - assert ( - diff_e < tolerance_ene - ), f"Energy error exceeds tolerance: {diff_e} > {tolerance_ene}" - if nx == 128 and ny == 128: - if datatype == "f32": - assert numpy.allclose( - err_L2, 4.3127859e-05, rtol=1e-5 - ), "L2 error does not match" - else: - assert numpy.allclose( - err_L2, 4.315799035627906e-05 - ), "L2 error does not match" - else: - tolerance_l2 = 1e-4 + if device: + # FIXME gpu.memcpy to host requires identity layout + # FIXME reduction on gpu + pass + else: + e_exact = exact_solution( + t, x_t_2d, y_t_2d, x_u_2d, y_u_2d, x_v_2d, y_v_2d + )[2] + err2 = (e_exact - e) * (e_exact - e) * dx * dy / lx / ly + err_L2 = math.sqrt(float(np.sum(err2, all_axes))) + info(f"L2 error: {err_L2:7.15e}") + + if nx < 128 or ny < 128: + info("Skipping correctness test due to small problem size.") + elif not benchmark_mode: + tolerance_ene = 1e-7 if datatype == "f32" else 1e-9 assert ( - err_L2 < tolerance_l2 - ), f"L2 error exceeds tolerance: {err_L2} > {tolerance_l2}" - info("SUCCESS") + diff_e < tolerance_ene + ), f"Energy error exceeds tolerance: {diff_e} > {tolerance_ene}" + if nx == 128 and ny == 128: + if datatype == "f32": + assert numpy.allclose( + 
err_L2, 4.3127859e-05, rtol=1e-5 + ), "L2 error does not match" + else: + assert numpy.allclose( + err_L2, 4.315799035627906e-05 + ), "L2 error does not match" + else: + tolerance_l2 = 1e-4 + assert ( + err_L2 < tolerance_l2 + ), f"L2 error exceeds tolerance: {err_L2} > {tolerance_l2}" + info("SUCCESS") fini() From 9c9b55b4b132275309a36a70c2d6cbe9a4f63f86 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Wed, 11 Dec 2024 13:53:29 +0200 Subject: [PATCH 05/10] CMakeLists: fix linker --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e58301..9c18ea3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -178,6 +178,7 @@ target_link_libraries(_sharpy PRIVATE ${imex_dialect_libs} ${imex_conversion_libs} IMEXTransforms + MLIRCopyOpInterface IMEXUtil LLVMX86CodeGen LLVMX86AsmParser From ad5c58d1ba26a45970acbd08929e223c07516b5f Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Fri, 13 Dec 2024 13:00:30 +0200 Subject: [PATCH 06/10] getMRType: fix llvm strides must not be zero error --- src/jit/mlir.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/jit/mlir.cpp b/src/jit/mlir.cpp index 1ebec8c..aae8045 100644 --- a/src/jit/mlir.cpp +++ b/src/jit/mlir.cpp @@ -266,6 +266,12 @@ ::mlir::Value DepManager::getDependent(::mlir::OpBuilder &builder, static ::mlir::MemRefType getMRType(size_t ndims, intptr_t offset, intptr_t *sizes, intptr_t *strides, ::mlir::Type elType) { + // sanitize strides + for (size_t i = 0; i < ndims; i++) { + if (strides[i] == 0) { + strides[i] = 1; + } + } auto layout = ::mlir::StridedLayoutAttr::get(elType.getContext(), offset, {strides, ndims}); return ::mlir::MemRefType::get({sizes, ndims}, elType, layout); From 0d110cfc6a9ce06d80d067affcc41e0a2f4ca54c Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Fri, 13 Dec 2024 20:52:47 +0200 Subject: [PATCH 07/10] wave equation: add gpu reductions --- examples/wave_equation.py | 46 +++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/examples/wave_equation.py b/examples/wave_equation.py index 13adaa4..b5070d1 100644 --- a/examples/wave_equation.py +++ b/examples/wave_equation.py @@ -244,22 +244,15 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2): t = i * dt if t >= next_t_export - 1e-8: - if device: - # FIXME gpu.memcpy to host requires identity layout - # FIXME reduction on gpu - # e_host = e.to_device() - # u_host = u.to_device() - # h_host = h.to_device() - # _elev_max = np.max(e_host, all_axes) - # _u_max = np.max(u_host, all_axes) - # _total_v = np.sum(e_host + h, all_axes) - _elev_max = 0 - _u_max = 0 - _total_v = 0 - else: - _elev_max = np.max(e, all_axes) - _u_max = np.max(u, all_axes) - _total_v = np.sum(e + h, all_axes) + sync() + H_tmp = e + h + sync() + _elev_max = np.max(e, all_axes).to_device() + # NOTE max(u) segfaults, shape (n+1, n) too large for tiling + _u_max = np.max(u[1:, :], all_axes).to_device() + _total_v = np.sum(H_tmp, all_axes).to_device() + # NOTE this segfaults + # _total_v = np.sum(e + h, all_axes).to_device() # segfaults elev_max = float(_elev_max) u_max = float(_u_max) @@ -294,16 +287,17 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2): duration = time_mod.perf_counter() - tic info(f"Duration: {duration:.2f} s") - if device: - # FIXME gpu.memcpy to host requires identity layout - # FIXME reduction on gpu - # err2_host = err2.to_device() - # err_L2 = math.sqrt(float(np.sum(err2_host, all_axes))) - err_L2 = 0 - else: - e_exact = exact_elev(t, x_t_2d, y_t_2d, lx, ly) - err2 = (e_exact - e) * (e_exact 
- e) * dx * dy / lx / ly - err_L2 = math.sqrt(float(np.sum(err2, all_axes))) + e_exact = exact_elev(t, x_t_2d, y_t_2d, lx, ly).to_device(device) + err2 = (e_exact - e) * (e_exact - e) * dx * dy / lx / ly + err2sum = np.sum(err2, all_axes).to_device() + sync() + # e_host = e.to_device() + # sync() + # e_exact = exact_elev(t, x_t_2d, y_t_2d, lx, ly).to_device() + # err2 = (e_exact - e_host) * (e_exact - e_host) * dx * dy / lx / ly + # err2sum = np.sum(err2, all_axes) + sync() + err_L2 = math.sqrt(float(err2sum)) info(f"L2 error: {err_L2:7.5e}") if nx == 128 and ny == 128 and not benchmark_mode and not device: From 75c7c19940124d5e74e6519353d1fc6648ba721c Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Mon, 16 Dec 2024 08:24:59 +0200 Subject: [PATCH 08/10] wave equation: initialize coord arrays with int64 --- examples/wave_equation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/wave_equation.py b/examples/wave_equation.py index b5070d1..f94d520 100644 --- a/examples/wave_equation.py +++ b/examples/wave_equation.py @@ -115,10 +115,10 @@ def ind_arr(shape, columns=False): """Construct an (nx, ny) array where each row/col is an arange""" nx, ny = shape if columns: - ind = np.arange(0, nx * ny, 1, dtype=np.int32) % nx + ind = np.arange(0, nx * ny, 1, dtype=np.int64) % nx ind = transpose(np.reshape(ind, (ny, nx))) else: - ind = np.arange(0, nx * ny, 1, dtype=np.int32) % ny + ind = np.arange(0, nx * ny, 1, dtype=np.int64) % ny ind = np.reshape(ind, (nx, ny)) return ind.astype(dtype) From b1a7c3f769de803bfadd6c639b5abff1337b0a80 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Mon, 16 Dec 2024 08:25:50 +0200 Subject: [PATCH 09/10] shallow water: initialize coord arrays with int64 --- examples/shallow_water.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/shallow_water.py b/examples/shallow_water.py index 39b23cf..fb44715 100644 --- a/examples/shallow_water.py +++ b/examples/shallow_water.py @@ -115,10 +115,10 @@ def ind_arr(shape, columns=False): """Construct an (nx, ny) array where each row/col is an arange""" nx, ny = shape if columns: - ind = np.arange(0, nx * ny, 1, dtype=np.int32) % nx + ind = np.arange(0, nx * ny, 1, dtype=np.int64) % nx ind = transpose(np.reshape(ind, (ny, nx))) else: - ind = np.arange(0, nx * ny, 1, dtype=np.int32) % ny + ind = np.arange(0, nx * ny, 1, dtype=np.int64) % ny ind = np.reshape(ind, (nx, ny)) return ind.astype(dtype) From 9987123381de20095fd7eb40826eea8f4a611aed Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Mon, 3 Feb 2025 20:25:01 +0200 Subject: [PATCH 10/10] shallow water: reductions and L2 error comp runs on GPU --- examples/shallow_water.py | 149 ++++++++++++++++++-------------------- 1 file changed, 70 insertions(+), 79 deletions(-) diff --git a/examples/shallow_water.py b/examples/shallow_water.py index fb44715..e7de866 100644 --- a/examples/shallow_water.py +++ b/examples/shallow_water.py @@ -156,7 +156,7 @@ def ind_arr(shape, columns=False): q = create_full(F_shape, 0.0, dtype) # bathymetry - h = create_full(T_shape, 1.0, dtype) # HACK init with 1 + h = create_full(T_shape, 0.0, dtype) hu = create_full(U_shape, 0.0, dtype) hv = create_full(V_shape, 0.0, dtype) @@ -165,7 +165,7 @@ def ind_arr(shape, columns=False): dvdx = create_full(F_shape, 0.0, dtype) # vector invariant form - H_at_f = create_full(F_shape, 0.0, dtype) + H_at_f = create_full(F_shape, 1.0, dtype) # HACK init with 1 # auxiliary variables for RK time integration e1 = create_full(T_shape, 0.0, dtype) @@ -205,15 
+205,14 @@ def bathymetry(x_t_2d, y_t_2d, lx, ly): return bath * create_full(T_shape, 1.0, dtype) # set bathymetry - # h[:, :] = bathymetry(x_t_2d, y_t_2d, lx, ly).to_device(device) + h[:, :] = bathymetry(x_t_2d, y_t_2d, lx, ly).to_device(device) # steady state potential energy - # pe_offset = 0.5 * g * float(np.sum(h**2.0, all_axes)) / nx / ny - pe_offset = 0.5 * g * float(1.0) / nx / ny + h2sum = np.sum(h**2.0, all_axes).to_device() + pe_offset = 0.5 * g * float(np.sum(h2sum, all_axes)) / nx / ny # compute time step alpha = 0.5 - # h_max = float(np.max(h, all_axes)) - h_max = float(1.0) + h_max = float(np.max(h, all_axes).to_device()) c = (g * h_max) ** 0.5 dt = alpha * dx / c dt = t_export / int(math.ceil(t_export / dt)) @@ -253,10 +252,11 @@ def rhs(u, v, e): H_at_f[-1, 1:-1] = 0.5 * (H[-1, 1:] + H[-1, :-1]) H_at_f[1:-1, 0] = 0.5 * (H[1:, 0] + H[:-1, 0]) H_at_f[1:-1, -1] = 0.5 * (H[1:, -1] + H[:-1, -1]) - H_at_f[0, 0] = H[0, 0] - H_at_f[0, -1] = H[0, -1] - H_at_f[-1, 0] = H[-1, 0] - H_at_f[-1, -1] = H[-1, -1] + # NOTE causes gpu.memcpy error, non-identity layout + # H_at_f[0, 0] = H[0, 0] + # H_at_f[0, -1] = H[0, -1] + # H_at_f[-1, 0] = H[-1, 0] + # H_at_f[-1, -1] = H[-1, -1] # potential vorticity dudy[:, 1:-1] = (u[:, 1:] - u[:, :-1]) / dy @@ -346,41 +346,36 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2): t = i * dt if t >= next_t_export - 1e-8: - if device: - # FIXME gpu.memcpy to host requires identity layout - # FIXME reduction on gpu - elev_max = 0 - u_max = 0 - q_max = 0 - diff_e = 0 - diff_v = 0 - total_pe = 0 - total_ke = 0 - else: - _elev_max = np.max(e, all_axes) - _u_max = np.max(u, all_axes) - _q_max = np.max(q, all_axes) - _total_v = np.sum(e + h, all_axes) - - # potential energy - _pe = 0.5 * g * (e + h) * (e - h) + pe_offset - _total_pe = np.sum(_pe, all_axes) - - # kinetic energy - u2 = u * u - v2 = v * v - u2_at_t = 0.5 * (u2[1:, :] + u2[:-1, :]) - v2_at_t = 0.5 * (v2[:, 1:] + v2[:, :-1]) - _ke = 0.5 * (u2_at_t + v2_at_t) * (e + h) - _total_ke = np.sum(_ke, all_axes) - - total_pe = float(_total_pe) * dx * dy - total_ke = float(_total_ke) * dx * dy - total_e = total_ke + total_pe - elev_max = float(_elev_max) - u_max = float(_u_max) - q_max = float(_q_max) - total_v = float(_total_v) * dx * dy + sync() + # NOTE must precompute reduction operands to single field + H_tmp = e + h + # potential energy + _pe = 0.5 * g * (e + h) * (e - h) + pe_offset + # kinetic energy + u2 = u * u + v2 = v * v + u2_at_t = 0.5 * (u2[1:, :] + u2[:-1, :]) + v2_at_t = 0.5 * (v2[:, 1:] + v2[:, :-1]) + _ke = 0.5 * (u2_at_t + v2_at_t) * (e + h) + sync() + _elev_max = np.max(e, all_axes).to_device() + # NOTE max(u) segfaults, shape (n+1, n) too large for tiling + _u_max = np.max(u[1:, :], all_axes).to_device() + _q_max = np.max(q[1:, 1:], all_axes).to_device() + _total_v = np.sum(H_tmp, all_axes).to_device() + _total_pe = np.sum(_pe, all_axes).to_device() + _total_ke = np.sum(_ke, all_axes).to_device() + + total_pe = float(_total_pe) * dx * dy + total_ke = float(_total_ke) * dx * dy + total_e = total_ke + total_pe + elev_max = float(_elev_max) + u_max = float(_u_max) + q_max = float(_q_max) + total_v = float(_total_v) * dx * dy + + diff_e = 0 + diff_v = 0 if i_export == 0: initial_v = total_v @@ -415,40 +410,36 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2): duration = time_mod.perf_counter() - tic info(f"Duration: {duration:.2f} s") - if device: - # FIXME gpu.memcpy to host requires identity layout - # FIXME reduction on gpu - pass - else: - e_exact = exact_solution( - t, x_t_2d, y_t_2d, x_u_2d, 
y_u_2d, x_v_2d, y_v_2d - )[2] - err2 = (e_exact - e) * (e_exact - e) * dx * dy / lx / ly - err_L2 = math.sqrt(float(np.sum(err2, all_axes))) - info(f"L2 error: {err_L2:7.15e}") - - if nx < 128 or ny < 128: - info("Skipping correctness test due to small problem size.") - elif not benchmark_mode: - tolerance_ene = 1e-7 if datatype == "f32" else 1e-9 - assert ( - diff_e < tolerance_ene - ), f"Energy error exceeds tolerance: {diff_e} > {tolerance_ene}" - if nx == 128 and ny == 128: - if datatype == "f32": - assert numpy.allclose( - err_L2, 4.3127859e-05, rtol=1e-5 - ), "L2 error does not match" - else: - assert numpy.allclose( - err_L2, 4.315799035627906e-05 - ), "L2 error does not match" + e_exact = exact_solution(t, x_t_2d, y_t_2d, x_u_2d, y_u_2d, x_v_2d, y_v_2d)[ + 2 + ].to_device(device) + err2 = (e_exact - e) * (e_exact - e) * dx * dy / lx / ly + err2sum = np.sum(err2, all_axes).to_device() + err_L2 = math.sqrt(float(err2sum)) + info(f"L2 error: {err_L2:7.15e}") + + if nx < 128 or ny < 128: + info("Skipping correctness test due to small problem size.") + elif not benchmark_mode: + tolerance_ene = 1e-7 if datatype == "f32" else 1e-9 + assert ( + diff_e < tolerance_ene + ), f"Energy error exceeds tolerance: {diff_e} > {tolerance_ene}" + if nx == 128 and ny == 128: + if datatype == "f32": + assert numpy.allclose( + err_L2, 4.3127859e-05, rtol=1e-5 + ), "L2 error does not match" else: - tolerance_l2 = 1e-4 - assert ( - err_L2 < tolerance_l2 - ), f"L2 error exceeds tolerance: {err_L2} > {tolerance_l2}" - info("SUCCESS") + assert numpy.allclose( + err_L2, 4.315799035627906e-05 + ), "L2 error does not match" + else: + tolerance_l2 = 1e-4 + assert ( + err_L2 < tolerance_l2 + ), f"L2 error exceeds tolerance: {err_L2} > {tolerance_l2}" + info("SUCCESS") fini()
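
Note on the GPU reduction pattern: patches 07 and 10 apply the same workaround for the missing GPU reductions and device-to-host copies. The reduction operand is first materialized into a named temporary (reducing a fused expression such as np.sum(e + h, all_axes) segfaults, per the NOTE in patch 07), sync() is called, the reduction result is copied back with .to_device(), sync() is called again, and only then is the scalar converted with float(). Below is a minimal sketch of that pattern; the import lines and the all_axes definition are assumptions made for illustration, while the reduction steps themselves mirror examples/wave_equation.py and examples/shallow_water.py as changed in this series.

    import sharpy as np          # assumption: the examples alias the sharpy array API as np
    from sharpy import sync      # assumption: sync() is the host/device barrier used in the examples

    all_axes = [0, 1]            # assumption: the examples reduce over both array axes

    def total_volume(e, h, dx, dy):
        # Materialize the reduction operand into a named temporary first;
        # reducing the fused expression np.sum(e + h, all_axes) segfaults (patch 07 NOTE).
        H_tmp = e + h
        sync()
        # Reduce on the GPU, then copy the scalar result back to the host.
        _total_v = np.sum(H_tmp, all_axes).to_device()
        sync()
        return float(_total_v) * dx * dy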