From 39282337361f01bf9a14bdee4c5b6fa7a76da068 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Thu, 25 Apr 2024 23:36:13 +0300 Subject: [PATCH 01/10] remove intel gpu pipeline; add cuda support - setting SHARPY_DEVICE implies we are targeting nvidia gpu - IMEXROOT env variable is not used anymore --- src/jit/mlir.cpp | 69 ++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 38 deletions(-) diff --git a/src/jit/mlir.cpp b/src/jit/mlir.cpp index d84186e..1ebec8c 100644 --- a/src/jit/mlir.cpp +++ b/src/jit/mlir.cpp @@ -688,6 +688,8 @@ static const std::string cpu_pipeline = "arith-expand," "memref-expand," "func.func(empty-tensor-to-alloc-tensor)," + "cse," + "canonicalize," "one-shot-bufferize," "canonicalize," "imex-remove-temporaries," @@ -728,53 +730,46 @@ static const std::string gpu_pipeline = "linalg-fuse-elementwise-ops," "arith-expand," "memref-expand," - "arith-bufferize," - "func-bufferize," "func.func(empty-tensor-to-alloc-tensor)," - "func.func(scf-bufferize)," - "func.func(tensor-bufferize)," - "func.func(bufferization-bufferize)," - "func.func(linalg-bufferize)," - "func.func(linalg-detensorize)," - "func.func(tensor-bufferize)," + "func.func(tile-loops{tile-sizes=128 in-regions})," + "func.func(tile-loops{tile-sizes=1 in-regions})," "region-bufferize," "canonicalize," - "func.func(finalizing-bufferize)," + "one-shot-bufferize," + "cse," + "canonicalize," + "scf-forall-to-parallel," + "cse," + "canonicalize," "imex-remove-temporaries," - "func.func(convert-linalg-to-parallel-loops)," - "func.func(scf-parallel-loop-fusion)," - // GPU - "func.func(imex-add-outer-parallel-loop)," + "buffer-deallocation-pipeline," + "func.func(convert-linalg-to-loops)," "func.func(gpu-map-parallel-loops)," - "func.func(convert-parallel-loops-to-gpu)," - // insert-gpu-allocs pass can have client-api = opencl or vulkan args - "func.func(insert-gpu-allocs{in-regions=1})," + "convert-parallel-loops-to-gpu," + "canonicalize," + "cse," + "func.func(insert-gpu-allocs{in-regions=1 host-shared=0})," + "func.func(insert-gpu-copy)," "drop-regions," "canonicalize," - // "normalize-memrefs," - // "gpu-decompose-memrefs," - "func.func(lower-affine)," "gpu-kernel-outlining," + "convert-scf-to-cf," + "convert-cf-to-llvm," "canonicalize," "cse," - // The following set-spirv-* passes can have client-api = opencl or vulkan - // args - "set-spirv-capabilities{client-api=opencl}," - "gpu.module(set-spirv-abi-attrs{client-api=opencl})," - "canonicalize," - "fold-memref-alias-ops," - "imex-convert-gpu-to-spirv{enable-vc-intrinsic=1}," - "spirv.module(spirv-lower-abi-attrs)," - "spirv.module(spirv-update-vce)," - // "func.func(llvm-request-c-wrappers)," - "serialize-spirv," + "gpu.module(strip-debuginfo," + "convert-gpu-to-nvvm)," + "nvvm-attach-target{chip=sm_80 O=3}," + "func.func(gpu-async-region)," "expand-strided-metadata," "lower-affine," - "convert-gpu-to-gpux," + "gpu-to-llvm," "convert-func-to-llvm," "convert-math-to-llvm," - "convert-gpux-to-llvm," "finalize-memref-to-llvm," + "canonicalize," + "cse," + "gpu-module-to-binary{format=fatbin}," "reconcile-unrealized-casts"; const std::string _passes(get_text_env("SHARPY_PASSES")); @@ -831,10 +826,10 @@ JIT::JIT(const std::string &libidtr) _crunnerlib = mlirRoot + "/lib/libmlir_c_runner_utils.so"; _runnerlib = mlirRoot + "/lib/libmlir_runner_utils.so"; if (!std::ifstream(_crunnerlib)) { - throw std::runtime_error("Cannot find libmlir_c_runner_utils.so"); + throw std::runtime_error("Cannot find lib: " + _crunnerlib); } if 
(!std::ifstream(_runnerlib)) { - throw std::runtime_error("Cannot find libmlir_runner_utils.so"); + throw std::runtime_error("Cannot find lib: " + _runnerlib); } if (useGPU()) { @@ -842,11 +837,9 @@ JIT::JIT(const std::string &libidtr) if (!gpuxlibstr.empty()) { _gpulib = std::string(gpuxlibstr); } else { - auto imexRoot = get_text_env("IMEXROOT"); - imexRoot = !imexRoot.empty() ? imexRoot : std::string(CMAKE_IMEX_ROOT); - _gpulib = imexRoot + "/lib/liblevel-zero-runtime.so"; + _gpulib = mlirRoot + "/lib/libmlir_cuda_runtime.so"; if (!std::ifstream(_gpulib)) { - throw std::runtime_error("Cannot find liblevel-zero-runtime.so"); + throw std::runtime_error("Cannot find lib: " + _gpulib); } } _sharedLibPaths = {_crunnerlib.c_str(), _runnerlib.c_str(), From 763ee574101f67c850f209bfc1d4e7ee201137e3 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Fri, 26 Apr 2024 14:58:29 +0300 Subject: [PATCH 02/10] black-scholes: add partial gpu support, verification is skipped --- examples/black_scholes.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/examples/black_scholes.py b/examples/black_scholes.py index ba5f1f7..dfea6c8 100644 --- a/examples/black_scholes.py +++ b/examples/black_scholes.py @@ -246,15 +246,23 @@ def eval(): info(f"Median rate: {perf_rate:.5f} Mopts/s") # verify - call, put = args[-2], args[-1] - expected_call = 16.976097804669887 - expected_put = 0.34645174725098116 - call_value = float(call[0]) - put_value = float(put[0]) - assert numpy.allclose(call_value, expected_call) - assert numpy.allclose(put_value, expected_put) - - info("SUCCESS") + if device: + # FIXME gpu.memcpy to host requires identity layout + # FIXME reduction on gpu + # call = args[-2].to_device() + # put = args[-1].to_device() + pass + else: + call = args[-2] + put = args[-1] + expected_call = 16.976097804669887 + expected_put = 0.34645174725098116 + call_value = float(call[0]) + put_value = float(put[0]) + assert numpy.allclose(call_value, expected_call) + assert numpy.allclose(put_value, expected_put) + info("SUCCESS") + fini() From a9351324af767838b6473750f5259ec123dfa182 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Thu, 8 Aug 2024 10:59:11 +0300 Subject: [PATCH 03/10] wave equation: add partial gpu support, warm up jit cache --- examples/wave_equation.py | 40 +++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/examples/wave_equation.py b/examples/wave_equation.py index defd2c6..13adaa4 100644 --- a/examples/wave_equation.py +++ b/examples/wave_equation.py @@ -126,8 +126,10 @@ def ind_arr(shape, columns=False): T_shape = (nx, ny) U_shape = (nx + 1, ny) V_shape = (nx, ny + 1) + sync() x_t_2d = xmin + ind_arr(T_shape, True) * dx + dx / 2 y_t_2d = ymin + ind_arr(T_shape) * dy + dy / 2 + sync() dofs_T = int(numpy.prod(numpy.asarray(T_shape))) dofs_U = int(numpy.prod(numpy.asarray(U_shape))) @@ -151,6 +153,8 @@ def ind_arr(shape, columns=False): u2 = create_full(U_shape, 0.0, dtype) v2 = create_full(V_shape, 0.0, dtype) + sync() + def exact_elev(t, x_t_2d, y_t_2d, lx, ly): """ Exact solution for elevation field. 
@@ -224,7 +228,7 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2): sync() # initial solution - e[:, :] = exact_elev(0.0, x_t_2d, y_t_2d, lx, ly) + e[:, :] = exact_elev(0.0, x_t_2d, y_t_2d, lx, ly).to_device(device) u[:, :] = create_full(U_shape, 0.0, dtype) v[:, :] = create_full(V_shape, 0.0, dtype) sync() @@ -240,9 +244,22 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2): t = i * dt if t >= next_t_export - 1e-8: - _elev_max = np.max(e, all_axes) - _u_max = np.max(u, all_axes) - _total_v = np.sum(e + h, all_axes) + if device: + # FIXME gpu.memcpy to host requires identity layout + # FIXME reduction on gpu + # e_host = e.to_device() + # u_host = u.to_device() + # h_host = h.to_device() + # _elev_max = np.max(e_host, all_axes) + # _u_max = np.max(u_host, all_axes) + # _total_v = np.sum(e_host + h, all_axes) + _elev_max = 0 + _u_max = 0 + _total_v = 0 + else: + _elev_max = np.max(e, all_axes) + _u_max = np.max(u, all_axes) + _total_v = np.sum(e + h, all_axes) elev_max = float(_elev_max) u_max = float(_u_max) @@ -277,12 +294,19 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2): duration = time_mod.perf_counter() - tic info(f"Duration: {duration:.2f} s") - e_exact = exact_elev(t, x_t_2d, y_t_2d, lx, ly) - err2 = (e_exact - e) * (e_exact - e) * dx * dy / lx / ly - err_L2 = math.sqrt(float(np.sum(err2, all_axes))) + if device: + # FIXME gpu.memcpy to host requires identity layout + # FIXME reduction on gpu + # err2_host = err2.to_device() + # err_L2 = math.sqrt(float(np.sum(err2_host, all_axes))) + err_L2 = 0 + else: + e_exact = exact_elev(t, x_t_2d, y_t_2d, lx, ly) + err2 = (e_exact - e) * (e_exact - e) * dx * dy / lx / ly + err_L2 = math.sqrt(float(np.sum(err2, all_axes))) info(f"L2 error: {err_L2:7.5e}") - if nx == 128 and ny == 128 and not benchmark_mode: + if nx == 128 and ny == 128 and not benchmark_mode and not device: if datatype == "f32": assert numpy.allclose(err_L2, 7.2235471e-03, rtol=1e-4) else: From 7dfaaa3db0408730059897c0755b4210f162ed70 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Thu, 22 Aug 2024 12:51:49 +0300 Subject: [PATCH 04/10] shallow water: add partial gpu support, warm up jit cache --- examples/shallow_water.py | 136 +++++++++++++++++++++----------------- 1 file changed, 77 insertions(+), 59 deletions(-) diff --git a/examples/shallow_water.py b/examples/shallow_water.py index a5798a4..39b23cf 100644 --- a/examples/shallow_water.py +++ b/examples/shallow_water.py @@ -156,7 +156,7 @@ def ind_arr(shape, columns=False): q = create_full(F_shape, 0.0, dtype) # bathymetry - h = create_full(T_shape, 0.0, dtype) + h = create_full(T_shape, 1.0, dtype) # HACK init with 1 hu = create_full(U_shape, 0.0, dtype) hv = create_full(V_shape, 0.0, dtype) @@ -205,13 +205,15 @@ def bathymetry(x_t_2d, y_t_2d, lx, ly): return bath * create_full(T_shape, 1.0, dtype) # set bathymetry - h[:, :] = bathymetry(x_t_2d, y_t_2d, lx, ly) + # h[:, :] = bathymetry(x_t_2d, y_t_2d, lx, ly).to_device(device) # steady state potential energy - pe_offset = 0.5 * g * float(np.sum(h**2.0, all_axes)) / nx / ny + # pe_offset = 0.5 * g * float(np.sum(h**2.0, all_axes)) / nx / ny + pe_offset = 0.5 * g * float(1.0) / nx / ny # compute time step alpha = 0.5 - h_max = float(np.max(h, all_axes)) + # h_max = float(np.max(h, all_axes)) + h_max = float(1.0) c = (g * h_max) ** 0.5 dt = alpha * dx / c dt = t_export / int(math.ceil(t_export / dt)) @@ -328,9 +330,9 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2): u0, v0, e0 = exact_solution( 0, x_t_2d, y_t_2d, x_u_2d, y_u_2d, x_v_2d, y_v_2d ) - e[:, :] = e0 - u[:, :] = u0 - v[:, :] = v0 + 
e[:, :] = e0.to_device(device) + u[:, :] = u0.to_device(device) + v[:, :] = v0.to_device(device) t = 0 i_export = 0 @@ -344,30 +346,41 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2): t = i * dt if t >= next_t_export - 1e-8: - _elev_max = np.max(e, all_axes) - _u_max = np.max(u, all_axes) - _q_max = np.max(q, all_axes) - _total_v = np.sum(e + h, all_axes) - - # potential energy - _pe = 0.5 * g * (e + h) * (e - h) + pe_offset - _total_pe = np.sum(_pe, all_axes) - - # kinetic energy - u2 = u * u - v2 = v * v - u2_at_t = 0.5 * (u2[1:, :] + u2[:-1, :]) - v2_at_t = 0.5 * (v2[:, 1:] + v2[:, :-1]) - _ke = 0.5 * (u2_at_t + v2_at_t) * (e + h) - _total_ke = np.sum(_ke, all_axes) - - total_pe = float(_total_pe) * dx * dy - total_ke = float(_total_ke) * dx * dy - total_e = total_ke + total_pe - elev_max = float(_elev_max) - u_max = float(_u_max) - q_max = float(_q_max) - total_v = float(_total_v) * dx * dy + if device: + # FIXME gpu.memcpy to host requires identity layout + # FIXME reduction on gpu + elev_max = 0 + u_max = 0 + q_max = 0 + diff_e = 0 + diff_v = 0 + total_pe = 0 + total_ke = 0 + else: + _elev_max = np.max(e, all_axes) + _u_max = np.max(u, all_axes) + _q_max = np.max(q, all_axes) + _total_v = np.sum(e + h, all_axes) + + # potential energy + _pe = 0.5 * g * (e + h) * (e - h) + pe_offset + _total_pe = np.sum(_pe, all_axes) + + # kinetic energy + u2 = u * u + v2 = v * v + u2_at_t = 0.5 * (u2[1:, :] + u2[:-1, :]) + v2_at_t = 0.5 * (v2[:, 1:] + v2[:, :-1]) + _ke = 0.5 * (u2_at_t + v2_at_t) * (e + h) + _total_ke = np.sum(_ke, all_axes) + + total_pe = float(_total_pe) * dx * dy + total_ke = float(_total_ke) * dx * dy + total_e = total_ke + total_pe + elev_max = float(_elev_max) + u_max = float(_u_max) + q_max = float(_q_max) + total_v = float(_total_v) * dx * dy if i_export == 0: initial_v = total_v @@ -402,35 +415,40 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2): duration = time_mod.perf_counter() - tic info(f"Duration: {duration:.2f} s") - e_exact = exact_solution(t, x_t_2d, y_t_2d, x_u_2d, y_u_2d, x_v_2d, y_v_2d)[ - 2 - ] - err2 = (e_exact - e) * (e_exact - e) * dx * dy / lx / ly - err_L2 = math.sqrt(float(np.sum(err2, all_axes))) - info(f"L2 error: {err_L2:7.15e}") - - if nx < 128 or ny < 128: - info("Skipping correctness test due to small problem size.") - elif not benchmark_mode: - tolerance_ene = 1e-7 if datatype == "f32" else 1e-9 - assert ( - diff_e < tolerance_ene - ), f"Energy error exceeds tolerance: {diff_e} > {tolerance_ene}" - if nx == 128 and ny == 128: - if datatype == "f32": - assert numpy.allclose( - err_L2, 4.3127859e-05, rtol=1e-5 - ), "L2 error does not match" - else: - assert numpy.allclose( - err_L2, 4.315799035627906e-05 - ), "L2 error does not match" - else: - tolerance_l2 = 1e-4 + if device: + # FIXME gpu.memcpy to host requires identity layout + # FIXME reduction on gpu + pass + else: + e_exact = exact_solution( + t, x_t_2d, y_t_2d, x_u_2d, y_u_2d, x_v_2d, y_v_2d + )[2] + err2 = (e_exact - e) * (e_exact - e) * dx * dy / lx / ly + err_L2 = math.sqrt(float(np.sum(err2, all_axes))) + info(f"L2 error: {err_L2:7.15e}") + + if nx < 128 or ny < 128: + info("Skipping correctness test due to small problem size.") + elif not benchmark_mode: + tolerance_ene = 1e-7 if datatype == "f32" else 1e-9 assert ( - err_L2 < tolerance_l2 - ), f"L2 error exceeds tolerance: {err_L2} > {tolerance_l2}" - info("SUCCESS") + diff_e < tolerance_ene + ), f"Energy error exceeds tolerance: {diff_e} > {tolerance_ene}" + if nx == 128 and ny == 128: + if datatype == "f32": + assert numpy.allclose( + 
err_L2, 4.3127859e-05, rtol=1e-5 + ), "L2 error does not match" + else: + assert numpy.allclose( + err_L2, 4.315799035627906e-05 + ), "L2 error does not match" + else: + tolerance_l2 = 1e-4 + assert ( + err_L2 < tolerance_l2 + ), f"L2 error exceeds tolerance: {err_L2} > {tolerance_l2}" + info("SUCCESS") fini() From 9c9b55b4b132275309a36a70c2d6cbe9a4f63f86 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Wed, 11 Dec 2024 13:53:29 +0200 Subject: [PATCH 05/10] CMakeLists: fix linker --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e58301..9c18ea3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -178,6 +178,7 @@ target_link_libraries(_sharpy PRIVATE ${imex_dialect_libs} ${imex_conversion_libs} IMEXTransforms + MLIRCopyOpInterface IMEXUtil LLVMX86CodeGen LLVMX86AsmParser From ad5c58d1ba26a45970acbd08929e223c07516b5f Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Fri, 13 Dec 2024 13:00:30 +0200 Subject: [PATCH 06/10] getMRType: fix llvm strides must not be zero error --- src/jit/mlir.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/jit/mlir.cpp b/src/jit/mlir.cpp index 1ebec8c..aae8045 100644 --- a/src/jit/mlir.cpp +++ b/src/jit/mlir.cpp @@ -266,6 +266,12 @@ ::mlir::Value DepManager::getDependent(::mlir::OpBuilder &builder, static ::mlir::MemRefType getMRType(size_t ndims, intptr_t offset, intptr_t *sizes, intptr_t *strides, ::mlir::Type elType) { + // sanitize strides + for (size_t i = 0; i < ndims; i++) { + if (strides[i] == 0) { + strides[i] = 1; + } + } auto layout = ::mlir::StridedLayoutAttr::get(elType.getContext(), offset, {strides, ndims}); return ::mlir::MemRefType::get({sizes, ndims}, elType, layout); From 0d110cfc6a9ce06d80d067affcc41e0a2f4ca54c Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Fri, 13 Dec 2024 20:52:47 +0200 Subject: [PATCH 07/10] wave equation: add gpu reductions --- examples/wave_equation.py | 46 +++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/examples/wave_equation.py b/examples/wave_equation.py index 13adaa4..b5070d1 100644 --- a/examples/wave_equation.py +++ b/examples/wave_equation.py @@ -244,22 +244,15 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2): t = i * dt if t >= next_t_export - 1e-8: - if device: - # FIXME gpu.memcpy to host requires identity layout - # FIXME reduction on gpu - # e_host = e.to_device() - # u_host = u.to_device() - # h_host = h.to_device() - # _elev_max = np.max(e_host, all_axes) - # _u_max = np.max(u_host, all_axes) - # _total_v = np.sum(e_host + h, all_axes) - _elev_max = 0 - _u_max = 0 - _total_v = 0 - else: - _elev_max = np.max(e, all_axes) - _u_max = np.max(u, all_axes) - _total_v = np.sum(e + h, all_axes) + sync() + H_tmp = e + h + sync() + _elev_max = np.max(e, all_axes).to_device() + # NOTE max(u) segfaults, shape (n+1, n) too large for tiling + _u_max = np.max(u[1:, :], all_axes).to_device() + _total_v = np.sum(H_tmp, all_axes).to_device() + # NOTE this segfaults + # _total_v = np.sum(e + h, all_axes).to_device() # segfaults elev_max = float(_elev_max) u_max = float(_u_max) @@ -294,16 +287,17 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2): duration = time_mod.perf_counter() - tic info(f"Duration: {duration:.2f} s") - if device: - # FIXME gpu.memcpy to host requires identity layout - # FIXME reduction on gpu - # err2_host = err2.to_device() - # err_L2 = math.sqrt(float(np.sum(err2_host, all_axes))) - err_L2 = 0 - else: - e_exact = exact_elev(t, x_t_2d, y_t_2d, lx, ly) - err2 = (e_exact - e) * (e_exact 
- e) * dx * dy / lx / ly - err_L2 = math.sqrt(float(np.sum(err2, all_axes))) + e_exact = exact_elev(t, x_t_2d, y_t_2d, lx, ly).to_device(device) + err2 = (e_exact - e) * (e_exact - e) * dx * dy / lx / ly + err2sum = np.sum(err2, all_axes).to_device() + sync() + # e_host = e.to_device() + # sync() + # e_exact = exact_elev(t, x_t_2d, y_t_2d, lx, ly).to_device() + # err2 = (e_exact - e_host) * (e_exact - e_host) * dx * dy / lx / ly + # err2sum = np.sum(err2, all_axes) + sync() + err_L2 = math.sqrt(float(err2sum)) info(f"L2 error: {err_L2:7.5e}") if nx == 128 and ny == 128 and not benchmark_mode and not device: From 75c7c19940124d5e74e6519353d1fc6648ba721c Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Mon, 16 Dec 2024 08:24:59 +0200 Subject: [PATCH 08/10] wave equation: initialize coord arrays with int64 --- examples/wave_equation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/wave_equation.py b/examples/wave_equation.py index b5070d1..f94d520 100644 --- a/examples/wave_equation.py +++ b/examples/wave_equation.py @@ -115,10 +115,10 @@ def ind_arr(shape, columns=False): """Construct an (nx, ny) array where each row/col is an arange""" nx, ny = shape if columns: - ind = np.arange(0, nx * ny, 1, dtype=np.int32) % nx + ind = np.arange(0, nx * ny, 1, dtype=np.int64) % nx ind = transpose(np.reshape(ind, (ny, nx))) else: - ind = np.arange(0, nx * ny, 1, dtype=np.int32) % ny + ind = np.arange(0, nx * ny, 1, dtype=np.int64) % ny ind = np.reshape(ind, (nx, ny)) return ind.astype(dtype) From b1a7c3f769de803bfadd6c639b5abff1337b0a80 Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Mon, 16 Dec 2024 08:25:50 +0200 Subject: [PATCH 09/10] shallow water: initialize coord arrays with int64 --- examples/shallow_water.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/shallow_water.py b/examples/shallow_water.py index 39b23cf..fb44715 100644 --- a/examples/shallow_water.py +++ b/examples/shallow_water.py @@ -115,10 +115,10 @@ def ind_arr(shape, columns=False): """Construct an (nx, ny) array where each row/col is an arange""" nx, ny = shape if columns: - ind = np.arange(0, nx * ny, 1, dtype=np.int32) % nx + ind = np.arange(0, nx * ny, 1, dtype=np.int64) % nx ind = transpose(np.reshape(ind, (ny, nx))) else: - ind = np.arange(0, nx * ny, 1, dtype=np.int32) % ny + ind = np.arange(0, nx * ny, 1, dtype=np.int64) % ny ind = np.reshape(ind, (nx, ny)) return ind.astype(dtype) From 9987123381de20095fd7eb40826eea8f4a611aed Mon Sep 17 00:00:00 2001 From: Tuomas Karna Date: Mon, 3 Feb 2025 20:25:01 +0200 Subject: [PATCH 10/10] shallow water: reductions and L2 error comp runs on GPU --- examples/shallow_water.py | 149 ++++++++++++++++++-------------------- 1 file changed, 70 insertions(+), 79 deletions(-) diff --git a/examples/shallow_water.py b/examples/shallow_water.py index fb44715..e7de866 100644 --- a/examples/shallow_water.py +++ b/examples/shallow_water.py @@ -156,7 +156,7 @@ def ind_arr(shape, columns=False): q = create_full(F_shape, 0.0, dtype) # bathymetry - h = create_full(T_shape, 1.0, dtype) # HACK init with 1 + h = create_full(T_shape, 0.0, dtype) hu = create_full(U_shape, 0.0, dtype) hv = create_full(V_shape, 0.0, dtype) @@ -165,7 +165,7 @@ def ind_arr(shape, columns=False): dvdx = create_full(F_shape, 0.0, dtype) # vector invariant form - H_at_f = create_full(F_shape, 0.0, dtype) + H_at_f = create_full(F_shape, 1.0, dtype) # HACK init with 1 # auxiliary variables for RK time integration e1 = create_full(T_shape, 0.0, dtype) @@ -205,15 
+205,14 @@ def bathymetry(x_t_2d, y_t_2d, lx, ly): return bath * create_full(T_shape, 1.0, dtype) # set bathymetry - # h[:, :] = bathymetry(x_t_2d, y_t_2d, lx, ly).to_device(device) + h[:, :] = bathymetry(x_t_2d, y_t_2d, lx, ly).to_device(device) # steady state potential energy - # pe_offset = 0.5 * g * float(np.sum(h**2.0, all_axes)) / nx / ny - pe_offset = 0.5 * g * float(1.0) / nx / ny + h2sum = np.sum(h**2.0, all_axes).to_device() + pe_offset = 0.5 * g * float(np.sum(h2sum, all_axes)) / nx / ny # compute time step alpha = 0.5 - # h_max = float(np.max(h, all_axes)) - h_max = float(1.0) + h_max = float(np.max(h, all_axes).to_device()) c = (g * h_max) ** 0.5 dt = alpha * dx / c dt = t_export / int(math.ceil(t_export / dt)) @@ -253,10 +252,11 @@ def rhs(u, v, e): H_at_f[-1, 1:-1] = 0.5 * (H[-1, 1:] + H[-1, :-1]) H_at_f[1:-1, 0] = 0.5 * (H[1:, 0] + H[:-1, 0]) H_at_f[1:-1, -1] = 0.5 * (H[1:, -1] + H[:-1, -1]) - H_at_f[0, 0] = H[0, 0] - H_at_f[0, -1] = H[0, -1] - H_at_f[-1, 0] = H[-1, 0] - H_at_f[-1, -1] = H[-1, -1] + # NOTE causes gpu.memcpy error, non-identity layout + # H_at_f[0, 0] = H[0, 0] + # H_at_f[0, -1] = H[0, -1] + # H_at_f[-1, 0] = H[-1, 0] + # H_at_f[-1, -1] = H[-1, -1] # potential vorticity dudy[:, 1:-1] = (u[:, 1:] - u[:, :-1]) / dy @@ -346,41 +346,36 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2): t = i * dt if t >= next_t_export - 1e-8: - if device: - # FIXME gpu.memcpy to host requires identity layout - # FIXME reduction on gpu - elev_max = 0 - u_max = 0 - q_max = 0 - diff_e = 0 - diff_v = 0 - total_pe = 0 - total_ke = 0 - else: - _elev_max = np.max(e, all_axes) - _u_max = np.max(u, all_axes) - _q_max = np.max(q, all_axes) - _total_v = np.sum(e + h, all_axes) - - # potential energy - _pe = 0.5 * g * (e + h) * (e - h) + pe_offset - _total_pe = np.sum(_pe, all_axes) - - # kinetic energy - u2 = u * u - v2 = v * v - u2_at_t = 0.5 * (u2[1:, :] + u2[:-1, :]) - v2_at_t = 0.5 * (v2[:, 1:] + v2[:, :-1]) - _ke = 0.5 * (u2_at_t + v2_at_t) * (e + h) - _total_ke = np.sum(_ke, all_axes) - - total_pe = float(_total_pe) * dx * dy - total_ke = float(_total_ke) * dx * dy - total_e = total_ke + total_pe - elev_max = float(_elev_max) - u_max = float(_u_max) - q_max = float(_q_max) - total_v = float(_total_v) * dx * dy + sync() + # NOTE must precompute reduction operands to single field + H_tmp = e + h + # potential energy + _pe = 0.5 * g * (e + h) * (e - h) + pe_offset + # kinetic energy + u2 = u * u + v2 = v * v + u2_at_t = 0.5 * (u2[1:, :] + u2[:-1, :]) + v2_at_t = 0.5 * (v2[:, 1:] + v2[:, :-1]) + _ke = 0.5 * (u2_at_t + v2_at_t) * (e + h) + sync() + _elev_max = np.max(e, all_axes).to_device() + # NOTE max(u) segfaults, shape (n+1, n) too large for tiling + _u_max = np.max(u[1:, :], all_axes).to_device() + _q_max = np.max(q[1:, 1:], all_axes).to_device() + _total_v = np.sum(H_tmp, all_axes).to_device() + _total_pe = np.sum(_pe, all_axes).to_device() + _total_ke = np.sum(_ke, all_axes).to_device() + + total_pe = float(_total_pe) * dx * dy + total_ke = float(_total_ke) * dx * dy + total_e = total_ke + total_pe + elev_max = float(_elev_max) + u_max = float(_u_max) + q_max = float(_q_max) + total_v = float(_total_v) * dx * dy + + diff_e = 0 + diff_v = 0 if i_export == 0: initial_v = total_v @@ -415,40 +410,36 @@ def step(u, v, e, u1, v1, e1, u2, v2, e2): duration = time_mod.perf_counter() - tic info(f"Duration: {duration:.2f} s") - if device: - # FIXME gpu.memcpy to host requires identity layout - # FIXME reduction on gpu - pass - else: - e_exact = exact_solution( - t, x_t_2d, y_t_2d, x_u_2d, 
y_u_2d, x_v_2d, y_v_2d - )[2] - err2 = (e_exact - e) * (e_exact - e) * dx * dy / lx / ly - err_L2 = math.sqrt(float(np.sum(err2, all_axes))) - info(f"L2 error: {err_L2:7.15e}") - - if nx < 128 or ny < 128: - info("Skipping correctness test due to small problem size.") - elif not benchmark_mode: - tolerance_ene = 1e-7 if datatype == "f32" else 1e-9 - assert ( - diff_e < tolerance_ene - ), f"Energy error exceeds tolerance: {diff_e} > {tolerance_ene}" - if nx == 128 and ny == 128: - if datatype == "f32": - assert numpy.allclose( - err_L2, 4.3127859e-05, rtol=1e-5 - ), "L2 error does not match" - else: - assert numpy.allclose( - err_L2, 4.315799035627906e-05 - ), "L2 error does not match" + e_exact = exact_solution(t, x_t_2d, y_t_2d, x_u_2d, y_u_2d, x_v_2d, y_v_2d)[ + 2 + ].to_device(device) + err2 = (e_exact - e) * (e_exact - e) * dx * dy / lx / ly + err2sum = np.sum(err2, all_axes).to_device() + err_L2 = math.sqrt(float(err2sum)) + info(f"L2 error: {err_L2:7.15e}") + + if nx < 128 or ny < 128: + info("Skipping correctness test due to small problem size.") + elif not benchmark_mode: + tolerance_ene = 1e-7 if datatype == "f32" else 1e-9 + assert ( + diff_e < tolerance_ene + ), f"Energy error exceeds tolerance: {diff_e} > {tolerance_ene}" + if nx == 128 and ny == 128: + if datatype == "f32": + assert numpy.allclose( + err_L2, 4.3127859e-05, rtol=1e-5 + ), "L2 error does not match" else: - tolerance_l2 = 1e-4 - assert ( - err_L2 < tolerance_l2 - ), f"L2 error exceeds tolerance: {err_L2} > {tolerance_l2}" - info("SUCCESS") + assert numpy.allclose( + err_L2, 4.315799035627906e-05 + ), "L2 error does not match" + else: + tolerance_l2 = 1e-4 + assert ( + err_L2 < tolerance_l2 + ), f"L2 error exceeds tolerance: {err_L2} > {tolerance_l2}" + info("SUCCESS") fini()
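
Note on the GPU reduction pattern: patches 07 and 10 apply the same workaround for the missing GPU reductions and device-to-host copies. The reduction operand is first materialized into a named temporary (reducing a fused expression such as np.sum(e + h, all_axes) segfaults, per the NOTE in patch 07), sync() is called, the reduction result is copied back with .to_device(), sync() is called again, and only then is the scalar converted with float(). Below is a minimal sketch of that pattern; the import lines and the all_axes definition are assumptions made for illustration, while the reduction steps themselves mirror examples/wave_equation.py and examples/shallow_water.py as changed in this series.

    import sharpy as np          # assumption: the examples alias the sharpy array API as np
    from sharpy import sync      # assumption: sync() is the host/device barrier used in the examples

    all_axes = [0, 1]            # assumption: the examples reduce over both array axes

    def total_volume(e, h, dx, dy):
        # Materialize the reduction operand into a named temporary first;
        # reducing the fused expression np.sum(e + h, all_axes) segfaults (patch 07 NOTE).
        H_tmp = e + h
        sync()
        # Reduce on the GPU, then copy the scalar result back to the host.
        _total_v = np.sum(H_tmp, all_axes).to_device()
        sync()
        return float(_total_v) * dx * dy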