diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
new file mode 100644
index 000000000..e848bfeac
--- /dev/null
+++ b/.buildkite/pipeline.yml
@@ -0,0 +1,58 @@
+steps:
+  - label: "CUDA"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1"
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    command: |
+      julia --color=yes --project=test -e 'using Pkg; Pkg.add("CUDA"); Pkg.develop(path="."); Pkg.instantiate()'
+      julia --color=yes --project=test -e 'include("test/runtests.jl")'
+    env:
+      TRIXIPARTICLES_TEST: cuda
+    timeout_in_minutes: 60
+
+  - label: "AMDGPU"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1"
+    agents:
+      queue: "juliagpu"
+      rocm: "*"
+    command: |
+      julia --color=yes --project=test -e 'using Pkg; Pkg.add("AMDGPU"); Pkg.develop(path="."); Pkg.instantiate()'
+      julia --color=yes --project=test -e 'include("test/runtests.jl")'
+    env:
+      TRIXIPARTICLES_TEST: amdgpu
+    timeout_in_minutes: 60
+
+  - label: "Metal"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1"
+    agents:
+      queue: "juliaecosystem"
+      os: "macos"
+      arch: "aarch64"
+    command: |
+      julia --color=yes --project=test -e 'using Pkg; Pkg.add("Metal"); Pkg.develop(path="."); Pkg.instantiate()'
+      julia --color=yes --project=test -e 'include("test/runtests.jl")'
+    env:
+      TRIXIPARTICLES_TEST: metal
+    timeout_in_minutes: 60
+
+  # Doesn't work. Fails with segfault. See https://github.com/trixi-framework/TrixiParticles.jl/issues/484.
+  # - label: "oneAPI"
+  #   plugins:
+  #     - JuliaCI/julia#v1:
+  #         version: "1"
+  #   agents:
+  #     queue: "juliagpu"
+  #     intel: "*"
+  #   command: |
+  #     julia --color=yes --project=test -e 'using Pkg; Pkg.add("oneAPI"); Pkg.develop(path="."); Pkg.instantiate()'
+  #     julia --color=yes --project=test -e 'include("test/runtests.jl")'
+  #   env:
+  #     TRIXIPARTICLES_TEST: oneapi
+  #   timeout_in_minutes: 60
diff --git a/README.md b/README.md
index 5bd88fdb0..ca2a5aad1 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,7 @@ It offers intuitive configuration, robust pre- and post-processing, and vendor-a
 - Particle sampling of complex geometries from `.stl` and `.asc` files.
 - Output formats:
   - VTK
+- Support for GPUs by Nvidia, AMD and Apple (experimental)
 
 ## Examples
 We provide several example simulation setups in the `examples` folder (which can be accessed from Julia via `examples_dir()`).
diff --git a/examples/fluid/dam_break_2d_gpu.jl b/examples/fluid/dam_break_2d_gpu.jl
index 89050e364..d37bcfcbe 100644
--- a/examples/fluid/dam_break_2d_gpu.jl
+++ b/examples/fluid/dam_break_2d_gpu.jl
@@ -9,7 +9,7 @@ using TrixiParticles
 # Load setup from dam break example
 trixi_include(@__MODULE__,
               joinpath(examples_dir(), "fluid", "dam_break_2d.jl"),
-              sol=nothing)
+              sol=nothing, ode=nothing)
 
 # Define a GPU-compatible neighborhood search
 min_corner = minimum(tank.boundary.coordinates, dims=2)
@@ -23,4 +23,7 @@ trixi_include(@__MODULE__,
               neighborhood_search=neighborhood_search,
               fluid_particle_spacing=fluid_particle_spacing,
               tspan=tspan,
+              density_diffusion=density_diffusion,
+              boundary_layers=boundary_layers, spacing_ratio=spacing_ratio,
+              boundary_model=boundary_model,
               data_type=nothing)
diff --git a/examples/fluid/dam_break_3d.jl b/examples/fluid/dam_break_3d.jl
index ad01dde42..cc5996eed 100644
--- a/examples/fluid/dam_break_3d.jl
+++ b/examples/fluid/dam_break_3d.jl
@@ -57,7 +57,7 @@ boundary_system = BoundarySPHSystem(tank.boundary, boundary_model)
 # ==========================================================================================
 # ==== Simulation
 semi = Semidiscretization(fluid_system, boundary_system)
-ode = semidiscretize(semi, tspan)
+ode = semidiscretize(semi, tspan, data_type=nothing)
 
 info_callback = InfoCallback(interval=10)
 saving_callback = SolutionSavingCallback(dt=0.02, prefix="")
diff --git a/examples/fluid/hydrostatic_water_column_2d.jl b/examples/fluid/hydrostatic_water_column_2d.jl
index 86d392c5b..1574acb02 100644
--- a/examples/fluid/hydrostatic_water_column_2d.jl
+++ b/examples/fluid/hydrostatic_water_column_2d.jl
@@ -62,7 +62,7 @@ boundary_system = BoundarySPHSystem(tank.boundary, boundary_model, movement=noth
 # ==========================================================================================
 # ==== Simulation
 semi = Semidiscretization(fluid_system, boundary_system)
-ode = semidiscretize(semi, tspan)
+ode = semidiscretize(semi, tspan, data_type=nothing)
 
 info_callback = InfoCallback(interval=50)
 saving_callback = SolutionSavingCallback(dt=0.02, prefix="")
diff --git a/examples/fluid/periodic_channel_2d.jl b/examples/fluid/periodic_channel_2d.jl
index 3743e843d..e9a3e4421 100644
--- a/examples/fluid/periodic_channel_2d.jl
+++ b/examples/fluid/periodic_channel_2d.jl
@@ -60,7 +60,7 @@ periodic_box = PeriodicBox(min_corner=[0.0, -0.25], max_corner=[1.0, 0.75])
 neighborhood_search = GridNeighborhoodSearch{2}(; periodic_box)
 
 semi = Semidiscretization(fluid_system, boundary_system; neighborhood_search)
-ode = semidiscretize(semi, tspan)
+ode = semidiscretize(semi, tspan, data_type=nothing)
 
 info_callback = InfoCallback(interval=100)
 saving_callback = SolutionSavingCallback(dt=0.02, prefix="")
diff --git a/examples/fluid/pipe_flow_2d.jl b/examples/fluid/pipe_flow_2d.jl
index 0d1e8d189..4113b90da 100644
--- a/examples/fluid/pipe_flow_2d.jl
+++ b/examples/fluid/pipe_flow_2d.jl
@@ -125,7 +125,7 @@ boundary_system = BoundarySPHSystem(pipe.boundary, boundary_model)
 semi = Semidiscretization(fluid_system, open_boundary_in, open_boundary_out,
                           boundary_system)
 
-ode = semidiscretize(semi, tspan)
+ode = semidiscretize(semi, tspan, data_type=nothing)
 
 info_callback = InfoCallback(interval=100)
 saving_callback = SolutionSavingCallback(dt=0.02, prefix="")
diff --git a/examples/fsi/dam_break_gate_2d.jl b/examples/fsi/dam_break_gate_2d.jl
index 7efc6cf02..c9e8ba5f3 100644
--- a/examples/fsi/dam_break_gate_2d.jl
+++ b/examples/fsi/dam_break_gate_2d.jl
@@ -4,6 +4,9 @@
 # "Study of a complex fluid-structure dam-breaking benchmark problem using a multi-phase SPH method with APR".
 # In: Engineering Analysis with Boundary Elements 104 (2019), pages 240-258.
 # https://doi.org/10.1016/j.enganabound.2019.03.033
+#
+# Use a higher resolution and see the comments below regarding plate thickness
+# to reproduce the results from the paper.
 
 using TrixiParticles
 using OrdinaryDiffEq
@@ -14,7 +17,7 @@ using OrdinaryDiffEq
 # since "larger" particles don't fit through the slightly opened gate. Lower fluid
 # resolutions thereforce cause a later and more violent fluid impact against the gate.
 fluid_particle_spacing = 0.02
-n_particles_x = 5
+n_particles_x = 4
 
 # Change spacing ratio to 3 and boundary layers to 1 when using Monaghan-Kajtar boundary model
 boundary_layers = 3
@@ -54,13 +57,15 @@ is_moving(t) = t < 0.1
 
 gate_movement = BoundaryMovement(movement_function, is_moving)
 
-# Elastic plate/beam
+# Elastic plate/beam.
+# The paper is using a thickness of 0.004, which only works properly when a similar fluid
+# resolution is used. Increase resolution and change to 0.004 to reproduce the results.
 length_beam = 0.09
-thickness = 0.004
+thickness = 0.004 * 10
 solid_density = 1161.54
 
 # Young's modulus and Poisson ratio
-E = 3.5e6
+E = 3.5e6 / 10
 nu = 0.45
 
 # The structure starts at the position of the first particle and ends
@@ -123,24 +128,11 @@ solid_smoothing_kernel = WendlandC2Kernel{2}()
 hydrodynamic_densites = fluid_density * ones(size(solid.density))
 hydrodynamic_masses = hydrodynamic_densites * solid_particle_spacing^2
 
-k_solid = gravity * initial_fluid_size[2]
-beta_solid = fluid_particle_spacing / solid_particle_spacing
-boundary_model_solid = BoundaryModelMonaghanKajtar(k_solid, beta_solid,
-                                                   solid_particle_spacing,
-                                                   hydrodynamic_masses)
-
-# `BoundaryModelDummyParticles` usually produces better results, since Monaghan-Kajtar BCs
-# tend to introduce a non-physical gap between fluid and boundary.
-# However, `BoundaryModelDummyParticles` can only be used when the plate thickness is
-# at least two fluid particle spacings, so that the compact support is fully sampled,
-# or fluid particles can penetrate the solid.
-# For higher fluid resolutions, uncomment the code below for better results.
-#
-# boundary_model_solid = BoundaryModelDummyParticles(hydrodynamic_densites,
-#                                                    hydrodynamic_masses,
-#                                                    state_equation=state_equation,
-#                                                    AdamiPressureExtrapolation(),
-#                                                    smoothing_kernel, smoothing_length)
+boundary_model_solid = BoundaryModelDummyParticles(hydrodynamic_densites,
+                                                   hydrodynamic_masses,
+                                                   state_equation=state_equation,
+                                                   AdamiPressureExtrapolation(),
+                                                   smoothing_kernel, smoothing_length)
 
 solid_system = TotalLagrangianSPHSystem(solid,
                                         solid_smoothing_kernel, solid_smoothing_length,
@@ -152,7 +144,7 @@ solid_system = TotalLagrangianSPHSystem(solid,
 # ==== Simulation
 semi = Semidiscretization(fluid_system, boundary_system_tank,
                           boundary_system_gate, solid_system)
-ode = semidiscretize(semi, tspan)
+ode = semidiscretize(semi, tspan, data_type=nothing)
 
 info_callback = InfoCallback(interval=100)
 saving_callback = SolutionSavingCallback(dt=0.02, prefix="")
diff --git a/examples/solid/oscillating_beam_2d.jl b/examples/solid/oscillating_beam_2d.jl
index c23ed8252..b3c515dbf 100644
--- a/examples/solid/oscillating_beam_2d.jl
+++ b/examples/solid/oscillating_beam_2d.jl
@@ -56,7 +56,7 @@ solid_system = TotalLagrangianSPHSystem(solid, smoothing_kernel, smoothing_lengt
 # ==== Simulation
 semi = Semidiscretization(solid_system,
                           neighborhood_search=PrecomputedNeighborhoodSearch{2}())
-ode = semidiscretize(semi, tspan)
+ode = semidiscretize(semi, tspan, data_type=nothing)
 
 info_callback = InfoCallback(interval=100)
 
diff --git a/src/general/gpu.jl b/src/general/gpu.jl
index bef795099..af253c941 100644
--- a/src/general/gpu.jl
+++ b/src/general/gpu.jl
@@ -10,9 +10,11 @@
 Adapt.@adapt_structure Semidiscretization
 Adapt.@adapt_structure WeaklyCompressibleSPHSystem
 Adapt.@adapt_structure DensityDiffusionAntuono
+Adapt.@adapt_structure EntropicallyDampedSPHSystem
 Adapt.@adapt_structure BoundarySPHSystem
 Adapt.@adapt_structure BoundaryModelDummyParticles
 Adapt.@adapt_structure BoundaryModelMonaghanKajtar
+Adapt.@adapt_structure BoundaryMovement
 Adapt.@adapt_structure TotalLagrangianSPHSystem
 
 # The initial conditions are only used for initialization, which happens before `adapt`ing
diff --git a/src/general/neighborhood_search.jl b/src/general/neighborhood_search.jl
index fc58a1d55..82dc8774f 100644
--- a/src/general/neighborhood_search.jl
+++ b/src/general/neighborhood_search.jl
@@ -15,6 +15,8 @@ function PointNeighbors.foreach_point_neighbor(f, system::GPUSystem, neighbor_sy
                                                neighborhood_search;
                                                points=eachparticle(system),
                                                parallel=true)
+    @assert parallel != false
+
     # For `GPUSystem`s, explicitly pass the backend, so a `GPUSystem` with a CPU
     # backend will actually launch the KernelAbstractions.jl kernels on the CPU.
     foreach_point_neighbor(f, system_coords, neighbor_coords, neighborhood_search;
diff --git a/src/general/smoothing_kernels.jl b/src/general/smoothing_kernels.jl
index 862e58ba1..8f3346203 100644
--- a/src/general/smoothing_kernels.jl
+++ b/src/general/smoothing_kernels.jl
@@ -257,7 +257,7 @@ end
     return result
 end
 
-@inline compact_support(::SchoenbergQuarticSplineKernel, h) = 2.5 * h
+@inline compact_support(::SchoenbergQuarticSplineKernel, h) = 5 // 2 * h
 
 @inline normalization_factor(::SchoenbergQuarticSplineKernel{1}, h) = 1 / 24h
 # `1199 * pi` is always `Float64`. `pi * h^2 * 1199` preserves the type of `h`.
diff --git a/src/general/system.jl b/src/general/system.jl
index 48d556eba..011b398e3 100644
--- a/src/general/system.jl
+++ b/src/general/system.jl
@@ -3,7 +3,7 @@
 abstract type System{NDIMS, IC} end
 
 # When using KernelAbstractions.jl, the initial condition has been replaced by `nothing`
-GPUSystem = System{NDIMS, Nothing} where {NDIMS}
+const GPUSystem = System{<:Any, Nothing}
 
 abstract type FluidSystem{NDIMS, IC} <: System{NDIMS, IC} end
 timer_name(::FluidSystem) = "fluid"
@@ -26,7 +26,7 @@ end
 initialize!(system, neighborhood_search) = system
 
 @inline Base.ndims(::System{NDIMS}) where {NDIMS} = NDIMS
-@inline Base.eltype(system::System) = eltype(system.initial_condition)
+@inline Base.eltype(system::System) = error("eltype not implemented for system $system")
 
 # Number of integrated variables in the first component of the ODE system (coordinates)
 @inline u_nvariables(system) = ndims(system)
diff --git a/src/preprocessing/particle_packing/system.jl b/src/preprocessing/particle_packing/system.jl
index efdd4cdcc..a5b9db600 100644
--- a/src/preprocessing/particle_packing/system.jl
+++ b/src/preprocessing/particle_packing/system.jl
@@ -126,6 +126,10 @@ function Base.show(io::IO, ::MIME"text/plain", system::ParticlePackingSystem)
     end
 end
 
+@inline function Base.eltype(::ParticlePackingSystem{<:Any, ELTYPE}) where {ELTYPE}
+    return ELTYPE
+end
+
 @inline function v_nvariables(system::ParticlePackingSystem)
     return ndims(system) * 2
 end
diff --git a/src/schemes/boundary/dummy_particles/dummy_particles.jl b/src/schemes/boundary/dummy_particles/dummy_particles.jl
index 4cc784206..7fc0b6cd3 100644
--- a/src/schemes/boundary/dummy_particles/dummy_particles.jl
+++ b/src/schemes/boundary/dummy_particles/dummy_particles.jl
@@ -369,27 +369,38 @@ function compute_pressure!(boundary_model,
         neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system)
 
         # This is an optimization for simulations with large and complex boundaries.
-        # Especially, in 3D simulations with large and/or complex structures outside
+        # Especially in 3D simulations with large and/or complex structures outside
         # of areas with permanent flow.
-        # Note: The version iterating neighbors first is not thread parallelizable.
+        # Note: The version iterating neighbors first is not thread-parallelizable
+        #       and thus not GPU-compatible.
         # The factor is based on the achievable speed-up of the thread parallelizable version.
-        if nparticles(system) >
-           ceil(Int, Threads.nthreads() / 2) * nparticles(neighbor_system)
-            nhs = get_neighborhood_search(neighbor_system, system, semi)
-
-            # Loop over fluid particles and then the neighboring boundary particles to extrapolate fluid pressure to the boundaries
-            boundary_pressure_extrapolation_neighbor!(boundary_model, system,
-                                                      neighbor_system,
-                                                      system_coords, neighbor_coords, v,
-                                                      v_neighbor_system, nhs)
-        else
+        # Use the parallel version if the number of boundary particles is not much larger
+        # than the number of fluid particles.
+        n_boundary_particles = nparticles(system)
+        n_fluid_particles = nparticles(neighbor_system)
+        speedup = ceil(Int, Threads.nthreads() / 2)
+        parallelize = system isa GPUSystem ||
+                      n_boundary_particles < speedup * n_fluid_particles
+        if parallelize
             nhs = get_neighborhood_search(system, neighbor_system, semi)
 
-            # Loop over boundary particles and then the neighboring fluid particles to extrapolate fluid pressure to the boundaries
+            # Loop over boundary particles and then the neighboring fluid particles
+            # to extrapolate fluid pressure to the boundaries.
             boundary_pressure_extrapolation!(boundary_model, system,
                                              neighbor_system,
                                              system_coords, neighbor_coords, v,
                                              v_neighbor_system, nhs)
+        else
+            nhs = get_neighborhood_search(neighbor_system, system, semi)
+
+            # Loop over fluid particles and then the neighboring boundary particles
+            # to extrapolate fluid pressure to the boundaries.
+            # Note that this needs to be serial, as we are writing into the same
+            # pressure entry from different loop iterations.
+            boundary_pressure_extrapolation_neighbor!(boundary_model, system,
+                                                      neighbor_system,
+                                                      system_coords, neighbor_coords, v,
+                                                      v_neighbor_system, nhs)
         end
 
         @threaded system for particle in eachparticle(system)
@@ -472,7 +483,7 @@ end
     (; pressure, cache, viscosity, density_calculator) = boundary_model
     (; pressure_offset) = density_calculator
 
-    # Loop over all pairs of particles and neighbors within the kernel cutoff.
+    # Loop over all pairs of particles and neighbors within the kernel cutoff
     foreach_point_neighbor(system, neighbor_system, system_coords, neighbor_coords,
                            neighborhood_search;
                            points=eachparticle(system)) do particle, neighbor,
diff --git a/src/schemes/boundary/monaghan_kajtar/monaghan_kajtar.jl b/src/schemes/boundary/monaghan_kajtar/monaghan_kajtar.jl
index 2674cdb30..45a19a2aa 100644
--- a/src/schemes/boundary/monaghan_kajtar/monaghan_kajtar.jl
+++ b/src/schemes/boundary/monaghan_kajtar/monaghan_kajtar.jl
@@ -59,7 +59,7 @@ end
     # In order to avoid this, we clip the force at a "large" value, large enough to prevent
     # penetration when a reasonable `K` is used, but small enough to not cause instabilites
     # or super small time steps.
-    distance_from_singularity = max(0.01 * boundary_particle_spacing,
+    distance_from_singularity = max(boundary_particle_spacing / 100,
                                     distance - boundary_particle_spacing)
 
     return K / beta^(ndims(particle_system) - 1) * pos_diff /
@@ -72,11 +72,11 @@ end
 
     # TODO The neighborhood search fluid->boundary should use this search distance
     if q >= 2
-        return 0.0
+        return zero(eltype(r))
     end
 
     # (Monaghan, Kajtar, 2009, Section 4): The kernel should be normalized to 1.77 for q=0
-    return 1.77 / 32 * (1 + 5 / 2 * q + 2 * q^2) * (2 - q)^5
+    return (177 // 100) // 32 * (1 + 5 // 2 * q + 2 * q^2) * (2 - q)^5
 end
 
 @inline function particle_density(v, model::BoundaryModelMonaghanKajtar, system, particle)
diff --git a/src/schemes/boundary/open_boundary/system.jl b/src/schemes/boundary/open_boundary/system.jl
index 5e7dad2a9..8317b8bf3 100644
--- a/src/schemes/boundary/open_boundary/system.jl
+++ b/src/schemes/boundary/open_boundary/system.jl
@@ -170,6 +170,10 @@ function Base.show(io::IO, ::MIME"text/plain", system::OpenBoundarySPHSystem)
     end
 end
 
+@inline function Base.eltype(::OpenBoundarySPHSystem{<:Any, <:Any, <:Any, ELTYPE}) where {ELTYPE}
+    return ELTYPE
+end
+
 function reset_callback_flag!(system::OpenBoundarySPHSystem)
     system.update_callback_used[] = false
 
diff --git a/src/schemes/boundary/system.jl b/src/schemes/boundary/system.jl
index 2c9f364a2..adc42c1a0 100644
--- a/src/schemes/boundary/system.jl
+++ b/src/schemes/boundary/system.jl
@@ -130,27 +130,28 @@ is_moving(t) = t < 1.5
 movement = BoundaryMovement(movement_function, is_moving)
 
 # output
-BoundaryMovement{typeof(movement_function), typeof(is_moving)}(movement_function, is_moving, Int64[])
+BoundaryMovement{typeof(movement_function), typeof(is_moving), Vector{Int64}}(movement_function, is_moving, Int64[])
 ```
 """
-struct BoundaryMovement{MF, IM}
+struct BoundaryMovement{MF, IM, MP}
     movement_function :: MF
     is_moving         :: IM
-    moving_particles  :: Vector{Int}
+    moving_particles  :: MP # Vector{Int}
+end
 
-    function BoundaryMovement(movement_function, is_moving; moving_particles=nothing)
-        if !(movement_function(0.0) isa SVector)
-            @warn "Return value of `movement_function` is not of type `SVector`. " *
-                  "Returning regular `Vector`s causes allocations and significant performance overhead."
-        end
+# The default constructor needs to be accessible for Adapt.jl to work with this struct.
+# See the comments in general/gpu.jl for more details.
+function BoundaryMovement(movement_function, is_moving; moving_particles=nothing)
+    if !(movement_function(0.0) isa SVector)
+        @warn "Return value of `movement_function` is not of type `SVector`. " *
+              "Returning regular `Vector`s causes allocations and significant performance overhead."
+    end
 
-        # Default value is an empty vector, which will be resized in the `BoundarySPHSystem`
-        # constructor to move all particles.
-        moving_particles = isnothing(moving_particles) ? [] : vec(moving_particles)
+    # Default value is an empty vector, which will be resized in the `BoundarySPHSystem`
+    # constructor to move all particles.
+    moving_particles = isnothing(moving_particles) ? Int[] : vec(moving_particles)
 
-        return new{typeof(movement_function),
-                   typeof(is_moving)}(movement_function, is_moving, moving_particles)
-    end
+    return BoundaryMovement(movement_function, is_moving, moving_particles)
 end
 
 create_cache_boundary(::Nothing, initial_condition) = (;)
diff --git a/src/schemes/fluid/entropically_damped_sph/rhs.jl b/src/schemes/fluid/entropically_damped_sph/rhs.jl
index 5668ce950..5b7a306dc 100644
--- a/src/schemes/fluid/entropically_damped_sph/rhs.jl
+++ b/src/schemes/fluid/entropically_damped_sph/rhs.jl
@@ -104,7 +104,7 @@ end
     eta_tilde = 2 * eta_a * eta_b / (eta_a + eta_b)
 
     # TODO For variable smoothing length use average smoothing length
-    tmp = eta_tilde / (distance^2 + 0.01 * smoothing_length^2)
+    tmp = eta_tilde / (distance^2 + smoothing_length^2 / 100)
 
     # This formulation was introduced by Hu and Adams (2006). https://doi.org/10.1016/j.jcp.2005.09.001
     # They argued that the formulation is more flexible because of the possibility to formulate
diff --git a/src/schemes/fluid/entropically_damped_sph/system.jl b/src/schemes/fluid/entropically_damped_sph/system.jl
index ca59b745f..b8c2ae876 100644
--- a/src/schemes/fluid/entropically_damped_sph/system.jl
+++ b/src/schemes/fluid/entropically_damped_sph/system.jl
@@ -69,74 +69,78 @@ struct EntropicallyDampedSPHSystem{NDIMS, ELTYPE <: Real, IC, M, DC, K, V, TV,
     surface_normal_method             :: SRFN
     buffer                            :: B
     cache                             :: C
+end
 
-    function EntropicallyDampedSPHSystem(initial_condition, smoothing_kernel,
-                                         smoothing_length, sound_speed;
-                                         pressure_acceleration=inter_particle_averaged_pressure,
-                                         density_calculator=SummationDensity(),
-                                         transport_velocity=nothing,
-                                         alpha=0.5, viscosity=nothing,
-                                         acceleration=ntuple(_ -> 0.0,
-                                                             ndims(smoothing_kernel)),
-                                         source_terms=nothing, surface_tension=nothing,
-                                         surface_normal_method=nothing, buffer_size=nothing,
-                                         reference_particle_spacing=0.0)
-        buffer = isnothing(buffer_size) ? nothing :
-                 SystemBuffer(nparticles(initial_condition), buffer_size)
-
-        initial_condition = allocate_buffer(initial_condition, buffer)
-
-        NDIMS = ndims(initial_condition)
-        ELTYPE = eltype(initial_condition)
-
-        mass = copy(initial_condition.mass)
-        n_particles = length(initial_condition.mass)
-
-        if ndims(smoothing_kernel) != NDIMS
-            throw(ArgumentError("smoothing kernel dimensionality must be $NDIMS for a $(NDIMS)D problem"))
-        end
-
-        acceleration_ = SVector(acceleration...)
-        if length(acceleration_) != NDIMS
-            throw(ArgumentError("`acceleration` must be of length $NDIMS for a $(NDIMS)D problem"))
-        end
+# The default constructor needs to be accessible for Adapt.jl to work with this struct.
+# See the comments in general/gpu.jl for more details.
+function EntropicallyDampedSPHSystem(initial_condition, smoothing_kernel,
+                                     smoothing_length, sound_speed;
+                                     pressure_acceleration=inter_particle_averaged_pressure,
+                                     density_calculator=SummationDensity(),
+                                     transport_velocity=nothing,
+                                     alpha=0.5, viscosity=nothing,
+                                     acceleration=ntuple(_ -> 0.0,
+                                                         ndims(smoothing_kernel)),
+                                     source_terms=nothing, surface_tension=nothing,
+                                     surface_normal_method=nothing, buffer_size=nothing,
+                                     reference_particle_spacing=0.0)
+    buffer = isnothing(buffer_size) ? nothing :
+             SystemBuffer(nparticles(initial_condition), buffer_size)
+
+    initial_condition = allocate_buffer(initial_condition, buffer)
+
+    NDIMS = ndims(initial_condition)
+    ELTYPE = eltype(initial_condition)
+
+    mass = copy(initial_condition.mass)
+    n_particles = length(initial_condition.mass)
+
+    if ndims(smoothing_kernel) != NDIMS
+        throw(ArgumentError("smoothing kernel dimensionality must be $NDIMS for a $(NDIMS)D problem"))
+    end
 
-        if surface_tension !== nothing && surface_normal_method === nothing
-            surface_normal_method = ColorfieldSurfaceNormal()
-        end
+    acceleration_ = SVector(acceleration...)
+    if length(acceleration_) != NDIMS
+        throw(ArgumentError("`acceleration` must be of length $NDIMS for a $(NDIMS)D problem"))
+    end
 
-        if surface_normal_method !== nothing && reference_particle_spacing < eps()
-            throw(ArgumentError("`reference_particle_spacing` must be set to a positive value when using `ColorfieldSurfaceNormal` or a surface tension model"))
-        end
+    if surface_tension !== nothing && surface_normal_method === nothing
+        surface_normal_method = ColorfieldSurfaceNormal()
+    end
 
-        pressure_acceleration = choose_pressure_acceleration_formulation(pressure_acceleration,
-                                                                         density_calculator,
-                                                                         NDIMS, ELTYPE,
-                                                                         nothing)
-
-        nu_edac = (alpha * smoothing_length * sound_speed) / 8
-
-        cache = create_cache_density(initial_condition, density_calculator)
-        cache = (;
-                 create_cache_edac(initial_condition, transport_velocity)...,
-                 create_cache_surface_normal(surface_normal_method, ELTYPE, NDIMS,
-                                             n_particles)...,
-                 create_cache_surface_tension(surface_tension, ELTYPE, NDIMS,
-                                              n_particles)...,
-                 cache...)
-
-        new{NDIMS, ELTYPE, typeof(initial_condition), typeof(mass),
-            typeof(density_calculator), typeof(smoothing_kernel), typeof(viscosity),
-            typeof(transport_velocity), typeof(pressure_acceleration), typeof(source_terms),
-            typeof(surface_tension), typeof(surface_normal_method),
-            typeof(buffer), typeof(cache)}(initial_condition, mass, density_calculator,
-                                           smoothing_kernel, smoothing_length,
-                                           sound_speed, viscosity, nu_edac,
-                                           acceleration_, nothing, pressure_acceleration,
-                                           transport_velocity, source_terms,
-                                           surface_tension, surface_normal_method, buffer,
-                                           cache)
+    if surface_normal_method !== nothing && reference_particle_spacing < eps()
+        throw(ArgumentError("`reference_particle_spacing` must be set to a positive value when using `ColorfieldSurfaceNormal` or a surface tension model"))
     end
+
+    pressure_acceleration = choose_pressure_acceleration_formulation(pressure_acceleration,
+                                                                     density_calculator,
+                                                                     NDIMS, ELTYPE,
+                                                                     nothing)
+
+    nu_edac = (alpha * smoothing_length * sound_speed) / 8
+
+    cache = create_cache_density(initial_condition, density_calculator)
+    cache = (;
+             create_cache_edac(initial_condition, transport_velocity)...,
+             create_cache_surface_normal(surface_normal_method, ELTYPE, NDIMS,
+                                         n_particles)...,
+             create_cache_surface_tension(surface_tension, ELTYPE, NDIMS,
+                                          n_particles)...,
+             cache...)
+
+    EntropicallyDampedSPHSystem{NDIMS, ELTYPE, typeof(initial_condition), typeof(mass),
+                                typeof(density_calculator), typeof(smoothing_kernel),
+                                typeof(viscosity), typeof(transport_velocity),
+                                typeof(pressure_acceleration), typeof(source_terms),
+                                typeof(surface_tension), typeof(surface_normal_method),
+                                typeof(buffer),
+                                typeof(cache)}(initial_condition, mass, density_calculator,
+                                               smoothing_kernel, smoothing_length,
+                                               sound_speed, viscosity, nu_edac,
+                                               acceleration_, nothing,
+                                               pressure_acceleration, transport_velocity,
+                                               source_terms, surface_tension,
+                                               surface_normal_method, buffer, cache)
 end
 
 function Base.show(io::IO, system::EntropicallyDampedSPHSystem)
@@ -189,6 +193,10 @@ function create_cache_edac(initial_condition, ::TransportVelocityAdami)
     return (; pressure_average, neighbor_counter, update_callback_used)
 end
 
+@inline function Base.eltype(::EntropicallyDampedSPHSystem{<:Any, ELTYPE}) where {ELTYPE}
+    return ELTYPE
+end
+
 @inline function v_nvariables(system::EntropicallyDampedSPHSystem)
     return v_nvariables(system, system.density_calculator)
 end
@@ -289,18 +297,16 @@ function update_average_pressure!(system, ::TransportVelocityAdami, v_ode, u_ode
 end
 
 function write_v0!(v0, system::EntropicallyDampedSPHSystem, ::SummationDensity)
-    for particle in eachparticle(system)
-        v0[end, particle] = system.initial_condition.pressure[particle]
-    end
+    # Note that `.=` is very slightly faster, but not GPU-compatible
+    v0[end, :] = system.initial_condition.pressure
 
     return v0
 end
 
 function write_v0!(v0, system::EntropicallyDampedSPHSystem, ::ContinuityDensity)
-    for particle in eachparticle(system)
-        v0[end - 1, particle] = system.initial_condition.density[particle]
-        v0[end, particle] = system.initial_condition.pressure[particle]
-    end
+    # Note that `.=` is very slightly faster, but not GPU-compatible
+    v0[end - 1, :] = system.initial_condition.density
+    v0[end, :] = system.initial_condition.pressure
 
     return v0
 end
diff --git a/src/schemes/fluid/transport_velocity.jl b/src/schemes/fluid/transport_velocity.jl
index 70007b328..4e709fd17 100644
--- a/src/schemes/fluid/transport_velocity.jl
+++ b/src/schemes/fluid/transport_velocity.jl
@@ -79,7 +79,7 @@ end
 @inline function momentum_convection(system, neighbor_system, ::Nothing,
                                      v_particle_system, v_neighbor_system, rho_a, rho_b,
                                      m_a, m_b, particle, neighbor, grad_kernel)
-    return SVector(ntuple(_ -> 0.0, Val(ndims(system))))
+    return zero(grad_kernel)
 end
 
 @inline function momentum_convection(system, neighbor_system, ::TransportVelocityAdami,
@@ -98,7 +98,7 @@ end
     A_a = rho_a * momentum_velocity_a * (advection_velocity_a - momentum_velocity_a)'
     A_b = rho_b * momentum_velocity_b * (advection_velocity_b - momentum_velocity_b)'
 
-    return volume_term * (0.5 * (A_a + A_b)) * grad_kernel
+    return volume_term * ((A_a + A_b) / 2) * grad_kernel
 end
 
 @inline transport_velocity!(dv, system, rho_a, rho_b, m_a, m_b, grad_kernel, particle) = dv
diff --git a/src/schemes/fluid/weakly_compressible_sph/state_equations.jl b/src/schemes/fluid/weakly_compressible_sph/state_equations.jl
index a0f9ffdb9..2afe44a3f 100644
--- a/src/schemes/fluid/weakly_compressible_sph/state_equations.jl
+++ b/src/schemes/fluid/weakly_compressible_sph/state_equations.jl
@@ -36,7 +36,7 @@ function (state_equation::StateEquationCole)(density)
 
     # This is determined statically and has therefore no overhead
     if clip_negative_pressure(state_equation)
-        return max(0.0, pressure)
+        return max(0, pressure)
     end
 
     return pressure
@@ -86,7 +86,7 @@ function (state_equation::StateEquationIdealGas)(density)
 
     # This is determined statically and has therefore no overhead
     if clip_negative_pressure(state_equation)
-        return max(0.0, pressure)
+        return max(0, pressure)
     end
 
     return pressure
diff --git a/src/schemes/fluid/weakly_compressible_sph/system.jl b/src/schemes/fluid/weakly_compressible_sph/system.jl
index 8f30aedc9..6cec79457 100644
--- a/src/schemes/fluid/weakly_compressible_sph/system.jl
+++ b/src/schemes/fluid/weakly_compressible_sph/system.jl
@@ -80,11 +80,11 @@ function WeaklyCompressibleSPHSystem(initial_condition,
                                      pressure_acceleration=nothing,
                                      buffer_size=nothing,
                                      viscosity=nothing, density_diffusion=nothing,
-                                     acceleration=ntuple(_ -> 0.0,
+                                     acceleration=ntuple(_ -> zero(eltype(initial_condition)),
                                                          ndims(smoothing_kernel)),
                                      correction=nothing, source_terms=nothing,
                                      surface_tension=nothing, surface_normal_method=nothing,
-                                     reference_particle_spacing=0.0)
+                                     reference_particle_spacing=0)
     buffer = isnothing(buffer_size) ? nothing :
              SystemBuffer(nparticles(initial_condition), buffer_size)
 
@@ -217,6 +217,10 @@ function Base.show(io::IO, ::MIME"text/plain", system::WeaklyCompressibleSPHSyst
     end
 end
 
+@inline function Base.eltype(::WeaklyCompressibleSPHSystem{<:Any, ELTYPE}) where {ELTYPE}
+    return ELTYPE
+end
+
 @inline function v_nvariables(system::WeaklyCompressibleSPHSystem)
     return v_nvariables(system, system.density_calculator)
 end
diff --git a/src/schemes/solid/discrete_element_method/system.jl b/src/schemes/solid/discrete_element_method/system.jl
index 937460cd0..11281221f 100644
--- a/src/schemes/solid/discrete_element_method/system.jl
+++ b/src/schemes/solid/discrete_element_method/system.jl
@@ -93,6 +93,10 @@ function Base.show(io::IO, ::MIME"text/plain", system::DEMSystem)
     end
 end
 
+@inline function Base.eltype(::DEMSystem{<:Any, ELTYPE}) where {ELTYPE}
+    return ELTYPE
+end
+
 timer_name(::DEMSystem) = "solid"
 
 function TrixiParticles.write_u0!(u0, system::DEMSystem)
diff --git a/src/schemes/solid/total_lagrangian_sph/system.jl b/src/schemes/solid/total_lagrangian_sph/system.jl
index 3edbe300b..e756f6ab5 100644
--- a/src/schemes/solid/total_lagrangian_sph/system.jl
+++ b/src/schemes/solid/total_lagrangian_sph/system.jl
@@ -156,6 +156,10 @@ function Base.show(io::IO, ::MIME"text/plain", system::TotalLagrangianSPHSystem)
     end
 end
 
+@inline function Base.eltype(::TotalLagrangianSPHSystem{<:Any, <:Any, ELTYPE}) where {ELTYPE}
+    return ELTYPE
+end
+
 @inline function v_nvariables(system::TotalLagrangianSPHSystem)
     return ndims(system)
 end
diff --git a/src/setups/extrude_geometry.jl b/src/setups/extrude_geometry.jl
index c7fb3fa44..498266ecd 100644
--- a/src/setups/extrude_geometry.jl
+++ b/src/setups/extrude_geometry.jl
@@ -124,7 +124,7 @@ function sample_plane(geometry::AbstractMatrix, particle_spacing; tlsph)
         # Extruding a 2D shape results in a 3D shape
 
         # When `tlsph=true`, particles will be placed on the x-y plane
-        coords = vcat(geometry, fill(tlsph ? 0.0 : 0.5particle_spacing, size(geometry, 2))')
+        coords = vcat(geometry, fill(tlsph ? 0 : particle_spacing / 2, size(geometry, 2))')
 
         # TODO: 2D shapes not only in x-y plane but in any user-defined plane
         return coords, particle_spacing
@@ -139,7 +139,7 @@ function sample_plane(shape::InitialCondition, particle_spacing; tlsph)
 
         # When `tlsph=true`, particles will be placed on the x-y plane
         coords = vcat(shape.coordinates,
-                      fill(tlsph ? 0.0 : 0.5particle_spacing, size(shape.coordinates, 2))')
+                      fill(tlsph ? 0 : particle_spacing / 2, size(shape.coordinates, 2))')
 
         # TODO: 2D shapes not only in x-y plane but in any user-defined plane
         return coords, particle_spacing
@@ -183,7 +183,7 @@ function sample_plane(plane_points::NTuple{3}, particle_spacing; tlsph=nothing)
     edge2 = point3_ - point1_
 
     # Check if the points are collinear
-    if isapprox(norm(cross(edge1, edge2)), 0.0; atol=eps())
+    if isapprox(norm(cross(edge1, edge2)), 0; atol=eps())
         throw(ArgumentError("the vectors `AB` and `AC` must not be collinear"))
     end
 
@@ -229,8 +229,8 @@ function shift_plane_corners(plane_points::NTuple{2}, direction, particle_spacin
     plane_point2 = copy(plane_points[2])
 
     # Vectors shifting the points in the corresponding direction
-    dir1 = 0.5 * particle_spacing * direction
-    dir2 = 0.5 * particle_spacing * normalize(plane_point2 - plane_point1)
+    dir1 = particle_spacing * direction / 2
+    dir2 = particle_spacing * normalize(plane_point2 - plane_point1) / 2
 
     plane_point1 .+= dir1 + dir2
     plane_point2 .+= dir1 - dir2
@@ -251,9 +251,9 @@ function shift_plane_corners(plane_points::NTuple{3}, direction, particle_spacin
     edge2 = normalize(plane_point3 - plane_point1)
 
     # Vectors shifting the points in the corresponding direction
-    dir1 = 0.5 * particle_spacing * direction
-    dir2 = 0.5 * particle_spacing * edge1
-    dir3 = 0.5 * particle_spacing * edge2
+    dir1 = particle_spacing * direction / 2
+    dir2 = particle_spacing * edge1 / 2
+    dir3 = particle_spacing * edge2 / 2
 
     plane_point1 .+= dir1 + dir2 + dir3
     plane_point2 .+= dir1 - dir2 + dir3
diff --git a/src/setups/rectangular_tank.jl b/src/setups/rectangular_tank.jl
index 70b179e3b..340272fb5 100644
--- a/src/setups/rectangular_tank.jl
+++ b/src/setups/rectangular_tank.jl
@@ -95,10 +95,10 @@ struct RectangularTank{NDIMS, NDIMSt2, ELTYPE <: Real}
 
     function RectangularTank(particle_spacing, fluid_size, tank_size, fluid_density;
                              velocity=zeros(length(fluid_size)), fluid_mass=nothing,
-                             pressure=0.0,
+                             pressure=0,
                              acceleration=nothing, state_equation=nothing,
                              boundary_density=fluid_density,
-                             n_layers=1, spacing_ratio=1.0,
+                             n_layers=1, spacing_ratio=1,
                              min_coordinates=zeros(length(fluid_size)),
                              faces=Tuple(trues(2 * length(fluid_size))))
         NDIMS = length(fluid_size)
diff --git a/test/examples/examples.jl b/test/examples/examples.jl
index 4d48a50e9..73d52d017 100644
--- a/test/examples/examples.jl
+++ b/test/examples/examples.jl
@@ -7,13 +7,11 @@
             trixi_include(@__MODULE__,
                           joinpath(examples_dir(), "fluid",
                                    "hydrostatic_water_column_2d.jl"),
-                          fluid_system=nothing, sol=nothing, semi=nothing, ode=nothing)
+                          sol=nothing, ode=nothing)
 
             # Neighborhood search for `FullGridCellList` test below
-            search_radius = TrixiParticles.compact_support(smoothing_kernel,
-                                                           smoothing_length)
-            min_corner = minimum(tank.boundary.coordinates, dims=2) .- search_radius
-            max_corner = maximum(tank.boundary.coordinates, dims=2) .+ search_radius
+            min_corner = minimum(tank.boundary.coordinates, dims=2)
+            max_corner = maximum(tank.boundary.coordinates, dims=2)
             cell_list = FullGridCellList(; min_corner, max_corner)
             semi_fullgrid = Semidiscretization(fluid_system, boundary_system,
                                                neighborhood_search=GridNeighborhoodSearch{2}(;
@@ -286,7 +284,8 @@
                                            extra_callback=steady_state_reached,
                                            tspan=(0.0, 1.5))
 
-            @test sol.t[end] < 1.0
+            # Make sure that the simulation is terminated after a reasonable amount of time
+            @test 0.1 < sol.t[end] < 1.0
             @test sol.retcode == ReturnCode.Terminated
         end
 
@@ -300,7 +299,8 @@
                                            extra_callback=steady_state_reached,
                                            tspan=(0.0, 1.5))
 
-            @test sol.t[end] < 1.0
+            # Make sure that the simulation is terminated after a reasonable amount of time
+            @test 0.1 < sol.t[end] < 1.0
             @test sol.retcode == ReturnCode.Terminated
         end
 
diff --git a/test/examples/gpu.jl b/test/examples/gpu.jl
new file mode 100644
index 000000000..b305d1592
--- /dev/null
+++ b/test/examples/gpu.jl
@@ -0,0 +1,369 @@
+const TRIXIPARTICLES_TEST_ = lowercase(get(ENV, "TRIXIPARTICLES_TEST", "all"))
+
+if TRIXIPARTICLES_TEST_ == "cuda"
+    using CUDA
+    CUDA.versioninfo()
+    data_type = CuArray
+    supports_double_precision = true
+elseif TRIXIPARTICLES_TEST_ == "amdgpu"
+    using AMDGPU
+    AMDGPU.versioninfo()
+    data_type = ROCArray
+    supports_double_precision = true
+elseif TRIXIPARTICLES_TEST_ == "metal"
+    using Metal
+    Metal.versioninfo()
+    data_type = MtlArray
+    supports_double_precision = false
+elseif TRIXIPARTICLES_TEST_ == "oneapi"
+    using oneAPI
+    oneAPI.versioninfo()
+    data_type = oneArray
+    # The runners are using an iGPU, which does not support double precision
+    supports_double_precision = false
+else
+    error("Unknown GPU backend: $TRIXIPARTICLES_TEST_")
+end
+
+@testset verbose=true "Examples $TRIXIPARTICLES_TEST_" begin
+    @testset verbose=true "Fluid" begin
+        @trixi_testset "fluid/dam_break_2d_gpu.jl Float64" begin
+            if Main.supports_double_precision
+                @test_nowarn_mod trixi_include(@__MODULE__,
+                                               joinpath(examples_dir(), "fluid",
+                                                        "dam_break_2d_gpu.jl"),
+                                               tspan=(0.0, 0.1),
+                                               data_type=Main.data_type) [
+                    r"┌ Info: The desired tank length in y-direction .*\n",
+                    r"└ New tank length in y-direction.*\n"
+                ]
+                @test semi.neighborhood_searches[1][1].cell_list isa FullGridCellList
+                @test sol.retcode == ReturnCode.Success
+                @test sol.u[end].x[1] isa Main.data_type
+            else
+                error = "Metal does not support Float64 values, try using Float32 instead"
+                @test_throws error trixi_include(@__MODULE__,
+                                                 joinpath(examples_dir(), "fluid",
+                                                          "dam_break_2d_gpu.jl"),
+                                                 tspan=(0.0, 0.1),
+                                                 data_type=Main.data_type)
+            end
+        end
+
+        @trixi_testset "fluid/dam_break_2d_gpu.jl Float32" begin
+            # Import variables into scope
+            trixi_include_changeprecision(Float32, @__MODULE__,
+                                          joinpath(examples_dir(),
+                                                   "fluid", "dam_break_2d.jl");
+                                          sol=nothing, ode=nothing)
+
+            dam_break_tests = Dict(
+                "default" => (),
+                "DensityDiffusionMolteniColagrossi" => (density_diffusion=DensityDiffusionMolteniColagrossi(delta=0.1f0),),
+                "DensityDiffusionFerrari" => (density_diffusion=DensityDiffusionFerrari(),)
+            )
+
+            for (test_description, kwargs) in dam_break_tests
+                @testset "$test_description" begin
+                    println("═"^100)
+                    println("$test_description")
+
+                    @test_nowarn_mod trixi_include_changeprecision(Float32, @__MODULE__,
+                                                                   joinpath(examples_dir(),
+                                                                            "fluid",
+                                                                            "dam_break_2d_gpu.jl");
+                                                                   tspan=(0.0f0, 0.1f0),
+                                                                   data_type=Main.data_type,
+                                                                   kwargs...) [
+                        r"┌ Info: The desired tank length in y-direction .*\n",
+                        r"└ New tank length in y-direction.*\n"
+                    ]
+                    @test semi.neighborhood_searches[1][1].cell_list isa FullGridCellList
+                    @test sol.retcode == ReturnCode.Success
+                    @test sol.u[end].x[1] isa Main.data_type
+                end
+            end
+        end
+
+        @trixi_testset "fluid/dam_break_2d_gpu.jl Float32 BoundaryModelMonaghanKajtar" begin
+            # Import variables into scope
+            trixi_include_changeprecision(Float32, @__MODULE__,
+                                          joinpath(examples_dir(),
+                                                   "fluid", "dam_break_2d.jl");
+                                          boundary_layers=1, spacing_ratio=3,
+                                          sol=nothing, ode=nothing)
+
+            boundary_model = BoundaryModelMonaghanKajtar(0.5f0,
+                                                         spacing_ratio,
+                                                         boundary_particle_spacing,
+                                                         tank.boundary.mass)
+
+            @test_nowarn_mod trixi_include_changeprecision(Float32, @__MODULE__,
+                                                           joinpath(examples_dir(),
+                                                                    "fluid",
+                                                                    "dam_break_2d_gpu.jl");
+                                                           tspan=(0.0f0, 0.1f0),
+                                                           boundary_layers=1,
+                                                           spacing_ratio=3,
+                                                           boundary_model=boundary_model,
+                                                           data_type=Main.data_type) [
+                r"┌ Info: The desired tank length in y-direction .*\n",
+                r"└ New tank length in y-direction.*\n"
+            ]
+            @test semi.neighborhood_searches[1][1].cell_list isa FullGridCellList
+            @test sol.retcode == ReturnCode.Success
+            @test sol.u[end].x[1] isa Main.data_type
+        end
+
+        @trixi_testset "fluid/dam_break_3d.jl" begin
+            # Import variables into scope
+            trixi_include_changeprecision(Float32, @__MODULE__,
+                                          joinpath(examples_dir(), "fluid",
+                                                   "dam_break_3d.jl"),
+                                          fluid_particle_spacing=0.1,
+                                          sol=nothing, ode=nothing)
+
+            # Neighborhood search with `FullGridCellList` for GPU compatibility
+            min_corner = minimum(tank.boundary.coordinates, dims=2)
+            max_corner = maximum(tank.boundary.coordinates, dims=2)
+            cell_list = FullGridCellList(; min_corner, max_corner)
+            semi_fullgrid = Semidiscretization(fluid_system, boundary_system,
+                                               neighborhood_search=GridNeighborhoodSearch{3}(;
+                                                                                             cell_list))
+
+            # Note that this simulation only takes 36 time steps on the CPU.
+            # Due to https://github.com/JuliaGPU/Metal.jl/issues/549, it doesn't work on Metal.
+            trixi_include_changeprecision(Float32, @__MODULE__,
+                                          joinpath(examples_dir(), "fluid",
+                                                   "dam_break_3d.jl"),
+                                          tspan=(0.0f0, 0.1f0),
+                                          fluid_particle_spacing=0.1,
+                                          semi=semi_fullgrid,
+                                          data_type=Main.data_type,
+                                          maxiters=36)
+            @test sol.retcode == ReturnCode.Success
+            @test sol.u[end].x[1] isa Main.data_type
+        end
+
+        # Short tests to make sure that different models and kernels work on GPUs
+        @trixi_testset "fluid/hydrostatic_water_column_2d.jl" begin
+            # Import variables into scope
+            trixi_include_changeprecision(Float32, @__MODULE__,
+                                          joinpath(examples_dir(), "fluid",
+                                                   "hydrostatic_water_column_2d.jl"),
+                                          sol=nothing, ode=nothing)
+
+            hydrostatic_water_column_tests = Dict(
+                "WCSPH default" => (),
+                "WCSPH with source term damping" => (source_terms=SourceTermDamping(damping_coefficient=1.0f-4),),
+                "WCSPH with SummationDensity" => (fluid_density_calculator=SummationDensity(),
+                                                  clip_negative_pressure=true),
+                "WCSPH with ViscosityAdami" => (
+                                                # from 0.02*10.0*1.2*0.05/8
+                                                viscosity=ViscosityAdami(nu=0.0015f0),),
+                "WCSPH with ViscosityMorris" => (
+                                                 # from 0.02*10.0*1.2*0.05/8
+                                                 viscosity=ViscosityMorris(nu=0.0015f0),),
+                "WCSPH with ViscosityAdami and SummationDensity" => (
+                                                                     # from 0.02*10.0*1.2*0.05/8
+                                                                     viscosity=ViscosityAdami(nu=0.0015f0),
+                                                                     fluid_density_calculator=SummationDensity(),
+                                                                     maxiters=38, # 38 time steps on CPU
+                                                                     clip_negative_pressure=true),
+                # Broken due to https://github.com/JuliaGPU/CUDA.jl/issues/2681
+                # and https://github.com/JuliaGPU/Metal.jl/issues/550.
+                # "WCSPH with SchoenbergQuarticSplineKernel" => (smoothing_length=1.1,
+                #                                                smoothing_kernel=SchoenbergQuarticSplineKernel{2}()),
+                "WCSPH with SchoenbergQuinticSplineKernel" => (smoothing_length=1.1,
+                                                               smoothing_kernel=SchoenbergQuinticSplineKernel{2}()),
+                "WCSPH with WendlandC2Kernel" => (smoothing_length=3.0,
+                                                  smoothing_kernel=WendlandC2Kernel{2}()),
+                "WCSPH with WendlandC4Kernel" => (smoothing_length=3.5,
+                                                  smoothing_kernel=WendlandC4Kernel{2}()),
+                "WCSPH with WendlandC6Kernel" => (smoothing_length=4.0,
+                                                  smoothing_kernel=WendlandC6Kernel{2}()),
+                "EDAC with source term damping" => (source_terms=SourceTermDamping(damping_coefficient=1.0f-4),
+                                                    fluid_system=EntropicallyDampedSPHSystem(tank.fluid,
+                                                                                             smoothing_kernel,
+                                                                                             smoothing_length,
+                                                                                             sound_speed,
+                                                                                             viscosity=viscosity,
+                                                                                             density_calculator=ContinuityDensity(),
+                                                                                             acceleration=(0.0,
+                                                                                                           -gravity))),
+                "EDAC with SummationDensity" => (fluid_system=EntropicallyDampedSPHSystem(tank.fluid,
+                                                                                          smoothing_kernel,
+                                                                                          smoothing_length,
+                                                                                          sound_speed,
+                                                                                          viscosity=viscosity,
+                                                                                          density_calculator=SummationDensity(),
+                                                                                          acceleration=(0.0,
+                                                                                                        -gravity)),)
+            )
+
+            for (test_description, kwargs) in hydrostatic_water_column_tests
+                @testset "$test_description" begin
+                    println("═"^100)
+                    println("$test_description")
+
+                    # Create systems with the given keyword arguments
+                    trixi_include_changeprecision(Float32, @__MODULE__,
+                                                  joinpath(examples_dir(), "fluid",
+                                                           "hydrostatic_water_column_2d.jl");
+                                                  sol=nothing, ode=nothing,
+                                                  kwargs...)
+
+                    # Neighborhood search with `FullGridCellList` for GPU compatibility
+                    min_corner = minimum(tank.boundary.coordinates, dims=2)
+                    max_corner = maximum(tank.boundary.coordinates, dims=2)
+                    cell_list = FullGridCellList(; min_corner, max_corner,
+                                                 max_points_per_cell=500)
+                    semi_fullgrid = Semidiscretization(fluid_system, boundary_system,
+                                                       neighborhood_search=GridNeighborhoodSearch{2}(;
+                                                                                                     cell_list))
+
+                    # Run the simulation
+                    @test_nowarn_mod trixi_include_changeprecision(Float32, @__MODULE__,
+                                                                   joinpath(examples_dir(),
+                                                                            "fluid",
+                                                                            "hydrostatic_water_column_2d.jl");
+                                                                   semi=semi_fullgrid,
+                                                                   data_type=Main.data_type,
+                                                                   tspan=(0.0f0, 0.1f0),
+                                                                   kwargs...)
+
+                    @test sol.retcode == ReturnCode.Success
+                    @test sol.u[end].x[1] isa Main.data_type
+                end
+            end
+        end
+
+        # Test periodic neighborhood search
+        @trixi_testset "fluid/periodic_channel_2d.jl" begin
+            # Import variables into scope
+            trixi_include_changeprecision(Float32, @__MODULE__,
+                                          joinpath(examples_dir(), "fluid",
+                                                   "periodic_channel_2d.jl"),
+                                          sol=nothing, ode=nothing)
+
+            # Neighborhood search with `FullGridCellList` for GPU compatibility
+            search_radius = TrixiParticles.compact_support(smoothing_kernel,
+                                                           smoothing_length)
+            min_corner = minimum(tank.boundary.coordinates, dims=2)
+            max_corner = maximum(tank.boundary.coordinates, dims=2) .+ 2 * search_radius
+            cell_list = FullGridCellList(; min_corner, max_corner)
+            semi_fullgrid = Semidiscretization(fluid_system, boundary_system,
+                                               neighborhood_search=GridNeighborhoodSearch{2}(;
+                                                                                             cell_list))
+
+            @test_nowarn_mod trixi_include_changeprecision(Float32, @__MODULE__,
+                                                           joinpath(examples_dir(), "fluid",
+                                                                    "periodic_channel_2d.jl"),
+                                                           tspan=(0.0f0, 0.1f0),
+                                                           semi=semi_fullgrid,
+                                                           data_type=Main.data_type)
+            @test sol.retcode == ReturnCode.Success
+            @test sol.u[end].x[1] isa Main.data_type
+        end
+
+        # Test open boundaries and steady-state callback
+        @testset "fluid/pipe_flow_2d.jl - steady state reached (`dt`)" begin
+            # TODO This currently doesn't work on GPUs due to
+            # https://github.com/trixi-framework/PointNeighbors.jl/issues/20.
+
+            # # Import variables into scope
+            # trixi_include_changeprecision(Float32, @__MODULE__,
+            #                               joinpath(examples_dir(), "fluid",
+            #                                        "pipe_flow_2d.jl"),
+            #                               sol=nothing, ode=nothing)
+
+            # # Neighborhood search with `FullGridCellList` for GPU compatibility
+            # min_corner = minimum(pipe.boundary.coordinates, dims=2) .- 8 * particle_spacing
+            # max_corner = maximum(pipe.boundary.coordinates, dims=2) .+ 8 * particle_spacing
+            # cell_list = FullGridCellList(; min_corner, max_corner)
+            # semi_fullgrid = Semidiscretization(fluid_system, boundary_system,
+            #                                    neighborhood_search=GridNeighborhoodSearch{2}(;
+            #                                                                                  cell_list))
+
+            # steady_state_reached = SteadyStateReachedCallback(; dt=0.002, interval_size=10)
+
+            # @test_nowarn_mod trixi_include_changeprecision(Float32, @__MODULE__,
+            #                                                joinpath(examples_dir(), "fluid",
+            #                                                         "pipe_flow_2d.jl"),
+            #                                                extra_callback=steady_state_reached,
+            #                                                tspan=(0.0f0, 1.5f0),
+            #                                                semi=semi_fullgrid,
+            #                                                data_type=Main.data_type)
+
+            # TODO This currently doesn't work on GPUs due to
+            # https://github.com/trixi-framework/PointNeighbors.jl/issues/20.
+
+            # Make sure that the simulation is terminated after a reasonable amount of time
+            @test_skip 0.1 < sol.t[end] < 1.0
+            @test_skip sol.retcode == ReturnCode.Terminated
+            @test_skip sol.u[end].x[1] isa Main.data_type
+        end
+    end
+
+    @testset verbose=true "Solid" begin
+        # TODO after https://github.com/trixi-framework/PointNeighbors.jl/pull/10
+        # is merged, there should be no need to use the `FullGridCellList`.
+        @trixi_testset "solid/oscillating_beam_2d.jl" begin
+            # Import variables into scope
+            trixi_include_changeprecision(Float32, @__MODULE__,
+                                          joinpath(examples_dir(), "solid",
+                                                   "oscillating_beam_2d.jl"),
+                                          sol=nothing, ode=nothing)
+
+            # Neighborhood search with `FullGridCellList` for GPU compatibility
+            min_corner = minimum(solid.coordinates, dims=2)
+            max_corner = maximum(solid.coordinates, dims=2)
+            cell_list = FullGridCellList(; min_corner, max_corner)
+            semi_fullgrid = Semidiscretization(solid_system,
+                                               neighborhood_search=GridNeighborhoodSearch{2}(;
+                                                                                             cell_list))
+
+            @test_nowarn_mod trixi_include_changeprecision(Float32, @__MODULE__,
+                                                           joinpath(examples_dir(), "solid",
+                                                                    "oscillating_beam_2d.jl"),
+                                                           tspan=(0.0f0, 0.1f0),
+                                                           semi=semi_fullgrid,
+                                                           data_type=Main.data_type)
+            @test sol.retcode == ReturnCode.Success
+            @test sol.u[end].x[1] isa Main.data_type
+        end
+    end
+
+    @testset verbose=true "FSI" begin
+        @trixi_testset "fsi/dam_break_gate_2d.jl" begin
+            # Import variables into scope
+            trixi_include_changeprecision(Float32, @__MODULE__,
+                                          joinpath(examples_dir(), "fsi",
+                                                   "dam_break_gate_2d.jl"),
+                                          sol=nothing, ode=nothing)
+
+            # Neighborhood search with `FullGridCellList` for GPU compatibility
+            min_corner = minimum(tank.boundary.coordinates, dims=2)
+            max_corner = maximum(tank.boundary.coordinates, dims=2)
+            max_corner[2] = gate_height + movement_function(0.1)[2]
+            # We need a very high `max_points_per_cell` because the plate resolution
+            # is much finer than the fluid resolution.
+            cell_list = FullGridCellList(; min_corner, max_corner)
+            semi_fullgrid = Semidiscretization(fluid_system, boundary_system_tank,
+                                               boundary_system_gate, solid_system,
+                                               neighborhood_search=GridNeighborhoodSearch{2}(;
+                                                                                             cell_list))
+
+            @test_nowarn_mod trixi_include_changeprecision(Float32, @__MODULE__,
+                                                           joinpath(examples_dir(), "fsi",
+                                                                    "dam_break_gate_2d.jl"),
+                                                           tspan=(0.0f0, 0.4f0),
+                                                           semi=semi_fullgrid,
+                                                           # Needs <1500 steps on the CPU
+                                                           maxiters=1500,
+                                                           data_type=Main.data_type)
+            @test sol.retcode == ReturnCode.Success
+            @test sol.u[end].x[1] isa Main.data_type
+        end
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 9c732c77b..457e6b821 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -11,4 +11,8 @@ const TRIXIPARTICLES_TEST = lowercase(get(ENV, "TRIXIPARTICLES_TEST", "all"))
         include("examples/examples.jl")
         include("validation/validation.jl")
     end
+
+    if TRIXIPARTICLES_TEST in ("cuda", "amdgpu", "metal", "oneapi")
+        include("examples/gpu.jl")
+    end
 end;