From 61ec568bab2d7eedf033d490a0c9533c725bedfb Mon Sep 17 00:00:00 2001 From: Arham Khan Date: Sun, 22 Dec 2024 22:55:12 +0000 Subject: [PATCH 01/23] init minimal zoom backend --- aten/src/ATen/AccumulateType.cpp | 28 +- aten/src/ATen/AccumulateType.h | 31 +- aten/src/ATen/CMakeLists.txt | 49 +- aten/src/ATen/Context.cpp | 16 + aten/src/ATen/Context.h | 14 +- aten/src/ATen/EmptyTensor.cpp | 1 + aten/src/ATen/TensorIndexing.cpp | 3 +- aten/src/ATen/autocast_mode.cpp | 38 + aten/src/ATen/autocast_mode.h | 19 + aten/src/ATen/detail/ZoomHooksInterface.cpp | 48 + aten/src/ATen/detail/ZoomHooksInterface.h | 143 + aten/src/ATen/native/Copy.cpp | 6 +- aten/src/ATen/native/TensorCompare.cpp | 5 +- aten/src/ATen/native/zoom/AmpKernels.cu | 252 ++ aten/src/ATen/native/zoom/CompareEQKernel.cu | 50 + aten/src/ATen/native/zoom/CompareKernels.cu | 103 + aten/src/ATen/native/zoom/Copy.cu | 393 +++ aten/src/ATen/native/zoom/Copy.h | 11 + aten/src/ATen/native/zoom/Equal.cpp | 49 + aten/src/ATen/native/zoom/FillKernel.cu | 30 + aten/src/ATen/native/zoom/MiscUtils.h | 32 + aten/src/ATen/native/zoom/Nonzero.cu | 130 + aten/src/ATen/native/zoom/Resize.cpp | 69 + aten/src/ATen/native/zoom/Resize.h | 61 + aten/src/ATen/native/zoom/TensorCompare.cpp | 23 + aten/src/ATen/native/zoom/TensorCompare.cu | 133 + aten/src/ATen/native/zoom/TensorFactories.cu | 396 +++ aten/src/ATen/native/zoom/TensorShape.cu | 833 +++++ aten/src/ATen/native/zoom/TensorShapeZoom.cpp | 37 + .../ATen/native/zoom/TensorTransformations.cu | 154 + aten/src/ATen/zoom/ATenZoomGeneral.h | 8 + aten/src/ATen/zoom/ApplyGridUtils.cuh | 47 + aten/src/ATen/zoom/AsmUtils.cuh | 85 + aten/src/ATen/zoom/Atomic.cuh | 457 +++ aten/src/ATen/zoom/CachingHostAllocator.cpp | 266 ++ aten/src/ATen/zoom/CachingHostAllocator.h | 39 + aten/src/ATen/zoom/DeviceUtils.cuh | 75 + aten/src/ATen/zoom/EmptyTensor.cpp | 71 + aten/src/ATen/zoom/EmptyTensor.h | 14 + aten/src/ATen/zoom/HIPConfig.h | 9 + aten/src/ATen/zoom/HIPGraph.cpp | 317 ++ aten/src/ATen/zoom/HIPGraph.h | 96 + aten/src/ATen/zoom/HIPGraphsUtils.hpp | 41 + aten/src/ATen/zoom/HIPUtils.h | 20 + aten/src/ATen/zoom/NumericLimits.cuh | 121 + aten/src/ATen/zoom/PeerToPeerAccess.cpp | 59 + aten/src/ATen/zoom/PeerToPeerAccess.h | 12 + aten/src/ATen/zoom/PhiloxHIPState.h | 5 + aten/src/ATen/zoom/PhiloxUtils.hpp | 4 + aten/src/ATen/zoom/PinnedMemoryAllocator.cpp | 32 + aten/src/ATen/zoom/PinnedMemoryAllocator.h | 11 + aten/src/ATen/zoom/ScanUtils.cuh | 72 + aten/src/ATen/zoom/ThrustAllocator.h | 23 + aten/src/ATen/zoom/ZoomApplyUtils.cuh | 537 +++ aten/src/ATen/zoom/ZoomContext.cpp | 69 + aten/src/ATen/zoom/ZoomContext.h | 9 + aten/src/ATen/zoom/ZoomContextLight.h | 85 + aten/src/ATen/zoom/ZoomDataType.h | 97 + aten/src/ATen/zoom/ZoomDevice.h | 17 + aten/src/ATen/zoom/ZoomEvent.h | 213 ++ aten/src/ATen/zoom/ZoomGeneratorImpl.cpp | 512 +++ aten/src/ATen/zoom/ZoomGeneratorImpl.h | 181 + aten/src/ATen/zoom/cub-RadixSortKeys.cu | 59 + aten/src/ATen/zoom/cub-RadixSortPairs.cu | 86 + aten/src/ATen/zoom/cub.cu | 51 + aten/src/ATen/zoom/cub.cuh | 284 ++ aten/src/ATen/zoom/cub.h | 88 + aten/src/ATen/zoom/cub_definitions.cuh | 27 + .../ATen/zoom/detail/DeviceThreadHandles.h | 151 + aten/src/ATen/zoom/detail/IndexUtils.cu | 75 + aten/src/ATen/zoom/detail/IndexUtils.cuh | 36 + aten/src/ATen/zoom/detail/KernelUtils.h | 37 + .../ATen/zoom/detail/PhiloxHIPStateRaw.hpp | 43 + aten/src/ATen/zoom/detail/TensorInfo.cuh | 116 + aten/src/ATen/zoom/detail/UnpackRaw.hpp | 28 + aten/src/ATen/zoom/detail/ZoomHooks.cpp | 273 ++ 
aten/src/ATen/zoom/detail/ZoomHooks.h | 36 + aten/src/ATen/zoom/hiprtc_stub/ATenHIPRTC.cpp | 13 + aten/src/ATen/zoom/hiprtc_stub/ATenHIPRTC.h | 85 + aten/src/ATen/zoom/jit/HIPJitLoops.cuh | 292 ++ aten/src/ATen/zoom/jit/HIPLoops.cuh | 333 ++ aten/src/ATen/zoom/jit/IntegerDivider.cuh | 126 + aten/src/ATen/zoom/jit/JitLoops.cuh | 182 + aten/src/ATen/zoom/jit/Loops.cuh | 325 ++ aten/src/ATen/zoom/jit/MemoryAccess.cuh | 395 +++ aten/src/ATen/zoom/jit/OffsetCalculator.cuh | 115 + aten/src/ATen/zoom/jit/jit_utils.cpp | 1752 ++++++++++ aten/src/ATen/zoom/jit/jit_utils.h | 230 ++ aten/src/ATen/zoom/jit/llvm_jit_strings.cpp | 1444 ++++++++ aten/src/ATen/zoom/jit/llvm_jit_strings.h | 14 + aten/src/ATen/zoom/jit/macros.h | 4 + aten/src/ATen/zoom/jit/thread_constants.h | 16 + build_variables.bzl | 15 + c10/CMakeLists.txt | 4 + c10/core/Allocator.cpp | 15 + c10/core/Allocator.h | 14 + c10/macros/Export.h | 2 + c10/macros/Macros.h | 2 +- c10/util/generic_math.h | 6 +- c10/zoom/CMakeLists.txt | 60 + c10/zoom/HIPGraphsC10Utils.h | 77 + c10/zoom/HIPMathCompat.h | 152 + c10/zoom/ZoomAllocatorConfig.cpp | 350 ++ c10/zoom/ZoomAllocatorConfig.h | 128 + c10/zoom/ZoomCachingAllocator.cpp | 3104 +++++++++++++++++ c10/zoom/ZoomCachingAllocator.h | 480 +++ c10/zoom/ZoomDeviceAssertionHost.cpp | 344 ++ c10/zoom/ZoomDeviceAssertionHost.h | 164 + c10/zoom/ZoomException.cpp | 88 + c10/zoom/ZoomException.h | 185 + c10/zoom/ZoomFunctions.cpp | 294 ++ c10/zoom/ZoomFunctions.h | 112 + c10/zoom/ZoomGuard.h | 301 ++ c10/zoom/ZoomMacros.h | 41 + c10/zoom/ZoomMallocAsyncAllocator.cpp | 899 +++++ c10/zoom/ZoomMiscFunctions.cpp | 23 + c10/zoom/ZoomMiscFunctions.h | 8 + c10/zoom/ZoomStream.cpp | 375 ++ c10/zoom/ZoomStream.h | 221 ++ c10/zoom/impl/ZoomGuardImpl.cpp | 7 + c10/zoom/impl/ZoomGuardImpl.h | 249 ++ caffe2/CMakeLists.txt | 126 + cmake/Caffe2Config.cmake.in | 4 + cmake/Codegen.cmake | 3 + cmake/Dependencies.cmake | 14 +- cmake/External/aotriton.cmake | 5 +- cmake/Summary.cmake | 2 + cmake/public/LoadHIP.cmake | 4 + torch/CMakeLists.txt | 16 + torch/__init__.py | 1 + torch/_decomp/decompositions.py | 8 +- torch/csrc/Module.cpp | 36 +- .../autograd/python_variable_indexing.cpp | 3 +- torch/csrc/tensor/python_tensor.cpp | 14 + torch/csrc/zoom/Event.cpp | 250 ++ torch/csrc/zoom/Event.h | 18 + torch/csrc/zoom/Graph.cpp | 91 + torch/csrc/zoom/Module.cpp | 1533 ++++++++ torch/csrc/zoom/Module.h | 11 + torch/csrc/zoom/Stream.cpp | 216 ++ torch/csrc/zoom/Stream.h | 20 + torch/csrc/zoom/THCP.h | 10 + torch/csrc/zoom/Tensor.cpp | 15 + torch/csrc/zoom/ZoomPluggableAllocator.cpp | 373 ++ torch/csrc/zoom/ZoomPluggableAllocator.h | 147 + torch/csrc/zoom/comm.cpp | 508 +++ torch/csrc/zoom/comm.h | 52 + torch/csrc/zoom/device_set.h | 11 + torch/csrc/zoom/memory_snapshot.cpp | 376 ++ torch/csrc/zoom/memory_snapshot.h | 27 + torch/csrc/zoom/python_comm.cpp | 109 + torch/csrc/zoom/python_comm.h | 7 + torch/csrc/zoom/shared/hiprt.cpp | 76 + torch/csrc/zoom/utils.cpp | 41 + torch/csrc/zoom/utils.h | 4 + torch/nn/functional.py | 4 +- torch/testing/_internal/common_device_type.py | 29 + torch/testing/_internal/common_utils.py | 28 + torch/testing/_internal/opinfo/core.py | 4 +- torch/utils/cpp_extension.py | 24 +- torch/zoom/__init__.py | 577 +++ torch/zoom/_memory_viz.py | 627 ++++ torch/zoom/_utils.py | 38 + torch/zoom/graphs.py | 479 +++ torch/zoom/memory.py | 910 +++++ torch/zoom/random.py | 179 + torch/zoom/streams.py | 241 ++ 167 files changed, 28424 insertions(+), 44 deletions(-) create mode 100644 
aten/src/ATen/detail/ZoomHooksInterface.cpp create mode 100644 aten/src/ATen/detail/ZoomHooksInterface.h create mode 100644 aten/src/ATen/native/zoom/AmpKernels.cu create mode 100644 aten/src/ATen/native/zoom/CompareEQKernel.cu create mode 100644 aten/src/ATen/native/zoom/CompareKernels.cu create mode 100644 aten/src/ATen/native/zoom/Copy.cu create mode 100644 aten/src/ATen/native/zoom/Copy.h create mode 100644 aten/src/ATen/native/zoom/Equal.cpp create mode 100644 aten/src/ATen/native/zoom/FillKernel.cu create mode 100644 aten/src/ATen/native/zoom/MiscUtils.h create mode 100644 aten/src/ATen/native/zoom/Nonzero.cu create mode 100644 aten/src/ATen/native/zoom/Resize.cpp create mode 100644 aten/src/ATen/native/zoom/Resize.h create mode 100644 aten/src/ATen/native/zoom/TensorCompare.cpp create mode 100644 aten/src/ATen/native/zoom/TensorCompare.cu create mode 100644 aten/src/ATen/native/zoom/TensorFactories.cu create mode 100644 aten/src/ATen/native/zoom/TensorShape.cu create mode 100644 aten/src/ATen/native/zoom/TensorShapeZoom.cpp create mode 100644 aten/src/ATen/native/zoom/TensorTransformations.cu create mode 100644 aten/src/ATen/zoom/ATenZoomGeneral.h create mode 100644 aten/src/ATen/zoom/ApplyGridUtils.cuh create mode 100644 aten/src/ATen/zoom/AsmUtils.cuh create mode 100644 aten/src/ATen/zoom/Atomic.cuh create mode 100644 aten/src/ATen/zoom/CachingHostAllocator.cpp create mode 100644 aten/src/ATen/zoom/CachingHostAllocator.h create mode 100644 aten/src/ATen/zoom/DeviceUtils.cuh create mode 100644 aten/src/ATen/zoom/EmptyTensor.cpp create mode 100644 aten/src/ATen/zoom/EmptyTensor.h create mode 100644 aten/src/ATen/zoom/HIPConfig.h create mode 100644 aten/src/ATen/zoom/HIPGraph.cpp create mode 100644 aten/src/ATen/zoom/HIPGraph.h create mode 100644 aten/src/ATen/zoom/HIPGraphsUtils.hpp create mode 100644 aten/src/ATen/zoom/HIPUtils.h create mode 100644 aten/src/ATen/zoom/NumericLimits.cuh create mode 100644 aten/src/ATen/zoom/PeerToPeerAccess.cpp create mode 100644 aten/src/ATen/zoom/PeerToPeerAccess.h create mode 100644 aten/src/ATen/zoom/PhiloxHIPState.h create mode 100644 aten/src/ATen/zoom/PhiloxUtils.hpp create mode 100644 aten/src/ATen/zoom/PinnedMemoryAllocator.cpp create mode 100644 aten/src/ATen/zoom/PinnedMemoryAllocator.h create mode 100644 aten/src/ATen/zoom/ScanUtils.cuh create mode 100644 aten/src/ATen/zoom/ThrustAllocator.h create mode 100644 aten/src/ATen/zoom/ZoomApplyUtils.cuh create mode 100644 aten/src/ATen/zoom/ZoomContext.cpp create mode 100644 aten/src/ATen/zoom/ZoomContext.h create mode 100644 aten/src/ATen/zoom/ZoomContextLight.h create mode 100644 aten/src/ATen/zoom/ZoomDataType.h create mode 100644 aten/src/ATen/zoom/ZoomDevice.h create mode 100644 aten/src/ATen/zoom/ZoomEvent.h create mode 100644 aten/src/ATen/zoom/ZoomGeneratorImpl.cpp create mode 100644 aten/src/ATen/zoom/ZoomGeneratorImpl.h create mode 100644 aten/src/ATen/zoom/cub-RadixSortKeys.cu create mode 100644 aten/src/ATen/zoom/cub-RadixSortPairs.cu create mode 100644 aten/src/ATen/zoom/cub.cu create mode 100644 aten/src/ATen/zoom/cub.cuh create mode 100644 aten/src/ATen/zoom/cub.h create mode 100644 aten/src/ATen/zoom/cub_definitions.cuh create mode 100644 aten/src/ATen/zoom/detail/DeviceThreadHandles.h create mode 100644 aten/src/ATen/zoom/detail/IndexUtils.cu create mode 100644 aten/src/ATen/zoom/detail/IndexUtils.cuh create mode 100644 aten/src/ATen/zoom/detail/KernelUtils.h create mode 100644 aten/src/ATen/zoom/detail/PhiloxHIPStateRaw.hpp create mode 100644 
aten/src/ATen/zoom/detail/TensorInfo.cuh create mode 100644 aten/src/ATen/zoom/detail/UnpackRaw.hpp create mode 100644 aten/src/ATen/zoom/detail/ZoomHooks.cpp create mode 100644 aten/src/ATen/zoom/detail/ZoomHooks.h create mode 100644 aten/src/ATen/zoom/hiprtc_stub/ATenHIPRTC.cpp create mode 100644 aten/src/ATen/zoom/hiprtc_stub/ATenHIPRTC.h create mode 100644 aten/src/ATen/zoom/jit/HIPJitLoops.cuh create mode 100644 aten/src/ATen/zoom/jit/HIPLoops.cuh create mode 100644 aten/src/ATen/zoom/jit/IntegerDivider.cuh create mode 100644 aten/src/ATen/zoom/jit/JitLoops.cuh create mode 100644 aten/src/ATen/zoom/jit/Loops.cuh create mode 100644 aten/src/ATen/zoom/jit/MemoryAccess.cuh create mode 100644 aten/src/ATen/zoom/jit/OffsetCalculator.cuh create mode 100644 aten/src/ATen/zoom/jit/jit_utils.cpp create mode 100644 aten/src/ATen/zoom/jit/jit_utils.h create mode 100644 aten/src/ATen/zoom/jit/llvm_jit_strings.cpp create mode 100644 aten/src/ATen/zoom/jit/llvm_jit_strings.h create mode 100644 aten/src/ATen/zoom/jit/macros.h create mode 100644 aten/src/ATen/zoom/jit/thread_constants.h create mode 100644 c10/zoom/CMakeLists.txt create mode 100644 c10/zoom/HIPGraphsC10Utils.h create mode 100644 c10/zoom/HIPMathCompat.h create mode 100644 c10/zoom/ZoomAllocatorConfig.cpp create mode 100644 c10/zoom/ZoomAllocatorConfig.h create mode 100644 c10/zoom/ZoomCachingAllocator.cpp create mode 100644 c10/zoom/ZoomCachingAllocator.h create mode 100644 c10/zoom/ZoomDeviceAssertionHost.cpp create mode 100644 c10/zoom/ZoomDeviceAssertionHost.h create mode 100644 c10/zoom/ZoomException.cpp create mode 100644 c10/zoom/ZoomException.h create mode 100644 c10/zoom/ZoomFunctions.cpp create mode 100644 c10/zoom/ZoomFunctions.h create mode 100644 c10/zoom/ZoomGuard.h create mode 100644 c10/zoom/ZoomMacros.h create mode 100644 c10/zoom/ZoomMallocAsyncAllocator.cpp create mode 100644 c10/zoom/ZoomMiscFunctions.cpp create mode 100644 c10/zoom/ZoomMiscFunctions.h create mode 100644 c10/zoom/ZoomStream.cpp create mode 100644 c10/zoom/ZoomStream.h create mode 100644 c10/zoom/impl/ZoomGuardImpl.cpp create mode 100644 c10/zoom/impl/ZoomGuardImpl.h create mode 100644 torch/csrc/zoom/Event.cpp create mode 100644 torch/csrc/zoom/Event.h create mode 100644 torch/csrc/zoom/Graph.cpp create mode 100644 torch/csrc/zoom/Module.cpp create mode 100644 torch/csrc/zoom/Module.h create mode 100644 torch/csrc/zoom/Stream.cpp create mode 100644 torch/csrc/zoom/Stream.h create mode 100644 torch/csrc/zoom/THCP.h create mode 100644 torch/csrc/zoom/Tensor.cpp create mode 100644 torch/csrc/zoom/ZoomPluggableAllocator.cpp create mode 100644 torch/csrc/zoom/ZoomPluggableAllocator.h create mode 100644 torch/csrc/zoom/comm.cpp create mode 100644 torch/csrc/zoom/comm.h create mode 100644 torch/csrc/zoom/device_set.h create mode 100644 torch/csrc/zoom/memory_snapshot.cpp create mode 100644 torch/csrc/zoom/memory_snapshot.h create mode 100644 torch/csrc/zoom/python_comm.cpp create mode 100644 torch/csrc/zoom/python_comm.h create mode 100644 torch/csrc/zoom/shared/hiprt.cpp create mode 100644 torch/csrc/zoom/utils.cpp create mode 100644 torch/csrc/zoom/utils.h create mode 100644 torch/zoom/__init__.py create mode 100644 torch/zoom/_memory_viz.py create mode 100644 torch/zoom/_utils.py create mode 100644 torch/zoom/graphs.py create mode 100644 torch/zoom/memory.py create mode 100644 torch/zoom/random.py create mode 100644 torch/zoom/streams.py diff --git a/aten/src/ATen/AccumulateType.cpp b/aten/src/ATen/AccumulateType.cpp index c4623cc08629c7..55952a6c8ff919 
100644 --- a/aten/src/ATen/AccumulateType.cpp +++ b/aten/src/ATen/AccumulateType.cpp @@ -2,17 +2,20 @@ namespace at { +// TODO(Arham): exchange keys c10::ScalarType toAccumulateType(c10::ScalarType type, c10::DeviceType device) { switch (type) { -#define DEFINE_CASE(scalar_t, TypeNum) \ - case ScalarType::TypeNum: \ - switch (device) { \ - case DeviceType::CUDA: \ - return CppTypeToScalarType>::value; \ - case DeviceType::MPS: \ - return CppTypeToScalarType>::value; \ - default: \ - return CppTypeToScalarType>::value; \ +#define DEFINE_CASE(scalar_t, TypeNum) \ + case ScalarType::TypeNum: \ + switch (device) { \ + case DeviceType::CUDA: \ + return CppTypeToScalarType>::value; \ + case DeviceType::PrivateUse1: \ + return CppTypeToScalarType>::value; \ + case DeviceType::MPS: \ + return CppTypeToScalarType>::value; \ + default: \ + return CppTypeToScalarType>::value; \ } AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF_F8NZ(DEFINE_CASE) @@ -23,7 +26,12 @@ c10::ScalarType toAccumulateType(c10::ScalarType type, c10::DeviceType device) { } c10::ScalarType toAccumulateType(c10::ScalarType type, bool is_cuda) { - return is_cuda ? toAccumulateType(type, c10::DeviceType::CUDA) : toAccumulateType(type, c10::DeviceType::CPU); + #ifndef USE_ZOOM + return is_cuda ? toAccumulateType(type, c10::DeviceType::CUDA) : toAccumulateType(type, c10::DeviceType::CPU); + #else + // TODO(Arham): exchange keys + return is_cuda ? toAccumulateType(type, c10::DeviceType::PrivateUse1) : toAccumulateType(type, c10::DeviceType::CPU); + #endif } } diff --git a/aten/src/ATen/AccumulateType.h b/aten/src/ATen/AccumulateType.h index 0275ef099b03d7..1cdd2423c050a0 100644 --- a/aten/src/ATen/AccumulateType.h +++ b/aten/src/ATen/AccumulateType.h @@ -67,7 +67,12 @@ struct AccumulateType { template struct AccumulateType { - using type = typename AccumulateTypeDevice::type; + #ifndef USE_ZOOM + using type = typename AccumulateTypeDevice::type; + #else + // TODO(Arham): exchange keys + using type = typename AccumulateTypeDevice::type; + #endif }; template @@ -83,6 +88,8 @@ using acc_type = typename AccumulateType::type; }; #define MPS_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::MPS) #define CUDA_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::CUDA) +// TODO(Arham): exchange keys +#define ZOOM_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::PrivateUse1) #define CPU_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::CPU) MPS_ACC_TYPE(BFloat16, float); @@ -126,6 +133,28 @@ CUDA_ACC_TYPE(c10::complex, c10::complex); CUDA_ACC_TYPE(c10::complex, c10::complex); CUDA_ACC_TYPE(c10::complex, c10::complex); +#if defined(__HIPCC__) +ZOOM_ACC_TYPE(half, float); +#endif +ZOOM_ACC_TYPE(BFloat16, float); +ZOOM_ACC_TYPE(Half, float); +ZOOM_ACC_TYPE(Float8_e5m2, float); +ZOOM_ACC_TYPE(Float8_e4m3fn, float); +ZOOM_ACC_TYPE(Float8_e5m2fnuz, float); +ZOOM_ACC_TYPE(Float8_e4m3fnuz, float); +ZOOM_ACC_TYPE(float, float); +ZOOM_ACC_TYPE(double, double); +ZOOM_ACC_TYPE(int8_t, int64_t); +ZOOM_ACC_TYPE(uint8_t, int64_t); +ZOOM_ACC_TYPE(char, int64_t); +ZOOM_ACC_TYPE(int16_t, int64_t); +ZOOM_ACC_TYPE(int32_t, int64_t); +ZOOM_ACC_TYPE(int64_t, int64_t); +ZOOM_ACC_TYPE(bool, bool); +ZOOM_ACC_TYPE(c10::complex, c10::complex); +ZOOM_ACC_TYPE(c10::complex, c10::complex); +ZOOM_ACC_TYPE(c10::complex, c10::complex); + CPU_ACC_TYPE(BFloat16, float); CPU_ACC_TYPE(Half, float); CPU_ACC_TYPE(Float8_e5m2, float); diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 9ec458fda45e4c..1cd471cee47bc0 100644 --- 
a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -82,6 +82,12 @@ file(GLOB hip_nvrtc_stub_cpp "hip/nvrtc_stub/*.cpp") file(GLOB miopen_h "miopen/*.h") file(GLOB miopen_cpp "miopen/*.cpp") +file(GLOB zoom_h "zoom/*.h" "zoom/detail/*.h" "zoom/*.cuh" "zoom/detail/*.cuh" "zoom/tunable/*.cuh" "zoom/tunable/*.h" "zoom/jit/*.cuh" "zoom/jit/*.h") +file(GLOB zoom_cpp "zoom/*.cpp" "zoom/detail/*.cpp" "zoom/tunable/*.cpp" "zoom/jit/*.cpp") +file(GLOB zoom_hip "zoom/*.cu" "zoom/detail/*.cu" "zoom/impl/*.cu" "zoom/tunable/*.cu") +file(GLOB zoom_hiprtc_stub_h "zoom/hiprtc_stub/*.h") +file(GLOB zoom_hiprtc_stub_cpp "zoom/hiprtc_stub/*.cpp") + file(GLOB mkl_cpp "mkl/*.cpp") file(GLOB mkldnn_cpp "mkldnn/*.cpp") @@ -166,6 +172,13 @@ file(GLOB native_transformers_hip_cpp "native/transformers/hip/*.cpp") file(GLOB native_quantized_cudnn_hip_cpp "native/quantized/cudnn/hip/*.cpp") file(GLOB native_utils_cpp "native/utils/*.cpp") +file(GLOB native_zoom_hip "native/zoom/*.cu") +file(GLOB native_zoom_hip_h "native/zoom/*.cuh") +file(GLOB native_zoom_cpp "native/zoom/*.cpp") +file(GLOB native_zoom_linalg_cpp "native/zoom/linalg/*.cpp") +file(GLOB native_sparse_zoom_hip "native/sparse/zoom/*.cu") +file(GLOB native_sparse_zoom_cpp "native/sparse/zoom/*.cpp") + # flash_attention sources file(GLOB flash_attention_cuda_cu "native/transformers/cuda/flash_attn/*.cu") file(GLOB flash_attention_cuda_kernels_cu "native/transformers/cuda/flash_attn/kernels/*.cu") @@ -342,6 +355,26 @@ if(USE_ROCM) ) endif() +if(USE_ZOOM) + list(APPEND ATen_ZOOM_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/zoom) + list(APPEND ATen_ZOOM_SRCS + ${ATen_ZOOM_SRCS} + ${zoom_hip} + ${native_zoom_hip} + ${native_zoom_hip_h} + ${native_sparse_zoom_hip} + ) + list(APPEND all_zoom_cpp + ${native_sparse_zoom_cpp} + ${zoom_cpp} + ${native_zoom_cpp} + ${native_zoom_linalg_cpp} + ${zoom_generated_sources} + ${ATen_ZOOM_SRCS} + ${all_zoom_cpp} + ) +endif() + if(USE_XPU) list(APPEND ATen_XPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/xpu) list(APPEND ATen_XPU_SRCS ${xpu_cpp}) @@ -546,6 +579,7 @@ endif() # Include CPU paths for CUDA/HIP as well list(APPEND ATen_CUDA_INCLUDE ${ATen_CPU_INCLUDE}) list(APPEND ATen_HIP_INCLUDE ${ATen_CPU_INCLUDE}) +list(APPEND ATen_ZOOM_INCLUDE ${ATen_CPU_INCLUDE}) list(APPEND ATen_VULKAN_INCLUDE ${ATen_CPU_INCLUDE}) # We have two libraries: libATen_cpu.so and libATen_cuda.so, @@ -576,6 +610,12 @@ if(USE_ROCM) # list(APPEND ATen_HIP_DEPENDENCY_LIBS ATEN_CUDA_FILES_GEN_LIB) endif() +if(USE_ZOOM) + set(ATen_ZOOM_SRCS ${all_zoom_cpp}) + set(ATen_HIPRTC_STUB_SRCS ${zoom_hiprtc_stub_cpp}) + # list(APPEND ATen_ZOOM_DEPENDENCY_LIBS ATEN_ZOOM_FILES_GEN_LIB) +endif() + set(ATEN_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${AT_INSTALL_INCLUDE_DIR}") configure_file(ATenConfig.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake") install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" @@ -583,7 +623,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS} ${native_nested_h} ${ATen_TRANSFORMER_HEADERS}) if(NOT INTERN_BUILD_MOBILE) - list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${xpu_h} ${mps_h} ${native_mps_h} ${native_utils_h} ${miopen_h}) + list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${zoom_h} ${xpu_h} 
${mps_h} ${native_mps_h} ${native_utils_h} ${miopen_h}) # Metal if(USE_PYTORCH_METAL_EXPORT) # Add files needed from exporting metal models(optimized_for_mobile) @@ -611,7 +651,7 @@ foreach(HEADER ${INSTALL_HEADERS}) endforeach() # TODO: Install hip_generated_headers when we have it -foreach(HEADER ${generated_headers} ${cuda_generated_headers}) +foreach(HEADER ${generated_headers} ${cuda_generated_headers} ${zoom_generated_headers}) # NB: Assumed to be flat install(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen) endforeach() @@ -652,7 +692,10 @@ set(ATen_CUDA_LINALG_SRCS ${ATen_CUDA_LINALG_SRCS} PARENT_SCOPE) set(ATen_CUDA_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) set(ATen_CUDA_CU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE) +set(ATen_HIPRTC_STUB_SRCS ${ATen_HIPRTC_STUB_SRCS} PARENT_SCOPE) set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE) +set(ATen_ZOOM_SRCS ${ATen_ZOOM_SRCS} PARENT_SCOPE) +set(ATen_HIPRTC_STUB_SRCS ${ATen_HIPRTC_STUB_SRCS} PARENT_SCOPE) set(ATen_MPS_SRCS ${ATen_MPS_SRCS} PARENT_SCOPE) set(ATen_XPU_SRCS ${ATen_XPU_SRCS} PARENT_SCOPE) set(ATen_QUANTIZED_SRCS ${ATen_QUANTIZED_SRCS} PARENT_SCOPE) @@ -671,12 +714,14 @@ set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE) set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE) set(ATen_HIP_INCLUDE ${ATen_HIP_INCLUDE} PARENT_SCOPE) +set(ATen_ZOOM_INCLUDE ${ATen_ZOOM_INCLUDE} PARENT_SCOPE) set(ATen_XPU_INCLUDE ${ATen_XPU_INCLUDE} PARENT_SCOPE) set(ATen_VULKAN_INCLUDE ${ATen_VULKAN_INCLUDE} PARENT_SCOPE) set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) set(ATen_XPU_DEPENDENCY_LIBS ${ATen_XPU_DEPENDENCY_LIBS} PARENT_SCOPE) set(ATen_HIP_DEPENDENCY_LIBS ${ATen_HIP_DEPENDENCY_LIBS} PARENT_SCOPE) +set(ATen_ZOOM_DEPENDENCY_LIBS ${ATen_ZOOM_DEPENDENCY_LIBS} PARENT_SCOPE) set(FLASH_ATTENTION_CUDA_SOURCES ${FLASH_ATTENTION_CUDA_SOURCES} PARENT_SCOPE) set(MEM_EFF_ATTENTION_CUDA_SOURCES ${MEM_EFF_ATTENTION_CUDA_SOURCES} PARENT_SCOPE) set(ATen_ATTENTION_KERNEL_SRCS ${ATen_ATTENTION_KERNEL_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 7fd191ef3f38c3..1136b05b265491 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -153,6 +153,7 @@ static const char* const cublas_deterministic_configs[] = { ":4096:8", ":16:8" } bool Context::checkCuBLASConfigDeterministic() { bool cublas_config_deterministic = true; + #ifndef USE_ZOOM // If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config // is set to deterministic setting if (hasCUDART() && (versionCUDART() >= 10020)) { @@ -163,6 +164,10 @@ bool Context::checkCuBLASConfigDeterministic() { ); } return cublas_config_deterministic; + #else + // Zoom uses hipBLAS with the rocBLAS backend - this is only deterministic if atomics are disabled + return checkHIPBlasDeterministic(); + #endif } void Context::alertCuBLASConfigNotDeterministic() const { @@ -171,6 +176,7 @@ void Context::alertCuBLASConfigNotDeterministic() const { return; } + #ifndef USE_ZOOM auto msg = c10::str( "Deterministic behavior was enabled with either `torch.use_deterministic_algorithms(True)` or ", "`at::Context::setDeterministicAlgorithms(true)`, but this operation is not deterministic because ", @@ -180,6 +186,16 @@ void Context::alertCuBLASConfigNotDeterministic() const 
{ cublas_config_var_name, "=", cublas_deterministic_configs[1], ". For more information, go to ", "https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility" ); + #else + auto msg = c10::str( + "Deterministic behavior was enabled with either `torch.use_deterministic_algorithms(True)` or ", + "`at::Context::setDeterministicAlgorithms(true)`, but this operation is not deterministic because ", + "it uses hipBLAS and you have atomic operations enabled. To enable deterministic behavior in this ", + "case, you must set an environment variable before running your PyTorch application: ", + "ROCBLAS_DEFAULT_ATOMICS_MODE = 0. For more information, go to ", + "https://github.com/ROCm/rocBLAS/blob/develop/docs/how-to/what-is-rocblas.rst#bitwise-reproducibility" + ); + #endif if (deterministicAlgorithmsWarnOnly()) { TORCH_WARN(msg); diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index a922bcd5922fc8..f241e91be6f731 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -126,6 +127,9 @@ class TORCH_API Context { static bool hasCuBLASLt() { return detail::getCUDAHooks().hasCuBLASLt(); } + static bool checkHIPBlasDeterministic() { + return detail::getZoomHooks().checkHIPBlasDeterministic(); + } static bool hasHIP() { return detail::getHIPHooks().hasHIP(); } @@ -163,14 +167,18 @@ class TORCH_API Context { } void lazyInitPrivateUse1() { c10::call_once(thp_init, [&] { - if (isPrivateUse1HooksRegistered()) { - at::GetPrivateUse1HooksInterface()->initPrivateUse1(); - } + // if (isPrivateUse1HooksRegistered()) { + // at::GetPrivateUse1HooksInterface()->initPrivateUse1(); + // } + detail::getZoomHooks().initPrivateUse1(); }); } static const at::cuda::NVRTC& getNVRTC() { return detail::getCUDAHooks().nvrtc(); } + static const at::zoom::HIPRTC& getHIPRTC() { + return detail::getZoomHooks().hiprtc(); + } static bool setFlushDenormal(bool on); diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp index 1eb5c070b547c9..8b5cd8e8123920 100644 --- a/aten/src/ATen/EmptyTensor.cpp +++ b/aten/src/ATen/EmptyTensor.cpp @@ -21,6 +21,7 @@ c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) { } else if (at::globalContext().hasXPU()) { return at::detail::getXPUHooks().getPinnedMemoryAllocator(); } else if(at::isPrivateUse1HooksRegistered()) { + // TODO(Arham): exchange keys return at::GetPrivateUse1HooksInterface()->getPinnedMemoryAllocator(); } else { TORCH_CHECK(false, "Need to provide pin_memory allocator to use pin memory.") diff --git a/aten/src/ATen/TensorIndexing.cpp b/aten/src/ATen/TensorIndexing.cpp index bd50282b46ec6a..128298522d48f2 100644 --- a/aten/src/ATen/TensorIndexing.cpp +++ b/aten/src/ATen/TensorIndexing.cpp @@ -50,9 +50,10 @@ static inline void set_item(const Tensor& self, ArrayRef indices, c at::Device self_device = self.device(); // TODO: This qint special case looks very suspicious... 
+ // TODO(Arham): exchange keys if (isQIntType(self.scalar_type())) { value = at::indexing::scalarToTensor(v, device(kCPU).dtype(kFloat), at::Device(kCPU)); - } else if (self_device.is_cuda()) { + } else if (self_device.is_cuda() || self_device.is_privateuseone()) { value = at::indexing::scalarToTensor(v, self.options(), at::Device(kCPU)); } else { value = at::indexing::scalarToTensor(v, self.options(), self_device); diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index 2d01bdeca500b0..8219fafb037b98 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -202,6 +202,44 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) { TORCH_FN((&at::autocast::binary_cross_entropy_banned))); } +// TODO(Arham): exchange keys +TORCH_LIBRARY_IMPL(_, AutocastPrivateUse1, m) { + m.fallback(torch::CppFunction::makeFallthrough()); +} + +TORCH_LIBRARY_IMPL(aten, AutocastPrivateUse1, m) { + // lower_precision_fp +#define _KERNEL_ZOOM_LOW_PRECISION_FP(...) \ + KERNEL_ZOOM(__VA_ARGS__, lower_precision_fp) + + AT_FORALL_LOWER_PRECISION_FP(_KERNEL_ZOOM_LOW_PRECISION_FP) + + // fp32 +#define _KERNEL_ZOOM_FP32(...) KERNEL_ZOOM(__VA_ARGS__, fp32) + + AT_FORALL_FP32(_KERNEL_ZOOM_FP32) + + // fp32_set_opt_dtype +#define _KERNEL_ZOOM_FP32_SET_OPT_DTYPE(...) \ + KERNEL_ZOOM(__VA_ARGS__, fp32_set_opt_dtype) + + AT_FORALL_FP32_SET_OPT_DTYPE(_KERNEL_ZOOM_FP32_SET_OPT_DTYPE) + + // fp32_append_dtype + // The fp32_append_dtype wrapper overrides implicit promotion behavior. + // norm does not implicitly promote, but be aware when adding new ops to this policy. + AT_FORALL_DIFFERENT_REDISPATCH_SIGNATURE( + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_ZOOM) + + // promote +#define _KERNEL_ZOOM_PROMOTE(...) KERNEL_ZOOM(__VA_ARGS__, promote) + + AT_FORALL_PROMOTE(_KERNEL_ZOOM_PROMOTE) + + m.impl(TORCH_SELECTIVE_NAME("aten::binary_cross_entropy"), + TORCH_FN((&at::autocast::binary_cross_entropy_banned))); +} + TORCH_LIBRARY_IMPL(_, AutocastCPU, m) { m.fallback(torch::CppFunction::makeFallthrough()); } diff --git a/aten/src/ATen/autocast_mode.h b/aten/src/ATen/autocast_mode.h index c36030db5b0489..2f897715d03b60 100644 --- a/aten/src/ATen/autocast_mode.h +++ b/aten/src/ATen/autocast_mode.h @@ -708,6 +708,25 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions. REDISPATCH_SIGNATURE, \ POLICY) +// KERNEL_ZOOM/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_ZOOM +// registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastZOOM +// TODO(Arham): exchange keys +#define KERNEL_ZOOM(...) KERNEL(c10::DeviceType::PrivateUse1, __VA_ARGS__) + +#define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_ZOOM( \ + REDISPATCH_FUNC, \ + REGISTER_NAME, \ + REGISTER_SIGNATURE, \ + REDISPATCH_SIGNATURE, \ + POLICY) \ + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE( \ + c10::DeviceType::PrivateUse1, \ + REDISPATCH_FUNC, \ + REGISTER_NAME, \ + REGISTER_SIGNATURE, \ + REDISPATCH_SIGNATURE, \ + POLICY) + // KERNEL_XPU/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_XPU // registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastXPU #define KERNEL_XPU(...) KERNEL(c10::DeviceType::XPU, __VA_ARGS__) diff --git a/aten/src/ATen/detail/ZoomHooksInterface.cpp b/aten/src/ATen/detail/ZoomHooksInterface.cpp new file mode 100644 index 00000000000000..f23de3c899c165 --- /dev/null +++ b/aten/src/ATen/detail/ZoomHooksInterface.cpp @@ -0,0 +1,48 @@ +#include + +#include + +#include + +namespace at { +namespace detail { + +// NB: We purposely leak the CUDA hooks object. 
This is because under some +// situations, we may need to reference the CUDA hooks while running destructors +// of objects which were constructed *prior* to the first invocation of +// getZoomHooks. The example which precipitated this change was the fused +// kernel cache in the JIT. The kernel cache is a global variable which caches +// both CPU and CUDA kernels; CUDA kernels must interact with CUDA hooks on +// destruction. Because the kernel cache handles CPU kernels too, it can be +// constructed before we initialize CUDA; if it contains CUDA kernels at program +// destruction time, you will destruct the CUDA kernels after CUDA hooks has +// been unloaded. In principle, we could have also fixed the kernel cache store +// CUDA kernels in a separate global variable, but this solution is much +// simpler. +// +// CUDAHooks doesn't actually contain any data, so leaking it is very benign; +// you're probably losing only a word (the vptr in the allocated object.) +static ZoomHooksInterface* zoom_hooks = nullptr; + +// init and register extension hooks +void initZoomHooks() { + static c10::once_flag once; + c10::call_once(once, [] { + zoom_hooks = PrivateUse1HooksRegistry()->Create("ZoomHooks", ZoomHooksArgs{}).release(); + if (!zoom_hooks) { + zoom_hooks = new ZoomHooksInterface(); + } + RegisterPrivateUse1HooksInterface(zoom_hooks); + }); +} + +const ZoomHooksInterface& getZoomHooks() { + initZoomHooks(); + return *zoom_hooks; +} + +} // namespace detail + +C10_DEFINE_REGISTRY(PrivateUse1HooksRegistry, ZoomHooksInterface, ZoomHooksArgs) + +} // namespace at \ No newline at end of file diff --git a/aten/src/ATen/detail/ZoomHooksInterface.h b/aten/src/ATen/detail/ZoomHooksInterface.h new file mode 100644 index 00000000000000..0e971a17e5a9c9 --- /dev/null +++ b/aten/src/ATen/detail/ZoomHooksInterface.h @@ -0,0 +1,143 @@ +#pragma once + +#include +#include +#include + +#include + +// Forward-declares at::Generator and at::zoom::NVRTC +namespace at { +struct Generator; +namespace zoom { +struct HIPRTC; +} // namespace zoom +} // namespace at + +// NB: Class must live in `at` due to limitations of Registry.h. +namespace at { + +// #ifdef _MSC_VER +// constexpr const char* ZOOM_HELP = +// "PyTorch splits its backend into two shared libraries: a CPU library " +// "and a CUDA library; this error has occurred because you are trying " +// "to use some CUDA functionality, but the CUDA library has not been " +// "loaded by the dynamic linker for some reason. The CUDA library MUST " +// "be loaded, EVEN IF you don't directly use any symbols from the CUDA library! " +// "One common culprit is a lack of -INCLUDE:?warp_size@cuda@at@@YAHXZ " +// "in your link arguments; many dynamic linkers will delete dynamic library " +// "dependencies if you don't depend on any of their symbols. You can check " +// "if this has occurred by using link on your binary to see if there is a " +// "dependency on *_cuda.dll library."; +// #else +constexpr const char* ZOOM_HELP = + "PyTorch splits its backend into two shared libraries: a CPU library " + "and a ZOOM library; this error has occurred because you are trying " + "to use some ZOOM functionality, but the ZOOM library has not been " + "loaded by the dynamic linker for some reason. The ZOOM library MUST " + "be loaded, EVEN IF you don't directly use any symbols from the ZOOM library! 
" + "One common culprit is a lack of -Wl,--no-as-needed in your link arguments; many " + "dynamic linkers will delete dynamic library dependencies if you don't " + "depend on any of their symbols. You can check if this has occurred by " + "using ldd on your binary to see if there is a dependency on *_cuda.so " + "library."; +// #endif + +// The ZoomHooksInterface is an omnibus interface for any ZOOM functionality +// which we may want to call into from CPU code (and thus must be dynamically +// dispatched, to allow for separate compilation of ZOOM code). How do I +// decide if a function should live in this class? There are two tests: +// +// 1. Does the *implementation* of this function require linking against +// ZOOM libraries? +// +// 2. Is this function *called* from non-ZOOM ATen code? +// +// (2) should filter out many ostensible use-cases, since many times a ZOOM +// function provided by ATen is only really ever used by actual ZOOM code. +// +// TODO: Consider putting the stub definitions in another class, so that one +// never forgets to implement each virtual function in the real implementation +// in ZOOMHooks. This probably doesn't buy us much though. +struct TORCH_API ZoomHooksInterface : PrivateUse1HooksInterface { + // This should never actually be implemented, but it is used to + // squelch -Werror=non-virtual-dtor + virtual ~ZoomHooksInterface() override = default; + + // Initialize THCState and, transitively, the ZOOM state + virtual void initZoom() const { + TORCH_CHECK(false, "Cannot initialize ZOOM without torch_zoom library. ", ZOOM_HELP); + } + + virtual void initPrivateUse1() const override { + initZoom(); + } + + virtual const Generator& getDefaultZoomGenerator(C10_UNUSED DeviceIndex device_index = -1) const { + TORCH_CHECK(false, "Cannot get default ZOOM generator without torch_zoom library. ", ZOOM_HELP); + } + + virtual const Generator& getDefaultGenerator(DeviceIndex device_index) override { return getDefaultZoomGenerator(device_index); }; + + virtual Device getDeviceFromPtr(void* /*data*/) const override { + TORCH_CHECK(false, "Cannot get device of pointer on ZOOM without torch_zoom library. ", ZOOM_HELP); + } + + virtual bool isPinnedPtr(const void* /*data*/) const { + return false; + } + + virtual bool hasROCM() const { + return false; + } + + virtual bool checkHIPBlasDeterministic() const { + TORCH_CHECK(false, "Cannot call checkHIPBlasDeterministic without torch_zoom library", ZOOM_HELP); + } + + virtual const at::zoom::HIPRTC& hiprtc() const { + TORCH_CHECK(false, "HIPRTC requires Zoom. ", ZOOM_HELP); + } + + virtual bool hasPrimaryContext(DeviceIndex device_index) const override { + TORCH_CHECK(false, "Cannot call hasPrimaryContext(", device_index, ") without torch_zoom library. ", ZOOM_HELP); + } + + virtual DeviceIndex current_device() const { + return -1; + } + + virtual Allocator* getPinnedMemoryAllocator() const override { + TORCH_CHECK(false, "Pinned memory requires ZOOM. ", ZOOM_HELP); + } + + virtual Allocator* getZoomDeviceAllocator() const { + TORCH_CHECK(false, "ZoomDeviceAllocator requires ZOOM. ", ZOOM_HELP); + } + + virtual std::string showConfig() const { + TORCH_CHECK(false, "Cannot query detailed ZOOM version without torch_zoom library. ", ZOOM_HELP); + } + + virtual int getNumGPUs() const { + return 0; + } + + virtual void deviceSynchronize(DeviceIndex /*device_index*/) const { + TORCH_CHECK(false, "Cannot synchronize ZOOM device without torch_zoom library. 
", ZOOM_HELP); + } +}; + +// NB: dummy argument to suppress "ISO C++11 requires at least one argument +// for the "..." in a variadic macro" +struct TORCH_API ZoomHooksArgs {}; + +TORCH_DECLARE_REGISTRY(PrivateUse1HooksRegistry, ZoomHooksInterface, ZoomHooksArgs); +#define REGISTER_PRIVATEUSE1_HOOKS(clsname) \ + C10_REGISTER_CLASS(PrivateUse1HooksRegistry, clsname, clsname) + +namespace detail { +TORCH_API void initZoomHooks(); +TORCH_API const ZoomHooksInterface& getZoomHooks(); +} // namespace detail +} // namespace at \ No newline at end of file diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index c5f81e98906dd4..416a607d5c2622 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -130,7 +130,8 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) { // (e.g. XLA) may be supported by overriding copy_ and _copy_from. bool is_supported_device(Device device) { DeviceType device_type = device.type(); - return device_type == kCPU || device_type == kCUDA || device_type == kHIP || device_type == kVulkan || device_type == kMetal || device_type == kMPS; + // TODO(Arham): exchange keys + return device_type == kPrivateUse1 || device_type == kCPU || device_type == kCUDA || device_type == kHIP || device_type == kVulkan || device_type == kMetal || device_type == kMPS; } } // namespace @@ -288,6 +289,9 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking) } else if (iter.device_type(1) == kMPS) { device_type = kMPS; } + else if (iter.device_type(1) == kPrivateUse1) { + device_type = kPrivateUse1; + } // TODO: if we need to, we can also enable this path for quantized tensor if (device_type == kCPU && copy_transpose_valid(self, src) && !self.is_quantized()) { diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index 974ad302ca0c86..72336656842368 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -585,8 +585,9 @@ std::tuple mode(const Tensor& self, int64_t dim, bool keepdim) { std::tuple mode_out(const Tensor& self, int64_t dim, bool keepdim, Tensor& values, Tensor& indices) { - TORCH_CHECK(self.device().is_cpu() || self.is_cuda(), - "mode only supports CPU AND CUDA device type, got: ", self.device().type()); + // TODO(Arham): exchange keys + TORCH_CHECK(self.device().is_cpu() || self.is_cuda() || self.is_privateuseone(), + "mode only supports CPU, CUDA, and Zoom device type, got: ", self.device().type()); TORCH_CHECK(self.layout() == Layout::Strided, "mode only supports strided layout, got: ", self.layout()); TORCH_CHECK(self.device() == values.device(), diff --git a/aten/src/ATen/native/zoom/AmpKernels.cu b/aten/src/ATen/native/zoom/AmpKernels.cu new file mode 100644 index 00000000000000..14fa799fd6d283 --- /dev/null +++ b/aten/src/ATen/native/zoom/AmpKernels.cu @@ -0,0 +1,252 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#define _USE_MATH_DEFINES + +#include + +#include +#include +#include +#include +#include +#include +#include + + +namespace { +// Thin wrapper around https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html#group__CUDA__MATH__SINGLE_1g57a3c8313f570282a1a7bcc78743b08e, +// to ensure the Cuda math library's isfinite is actually what gets called in +// _amp_non_finite_check_and_unscale_cuda_'s gpu_kernel lambda. 
+// +// isfinite_ensure_cuda_math is defined outside at::native because: +// - A bare call to "isfinite(val)" inside at::native causes nvcc to prefer the unrelated +// Tensor at::native::isfinite(const Tensor&), resulting in an error: +// "no suitable constructor exists to convert from "float" to "at::Tensor"" +// - Unfortunately, the Cuda math library documentation doesn't say how (or if) you can provide a full namespace path +// to ensure that its version of a particular function is invoked. It only shows bare (not-namespaced) +// calls to its routines inside kernel or device functions. +// - "std::isfinite(val)" in the gpu_kernel lambda causes an "unspecified launch failure" at runtime with cuda 9 on Windows. +// +// isfinite_ensure_cuda_math, declared at file scope outside the at::native region, uses isfinite as math library docs +// suggest and allows disambiguated usage in the lambda within the at::native region. +// GPU_LAMBDA is defined as __host__ __device__ (see Loops.cuh), so I need the __host__ keyword or else nvcc complains that +// "calling a __device__ function("isfinite_ensure_cuda_math") from a __host__ __device__ function("operator()") is not allowed." +static __host__ __device__ __forceinline__ int isfinite_ensure_zoom_math(float val) { + return isfinite(val); +} +} + +namespace at::native { + +namespace { +// Single-tensor fallback for _amp_foreach_non_finite_check_and_unscale_zoom_. +// Handles individual tensors that are acceptable to unscale but not MTA-safe. +void _amp_non_finite_check_and_unscale_zoom_(Tensor& scaled_grad, + Tensor& found_inf, + const Tensor& inv_scale) +{ + // The only way we reach this function is through _amp_foreach_non_finite_check_and_unscale_zoom_, so no input checks. + + // It's not obvious gpu_kernel always guards onto its argument. Guarding here just in case. + const OptionalDeviceGuard device_guard(device_of(scaled_grad)); + + // Acts on scaled_grad in place. + auto iter = TensorIterator::unary_op(scaled_grad, scaled_grad); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + iter.dtype(), + "_amp_non_finite_check_and_unscale_zoom", + [&iter, &found_inf, &inv_scale] { + auto* found_inf_ptr = found_inf.mutable_data_ptr(); + auto* inv_scale_ptr = inv_scale.const_data_ptr(); + + using opmath_t = at::opmath_type; + + gpu_kernel(iter, + [found_inf_ptr, inv_scale_ptr] GPU_LAMBDA (scalar_t val_in) -> scalar_t { + auto val = static_cast(val_in); + if (!isfinite_ensure_zoom_math(val)) { + *found_inf_ptr = 1.f; + } + // Every thread accesses inv_scale, but it will hit in cache. + const auto inv_scale_val = *inv_scale_ptr; + return static_cast(inv_scale_val == 1.f ? val : val * inv_scale_val); + }); + }); +} +} // anonymous namespace + + +// Multiplies each tensor in scaled_grads by inv_scale in-place. +// If any element of any tensor in scaled_grads is inf or NaN, sets found_inf to 1.0. +// Uses multi tensor apply (MTA) to process all MTA-safe tensors. +// +// Args: +// scaled_grads: A TensorList of scaled gradient tensors. May contain infs or NaNs. +// found_inf: A single-element float tensor to which 1.0 will be written if any gradient contain infs/nans. +// Pre-zeroing found_inf, if appropriate, is the responsibility of the caller. +// inv_scale: The inverse of the scale factor by which scaled_grads are currently multiplied. 
+void _amp_foreach_non_finite_check_and_unscale_zoom_(TensorList scaled_grads, + Tensor& found_inf, + const Tensor& inv_scale) +{ + if (scaled_grads.size() == 0) { + return; + } + + TORCH_CHECK(inv_scale.is_privateuseone(), "inv_scale must be a Zoom tensor."); + TORCH_CHECK(found_inf.is_privateuseone(), "found_inf must be a Zoom tensor."); + TORCH_CHECK(inv_scale.numel() == 1, "inv_scale must be a 1-element tensor."); + TORCH_CHECK(found_inf.numel() == 1, "found_inf must be a 1-element tensor."); + TORCH_CHECK(inv_scale.scalar_type() == at::ScalarType::Float, "inv_scale must be a float tensor."); + TORCH_CHECK(found_inf.scalar_type() == at::ScalarType::Float, "found_inf must be a float tensor."); + + // Ensures client code (GradScaler) filtered scaled_grads by dtype. + check_foreach_api_restrictions(scaled_grads); + + std::vector> tensor_lists; + + // is_non_overlapping_and_dense() is not available in Python. + // GradScaler can't filter for it. We need to filter here. + if (can_use_fast_route(scaled_grads)) { + // Hopefully common case. + // can_use_fast_route is true, which confirms: + // - all scaled_grads are strided + // - all scaled_grads are non overlapping and dense + // - all scaled_grads are on the same device + // - all scaled_grads are of the same dtype + TORCH_CHECK(scaled_grads[0].is_privateuseone(), "scaled_grads must be Zoom tensors."); + // Sets up MTA launch to use scaled_grads as-is. + tensor_lists.emplace_back(scaled_grads.vec()); + } else { + // Hopefully uncommon case. + // can_use_fast_route is an all-or-nothing check. In this path it was false, + // so any of the above confirmations could have gone wrong. + // We filter MTA-safe tensors into an MTA-able list. + // If a tensor is acceptable but not MTA-safe, we fall back to the TensorIterator kernel. + // If a tensor is unacceptable, we throw an error to blame GradScaler. + tensor_lists.resize(1); + tensor_lists[0].reserve(scaled_grads.size()); + auto expected_device = scaled_grads[0].device(); + const auto expected_dtype = scaled_grads[0].scalar_type(); + for (const Tensor& t : scaled_grads) { + // Ensures GradScaler filtered scaled_grads by device. + TORCH_CHECK(t.is_privateuseone(), "one of scaled_grads was not a Zoom tensor."); + TORCH_CHECK(t.device() == expected_device, "scaled_grads must be on the same device."); + TORCH_CHECK(t.layout() == at::kStrided, "one of scaled_grads was not a strided tensor."); + if (!t.is_non_overlapping_and_dense() || t.scalar_type() != expected_dtype) { + // t is acceptable but not MTA-safe. Falls back to single-tensor TensorIterator kernel. + _amp_non_finite_check_and_unscale_zoom_(const_cast(t), + found_inf, + inv_scale); + } else { + tensor_lists[0].push_back(t); + } + } + if (tensor_lists[0].size() == 0) { + return; + } + } + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + tensor_lists[0][0].scalar_type(), + "_amp_foreach_non_finite_check_and_unscale_zoom", + [&tensor_lists, &found_inf, &inv_scale] { + auto* found_inf_ptr = found_inf.mutable_data_ptr(); + auto* inv_scale_ptr = inv_scale.const_data_ptr(); + + using opmath_t = at::opmath_type; + + // multi_tensor_apply guards onto tensor_lists[0][0], no need to guard explicitly. + multi_tensor_apply<1>(tensor_lists, + UnaryOpFunctor(), + [found_inf_ptr, inv_scale_ptr] GPU_LAMBDA (opmath_t val) -> opmath_t { + // There is a slight asymmetry here with the TensorIterator kernel above. + // MTA Functors ensure val comes in as opmath_t rather than scalar_t. 
+ if (!isfinite_ensure_zoom_math(val)) { + *found_inf_ptr = 1.f; + } + // Every thread accesses inv_scale, but it will hit in cache. + const auto inv_scale_val = *inv_scale_ptr; + return static_cast(inv_scale_val == 1.f ? val : val * inv_scale_val); + }); + }); +} + + +// amp_update_scale_zoom_kernel is launched with a single thread to compute the new scale. +// The scale factor is maintained and updated on the GPU to avoid synchronization. +__global__ void amp_update_scale_zoom_kernel(float* current_scale, + int* growth_tracker, + const float* found_inf, + double growth_factor, + double backoff_factor, + int growth_interval) +{ + if (*found_inf) { + *current_scale = (*current_scale)*backoff_factor; + *growth_tracker = 0; + } else { + // Entering this branch means we just carried out a successful step, + // so growth_tracker is incremented before comparing to growth_interval. + auto successful = (*growth_tracker) + 1; + if (successful == growth_interval) { + auto new_scale = static_cast((*current_scale)*growth_factor); + // Do not grow the scale past fp32 bounds to inf. + if (isfinite_ensure_zoom_math(new_scale)) { + *current_scale = new_scale; + } + *growth_tracker = 0; + } else { + *growth_tracker = successful; + } + } +} + + +// _amp_update_scale_zoom asynchronously updates the scale tensor in place. +// +// Args: +// current_scale: A one-element zoom float tensor containing the scale value. +// growth_tracker: A one-element torch.zoom.IntTensor containing the number of recent consecutive unskipped steps. +// found_inf: A one-element zoom float tensor. If > 0, indicates that infs/nans were found by the relevant +// prior _amp_non_finite_check_and_unscale_zoom call, and 0 if no infs/nans were found. +// growth_factor: Multiplier if no infs/NaNs were found (typically slightly > 1). +// backoff_factor: Multiplier if infs/NaNs were found (typically 0.5). +// growth_interval: Number of consecutive unskipped steps that must occur for current_scale to be multiplied by +// growth_factor. 
+// +// Returns: +// current_scale +Tensor& _amp_update_scale_zoom_(Tensor& current_scale, + Tensor& growth_tracker, + const Tensor& found_inf, + double growth_factor, + double backoff_factor, + int64_t growth_interval) +{ + TORCH_CHECK(growth_tracker.is_privateuseone(), "growth_tracker must be a Zoom tensor."); + TORCH_CHECK(current_scale.is_privateuseone(), "current_scale must be a Zoom tensor."); + TORCH_CHECK(found_inf.is_privateuseone(), "found_inf must be a Zoom tensor."); + TORCH_CHECK(growth_tracker.numel() == 1, "growth_tracker must be a 1-element tensor."); + TORCH_CHECK(current_scale.numel() == 1, "current_scale must be a 1-element tensor."); + TORCH_CHECK(found_inf.numel() == 1, "found_inf must be a 1-element tensor."); + TORCH_CHECK(growth_tracker.scalar_type() == at::ScalarType::Int, "growth_tracker must be an int tensor."); + TORCH_CHECK(current_scale.scalar_type() == at::ScalarType::Float, "current_scale must be a float tensor."); + TORCH_CHECK(found_inf.scalar_type() == at::ScalarType::Float, "found_inf must be a float tensor."); + + amp_update_scale_zoom_kernel<<<1, 1, 0, c10::zoom::getCurrentZoomStream()>>>( + current_scale.mutable_data_ptr(), + growth_tracker.mutable_data_ptr(), + found_inf.const_data_ptr(), + growth_factor, + backoff_factor, + growth_interval); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + + return current_scale; +} + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/CompareEQKernel.cu b/aten/src/ATen/native/zoom/CompareEQKernel.cu new file mode 100644 index 00000000000000..b8869c0dc86b31 --- /dev/null +++ b/aten/src/ATen/native/zoom/CompareEQKernel.cu @@ -0,0 +1,50 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include + + +// NOTE: CUDA on Windows requires that the enclosing function +// of a __device__ lambda not have internal linkage. + +namespace at::native { namespace { + +enum class EqOpType {EQ, NE}; + +template +struct CompareEqFunctor{ + CompareEqFunctor(EqOpType op): op_(op) {} + const EqOpType op_; + __device__ __forceinline__ bool operator() (scalar_t a, scalar_t b) const { + if (op_ == EqOpType::EQ) { + return a == b; + } else { //NE + return a != b; + } + + } + }; +} + +C10_NOINLINE void compare_eq_ne_kernel(TensorIteratorBase &iter, EqOpType op) { + AT_DISPATCH_V2(iter.common_dtype(), "compare_eq_ne_zoom", AT_WRAP([&]() { + opmath_symmetric_gpu_kernel_with_scalars( + iter, CompareEqFunctor(op)); + }), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kHalf, kBFloat16, kBool, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); +} + +void eq_kernel_zoom(TensorIteratorBase& iter) { + compare_eq_ne_kernel(iter, EqOpType::EQ); +} + +void ne_kernel_zoom(TensorIteratorBase& iter) { + compare_eq_ne_kernel(iter, EqOpType::NE); +} + +REGISTER_PRIVATEUSE1_DISPATCH(eq_stub, &eq_kernel_zoom); +REGISTER_PRIVATEUSE1_DISPATCH(ne_stub, &ne_kernel_zoom); + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/CompareKernels.cu b/aten/src/ATen/native/zoom/CompareKernels.cu new file mode 100644 index 00000000000000..21da608a35fc94 --- /dev/null +++ b/aten/src/ATen/native/zoom/CompareKernels.cu @@ -0,0 +1,103 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include + + +// NOTE: CUDA on Windows requires that the enclosing function +// of a __device__ lambda not have internal linkage. 
+
+namespace at::native { namespace {
+
+enum class OpType {GE, GT, LE, LT};
+
+template <typename scalar_t>
+struct CompareFunctor{
+  constexpr CompareFunctor(OpType op): op_(op) {};
+  OpType op_;
+  __device__ __forceinline__ bool operator() (scalar_t a, scalar_t b) const {
+    if (op_ == OpType::GE) {
+      return a >= b;
+    } else if (op_ == OpType::GT) {
+      return a > b;
+    } else if (op_ == OpType::LE) {
+      return a <= b;
+    } else { //LT
+      return a < b;
+    }
+  }
+};
+
+// Reflects the comparison operator, so reflect(op)(a, b) == op(b, a)
+OpType reflect(OpType x) {
+  switch (x) {
+    case OpType::GE: return OpType::LE;
+    case OpType::GT: return OpType::LT;
+    case OpType::LE: return OpType::GE;
+    case OpType::LT: return OpType::GT;
+  }
+  TORCH_INTERNAL_ASSERT(false, "Invalid OpType");
+}
+
+} // namespace (anonymous)
+
+template <typename scalar_t>
+void compare_scalar_kernel(TensorIteratorBase &iter, OpType op, scalar_t rhs) {
+  CompareFunctor<scalar_t> f(op);
+  gpu_kernel(iter, [=] GPU_LAMBDA (scalar_t lhs) -> bool {
+    return f(lhs, rhs);
+  });
+}
+
+template <typename scalar_t>
+void compare_kernel_impl(TensorIteratorBase &iter, OpType op) {
+  // If either input is a cpu scalar, perform the equivalent comparison
+  // where the scalar is on the right hand side. This saves us from
+  // generating two otherwise identical kernels with mirrored
+  // arguments.
+  if (iter.is_cpu_scalar(1)) {
+    const scalar_t lhs = iter.scalar_value<scalar_t>(1);
+    iter.remove_operand(1);
+    const DeviceGuard device_guard(iter.device(1));
+    compare_scalar_kernel(iter, reflect(op), lhs);
+  } else if (iter.is_cpu_scalar(2)) {
+    const scalar_t rhs = iter.scalar_value<scalar_t>(2);
+    iter.remove_operand(2);
+    compare_scalar_kernel(iter, op, rhs);
+  } else {
+    CompareFunctor<scalar_t> f(op);
+    gpu_kernel(iter, f);
+  }
+}
+
+C10_NOINLINE void compare_kernel_with_scalars(TensorIteratorBase &iter, OpType op) {
+  AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "compare_zoom", [&]() {
+    compare_kernel_impl<scalar_t>(iter, op);
+  });
+}
+
+
+void ge_kernel_zoom(TensorIteratorBase& iter) {
+  compare_kernel_with_scalars(iter, OpType::GE);
+}
+
+void gt_kernel_zoom(TensorIteratorBase& iter) {
+  compare_kernel_with_scalars(iter, OpType::GT);
+}
+
+void le_kernel_zoom(TensorIteratorBase& iter) {
+  compare_kernel_with_scalars(iter, OpType::LE);
+}
+
+void lt_kernel_zoom(TensorIteratorBase& iter) {
+  compare_kernel_with_scalars(iter, OpType::LT);
+}
+
+REGISTER_PRIVATEUSE1_DISPATCH(ge_stub, &ge_kernel_zoom);
+REGISTER_PRIVATEUSE1_DISPATCH(gt_stub, &gt_kernel_zoom);
+REGISTER_PRIVATEUSE1_DISPATCH(le_stub, &le_kernel_zoom);
+REGISTER_PRIVATEUSE1_DISPATCH(lt_stub, &lt_kernel_zoom);
+
+} // namespace at::native
diff --git a/aten/src/ATen/native/zoom/Copy.cu b/aten/src/ATen/native/zoom/Copy.cu
new file mode 100644
index 00000000000000..3415806851f9fd
--- /dev/null
+++ b/aten/src/ATen/native/zoom/Copy.cu
@@ -0,0 +1,393 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include
+#else
+#include
+#endif
+
+#include
+#include
+
+namespace at::native {
+
+void neg_kernel_zoom(TensorIteratorBase &iter);
+void conj_kernel_zoom(TensorIteratorBase &iter);
+
+void float8_copy_kernel_zoom(TensorIteratorBase &iter) {
+  ScalarType dtype = iter.dtype(0);
+  ScalarType other_dtype = iter.dtype(1);
+  if (dtype == kFloat8_e4m3fn) {
+    switch (other_dtype) {
+      case kFloat:
+        gpu_kernel_nocast(iter, [] GPU_LAMBDA(float value) {
+          return Float8_e4m3fn(value);
+        });
+        break;
+      case kHalf:
gpu_kernel_nocast(iter, [] GPU_LAMBDA(Half value) { + return Float8_e4m3fn(value); + }); + break; + case kBFloat16: + gpu_kernel_nocast(iter, [] GPU_LAMBDA(BFloat16 value) { + return Float8_e4m3fn(value); + }); + break; + default: + gpu_kernel(iter, [] GPU_LAMBDA(Float8_e4m3fn x) { return x; }); + break; + } + } else if (dtype == kFloat8_e5m2) { + switch (other_dtype) { + case kFloat: + gpu_kernel_nocast(iter, [] GPU_LAMBDA(float value) { +#ifdef AT_USE_NV_CVT_INTRINSICS + const auto x = __nv_cvt_float_to_fp8(value, __NV_NOSAT, __NV_E5M2); + return Float8_e5m2(x, Float8_e5m2::from_bits()); +#else + return Float8_e5m2(value); +#endif + }); + break; + case kHalf: + gpu_kernel_nocast(iter, [] GPU_LAMBDA(Half value) { +#ifdef AT_USE_NV_CVT_INTRINSICS + const auto x = __nv_cvt_halfraw_to_fp8(static_cast<__half>(value), __NV_NOSAT, __NV_E5M2); + return Float8_e5m2(x, Float8_e5m2::from_bits()); +#else + return Float8_e5m2(value); +#endif + }); + break; + case kBFloat16: + gpu_kernel_nocast(iter, [] GPU_LAMBDA(BFloat16 value) { +#ifdef AT_USE_NV_CVT_INTRINSICS + const auto x = __nv_cvt_bfloat16raw_to_fp8(static_cast<__nv_bfloat16>(value), __NV_NOSAT, __NV_E5M2); + return Float8_e5m2(x, Float8_e5m2::from_bits()); +#else + return Float8_e5m2(value); +#endif + }); + break; + default: + gpu_kernel(iter, [] GPU_LAMBDA(Float8_e5m2 x) { return x; }); + break; + } + } else if (dtype == kFloat8_e4m3fnuz) { + switch (other_dtype) { + case kFloat: + gpu_kernel_nocast(iter, [] GPU_LAMBDA(float value) { + return Float8_e4m3fnuz(value); + }); + break; + case kHalf: + gpu_kernel_nocast(iter, [] GPU_LAMBDA(Half value) { + return Float8_e4m3fnuz(value); + }); + break; + case kBFloat16: + gpu_kernel_nocast(iter, [] GPU_LAMBDA(BFloat16 value) { + return Float8_e4m3fnuz(value); + }); + break; + default: + gpu_kernel(iter, [] GPU_LAMBDA(Float8_e4m3fnuz x) { return x; }); + break; + } + } else if (dtype == kFloat8_e5m2fnuz) { + switch (other_dtype) { + case kFloat: + gpu_kernel_nocast(iter, [] GPU_LAMBDA(float value) { + return Float8_e5m2fnuz(value); + }); + break; + case kHalf: + gpu_kernel_nocast(iter, [] GPU_LAMBDA(Half value) { + return Float8_e5m2fnuz(value); + }); + break; + case kBFloat16: + gpu_kernel_nocast(iter, [] GPU_LAMBDA(BFloat16 value) { + return Float8_e5m2fnuz(value); + }); + break; + default: + gpu_kernel(iter, [] GPU_LAMBDA(Float8_e5m2fnuz x) { return x; }); + break; + } + } else { + TORCH_CHECK(false, "This supposed ot be called only for Float8 types"); + } +} + +// TODO: We probably can use the opaque type trick to avoid creating duplicate +// kernels for equivalent bit lengths +void direct_copy_kernel_zoom(TensorIteratorBase &iter) { + ScalarType dtype = iter.dtype(0); + if (isQIntType(dtype)) { + AT_DISPATCH_QINT_TYPES(dtype, "copy_", [&] { + gpu_kernel(iter, [] GPU_LAMBDA(scalar_t x) { return x; }); + }); + } else if (dtype == kFloat8_e5m2 || dtype == kFloat8_e4m3fn || dtype == kFloat8_e5m2fnuz || dtype == kFloat8_e4m3fnuz) { + float8_copy_kernel_zoom(iter); + } else if (isBitsType(dtype)) { + TORCH_CHECK(dtype == iter.dtype(1), "copy_() does not support casting " + "bits types to different bits types. 
Source dtype is ", iter.dtype(1), "target dtype is ", dtype); + AT_DISPATCH_BIT_TYPES(dtype, "copy_", [&] { + gpu_kernel_nocast(iter, [] GPU_LAMBDA(scalar_t x) { return x; }); + }); + } else { + AT_DISPATCH_V2( + dtype, "copy_", AT_WRAP([&] { + gpu_kernel(iter, [] GPU_LAMBDA(scalar_t x) { return x; }); + }), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kHalf, kBool, kBFloat16, kComplexHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); + } +} + +void neg_conj_kernel_zoom(TensorIteratorBase &iter) { + AT_DISPATCH_COMPLEX_TYPES(iter.common_dtype(), "neg_conj_zoom", [&] { + gpu_kernel(iter, [] GPU_LAMBDA(scalar_t x) { return -std::conj(x); }); + }); +} + +using namespace at::zoom; + +// device-to-device copy, does type conversion +void copy_device_to_device(TensorIterator& iter, + bool non_blocking, + bool p2p_enabled) { + int64_t numel = iter.numel(); + + // We can memcpy the memory if both tensors have the same type AND both + // tensors are contiguous after dimension coalescing and reordering. + bool same_type = iter.dtype(0) == iter.dtype(1); + bool same_conj = iter.tensor(0).is_conj() == iter.tensor(1).is_conj(); + bool same_neg = iter.tensor(0).is_neg() == iter.tensor(1).is_neg(); + bool memcpy_eligible = same_type && same_conj && same_neg && iter.is_contiguous(); + + Device dst_device = iter.device(0); + Device src_device = iter.device(1); + + c10::zoom::ZoomGuard device_guard(src_device); + + // We always perform the copy on the source device, using the current stream + // on the source device, and we fully synchronize on both src and dst's + // current streams for completion of the copy. We have to explicitly do this + // for non-contig copies. This mimics the behavior of cross-device + // hipMemcpyAsync on the default stream. + c10::zoom::ZoomStream copy_stream = c10::zoom::getCurrentZoomStream(src_device.index()); + if (src_device != dst_device) { + // This is a cross-device copy on the src current stream and dst current + // stream. We perform a two-way barrier between both devices' streams + // before the copy. This ensures that any write-after-write and + // write-after-read dependencies on the destination side are handled, so + // that no one is operating on the dst memory when we perform the copy. + // src waits on dst barrier (src already waits on src) + ZoomEvent dst_ready; + device_guard.set_device(dst_device); + dst_ready.record(c10::zoom::getCurrentZoomStream(dst_device.index())); + + device_guard.set_device(src_device); + dst_ready.block(copy_stream); + } + + if (memcpy_eligible) { + void *dst = iter.data_ptr(0); + void *src = iter.data_ptr(1); + size_t size = numel * iter.element_size(0); + if (src != dst || src_device != dst_device) { + // Due to bizarre cuda driver intricacies, copies of + // hipMallocAsynced memory between devices that aren't + // peer-to-peer-capable need "hipMemcpyPeerAsync". + // So we let the allocator implement the correct call + // (either hipMemcpyAsync or hipMemcpyPeerAsync) + C10_ZOOM_CHECK(c10::zoom::ZoomCachingAllocator::memcpyAsync( + dst, dst_device.index(), + src, src_device.index(), + size, copy_stream, p2p_enabled)); + } + } else { + if (same_neg) { + if (!same_conj) { + conj_kernel_zoom(iter); + } else { + direct_copy_kernel_zoom(iter); + } + } else { + if (!same_conj) { + neg_conj_kernel_zoom(iter); + } else { + neg_kernel_zoom(iter); + } + } + } + + if (src_device != dst_device) { + // dst waits on src barrier (dst already waits on dst). We cannot + // operate on dst's copy until the copy is complete. 
+ + // Still on src_device, record stream event + ZoomEvent src_ready; + src_ready.record(copy_stream); + + device_guard.set_device(dst_device); + src_ready.block(c10::zoom::getCurrentZoomStream(dst_device.index())); + } + + C10_ZOOM_CHECK(hipGetLastError()); +} + +static bool copy_requires_temporaries(TensorIterator& iter, bool p2p_enabled) { + Device dst_device = iter.device(0); + Device src_device = iter.device(1); + + if (dst_device == src_device) { + // We never require temporaries for copies on the same GPU. + TORCH_INTERNAL_ASSERT(dst_device.is_privateuseone() && src_device.is_privateuseone()); + return false; + } + + bool same_dtype = iter.dtype(0) == iter.dtype(1); + if (same_dtype && iter.is_contiguous()) { + // Contiguous same-dtype copies can always use hipMemcpyAsync + return false; + } else if (dst_device.is_privateuseone() && src_device.is_privateuseone()) { + // Copies between GPUs can use the copy kernel if P2P is supported + return !p2p_enabled; + } else { + // The remaining cases require temporaries. For example, this includes + // non-contiguous copies between CPU and GPU. + return true; + } +} + +static bool maybe_enable_p2p_access(Device dst_device, Device src_device) { + if (dst_device.is_cpu() || src_device.is_cpu()) { + return false; + } + return at::zoom::get_p2p_access(src_device.index(), dst_device.index()); +} + +static void copy_kernel_zoom(TensorIterator& iter, bool non_blocking) { + TORCH_CHECK(iter.ntensors() == 2); + + Device dst_device = iter.device(0); + Device src_device = iter.device(1); + + // Enable p2p access between devices. (No-op if it involves the CPU) + bool p2p_enabled = maybe_enable_p2p_access(dst_device, src_device); + + if (copy_requires_temporaries(iter, p2p_enabled)) { + // NB: this involves recursive calls to copy. Be careful that those copies + // don't require temporaries or you will cause an infinite recursion! + auto& dst = iter.tensor(0); + Tensor dst_contig; + Tensor src_contig; + + // If non_blocking is true - type conversions are performed on the GPU + // For blocking transfers conversions are performed on CPU to avoid allocating + // extra GPU memory + // for GPU-GPU transfers conversions are performed on the source device + auto conversion_device = non_blocking ? DeviceType::PrivateUse1 : kCPU; + if (iter.device_type(1) == conversion_device) { + dst_contig = dst.is_contiguous() ? dst : at::empty_like(dst, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + src_contig = iter.tensor(1).to(iter.dtype(0)).expand_as(dst).contiguous(); + } else { + bool same_type = iter.dtype(0) == iter.dtype(1); + dst_contig = (dst.is_contiguous() && same_type) ? 
dst : at::empty_like(dst, iter.dtype(1), LEGACY_CONTIGUOUS_MEMORY_FORMAT); + src_contig = iter.tensor(1).expand_as(dst).contiguous(); + } + + // propagate the correct conjugate bit + dst_contig._set_conj(dst.is_conj()); + src_contig._set_conj(iter.tensor(1).is_conj()); + + dst_contig._set_neg(dst.is_neg()); + src_contig._set_neg(iter.tensor(1).is_neg()); + + // perform a same-dtype copy on contiguous tensors + TORCH_INTERNAL_ASSERT(dst_contig.sizes().equals(src_contig.sizes())); + TORCH_INTERNAL_ASSERT(dst_contig.scalar_type() == src_contig.scalar_type()); + dst_contig.copy_(src_contig, non_blocking); + + // if necessary, copy back into dst + if (!dst_contig.is_same(dst)) { + TORCH_INTERNAL_ASSERT(dst_contig.device() == dst.device()); + dst.copy_(dst_contig, non_blocking); + } + return; + } + + // Copy on GPU (or between GPUs) + if (dst_device.is_privateuseone() && src_device.is_privateuseone()) { + copy_device_to_device(iter, non_blocking, p2p_enabled); + return; + } + + // Copy between CPU and GPU + c10::zoom::OptionalZoomGuard device_guard; + hipMemcpyKind kind; + if (dst_device.is_privateuseone() && src_device.is_cpu()) { + device_guard.set_device(dst_device); + kind = hipMemcpyHostToDevice; + } else if (dst_device.is_cpu() && src_device.is_privateuseone()) { + device_guard.set_device(src_device); + kind = hipMemcpyDeviceToHost; + } else { + TORCH_INTERNAL_ASSERT(false, "unsupported devices in GPU copy_()"); + } + + void* dst = iter.data_ptr(0); + void* src = iter.data_ptr(1); + int64_t nbytes = iter.numel() * iter.element_size(0); + c10::zoom::ZoomStream stream = c10::zoom::getCurrentZoomStream(); + + if (non_blocking) { + C10_ZOOM_CHECK(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + // we use both the storage context and the tensor data pointer as the key + // for the caching host allocator. This allows us to better attribute the + // events to the original tensor allocation correctly. The cases we seek to + // handle are: + + // 1: a user can pass a pinned memory tensor with an alternative + // context, for example if allocating memory directly from the pinned memory + // allocator and constructing a tensor with torch::from_blob. + + // 2: a user can pass a tensor with a different base pointer to the original + // allocation (via slicing). + const auto& dst_tensor = iter.tensor(0); + const auto& src_tensor = iter.tensor(1); + const auto& host_tensor = (dst_device == kCPU ? dst_tensor : src_tensor); + auto* ptr = (dst_device == kCPU ? dst : src); + auto* ctx = host_tensor.storage().data_ptr().get_context(); + // TODO: warn on the return value. 
+ CachingHostAllocator_recordEvent(ptr, ctx, stream); + + } else { + c10::zoom::memcpy_and_sync(dst, src, nbytes, kind, stream); + } + + if (iter.tensor(0).is_conj() != iter.tensor(1).is_conj()) { + iter.tensor(0).conj_physical_(); + } + if (iter.tensor(0).is_neg() != iter.tensor(1).is_neg()) { + iter.tensor(0).neg_(); + } +} + + REGISTER_PRIVATEUSE1_DISPATCH(copy_stub, ©_kernel_zoom); + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/Copy.h b/aten/src/ATen/native/zoom/Copy.h new file mode 100644 index 00000000000000..d7a7243b36dfdf --- /dev/null +++ b/aten/src/ATen/native/zoom/Copy.h @@ -0,0 +1,11 @@ +#pragma once + +namespace at { +struct TensorIteratorBase; + + namespace native { + + void direct_copy_kernel_zoom(TensorIteratorBase &iter); + + } +} \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/Equal.cpp b/aten/src/ATen/native/zoom/Equal.cpp new file mode 100644 index 00000000000000..00f6acf51d0b66 --- /dev/null +++ b/aten/src/ATen/native/zoom/Equal.cpp @@ -0,0 +1,49 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#include +#else +#include +#include +#endif + +namespace at::native { + +bool zoom_equal(const Tensor& self, const Tensor &src) { + if (!at::namedinference::are_names_equal( + self.unsafeGetTensorImpl(), src.unsafeGetTensorImpl())) { + return false; + } + at::NoNamesGuard guard; + TORCH_CHECK(self.device() == src.device(), "Cannot compare two tensors on " + "different devices. Got: ", self.device(), " and ", src.device()); + if (self.sizes() != src.sizes()) { + return false; + } + if (self.numel() == 0) { + return true; + } + + // This is the same optimization done in the cpu_equal. Since the flags like neg/conj should be already handled outside the + // cuda_equal, it should be safe to have the following fast path by + // ensuring the storage and strides exactly the same. + if (self.is_alias_of(src) + && self.storage_offset() == src.storage_offset() + && self.dtype() == src.dtype() + && self.is_contiguous() == src.is_contiguous() + && self.strides().equals(src.strides()) + // Extra checks to ensure the safety in case cuda_equal is directly called in C++. 
+ && self.layout() == src.layout() + && self.is_neg() == src.is_neg() + && self.is_conj() == src.is_conj()) { + return true; + } + + return at::eq(self, src).all().item().to(); +} + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/FillKernel.cu b/aten/src/ATen/native/zoom/FillKernel.cu new file mode 100644 index 00000000000000..24c0a00c54726b --- /dev/null +++ b/aten/src/ATen/native/zoom/FillKernel.cu @@ -0,0 +1,30 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { + +template +struct FillFunctor { + FillFunctor(scalar_t v): value(v) {} + __device__ __forceinline__ scalar_t operator() () const { + return value; + } + private: + scalar_t value; +}; + +void fill_kernel_zoom(TensorIterator& iter, const Scalar& value) { + AT_DISPATCH_V2(iter.dtype(), "fill_zoom", AT_WRAP([&]() { + gpu_kernel(iter, FillFunctor(value.to())); + }), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kBool, kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); +} + +REGISTER_PRIVATEUSE1_DISPATCH(fill_stub, &fill_kernel_zoom); + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/MiscUtils.h b/aten/src/ATen/native/zoom/MiscUtils.h new file mode 100644 index 00000000000000..257c488bd7e98e --- /dev/null +++ b/aten/src/ATen/native/zoom/MiscUtils.h @@ -0,0 +1,32 @@ +// !!! This is a file automatically generated by hipify!!! +#pragma once +#include +#include +#include + +namespace at { +namespace native { + +static inline int zoom_int_cast(int64_t value, const char* varname) { + auto result = static_cast(value); + TORCH_CHECK(static_cast(result) == value, + "zoom_int_cast: The value of ", varname, "(", (long long)value, + ") is too large to fit into a int (", sizeof(int), " bytes)"); + return result; +} + +// Creates an array of size elements of type T, backed by pinned memory +// wrapped in a Storage +template +static inline Storage pin_memory(int64_t size) { + auto* allocator = zoom::getPinnedMemoryAllocator(); + int64_t adjusted_size = size * sizeof(T); + return Storage( + Storage::use_byte_size_t(), + adjusted_size, + allocator, + /*resizable=*/false); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/zoom/Nonzero.cu b/aten/src/ATen/native/zoom/Nonzero.cu new file mode 100644 index 00000000000000..d735795bcc1720 --- /dev/null +++ b/aten/src/ATen/native/zoom/Nonzero.cu @@ -0,0 +1,130 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include //for MAX_DIMS +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + + +namespace at::native { + +namespace{ +template +struct NonZeroOp +{ + __host__ __device__ __forceinline__ bool operator()(const T& a) const { + return (a!=T(0)); + } +}; + +//TODO: actually support int64_t index_t +template +struct TensorDims { + index_t sizes[MAX_DIMS]; +}; + +template +__global__ void write_indices( + int64_t* inp, + TensorDims dims, + int ndim, + index_t n) { + auto index = threadIdx.x + blockIdx.x * blockDim.x; + if (index < n) { + index_t div = 1; + int64_t idx_flat = inp[index]; +#pragma unroll + for (int dim = MAX_DIMS; dim >= 0; dim--) { + if (dim > ndim - 1) + continue; + auto dim_size = dims.sizes[dim]; + inp[index + dim * n] = (idx_flat / div) % dim_size; + div *= dim_size; + } + } +} + +} //anonymous namespace + +template +void nonzero_zoom_out_impl(const 
Tensor& self, Tensor& out){ + Tensor self_ = self.contiguous(); + int N = self_.numel(); + const hipStream_t stream = c10::zoom::getCurrentZoomStream(); +// compute number of nonzero elements + size_t temp_storage_bytes=0; + auto& allocator = *c10::zoom::ZoomCachingAllocator::get(); + auto num_nonzeros = allocator.allocate(sizeof(int)); + hipcub::TransformInputIterator, const scalar_t*> itr(self_.const_data_ptr(), NonZeroOp()); + hipcub::DeviceReduce::Sum(nullptr, temp_storage_bytes, itr, (int*)num_nonzeros.get(), N, stream); + auto temp_storage = allocator.allocate(temp_storage_bytes); + hipcub::DeviceReduce::Sum(temp_storage.get(), temp_storage_bytes, itr, (int*)num_nonzeros.get(), N, stream); + int num_nonzeros_h; + c10::zoom::memcpy_and_sync(&num_nonzeros_h, num_nonzeros.get(), sizeof(int), hipMemcpyDeviceToHost, stream); + //expected output size is num_nonzeros x ndim + //we are producing output with size {num_nonzeros, ndim} and strides {1, num_nonzeros} (that is, transposed ndim x num_nonzeros output) + //we are able to directly use passed output with this size and strides, and we can also (per contract) + //resize passed output with incorrect sizes anyway we want. + //However, out with correct sizes and incorrect strides will have to be copied to from the intermediate we've produced. + bool need_to_copy = out.dim() == 2 && out.sizes()[0] == num_nonzeros_h && out.sizes()[1] == self.dim() && !out.t().is_contiguous(); + at::Tensor out_temp = need_to_copy ? + Tensor(at::detail::empty_zoom({self.dim(), num_nonzeros_h}, out.options())) : + out.resize_({self.dim(), num_nonzeros_h}); + //Scalars are expected to produce output of size (1,0), so we can't write to it + if (self.dim() > 0) { + hipcub::CountingInputIterator counting_itr(0); + temp_storage_bytes = 0; + hipcub::DeviceSelect::Flagged(nullptr, temp_storage_bytes, counting_itr, itr, + out_temp.mutable_data_ptr(), (int*)num_nonzeros.get(), N, stream); + temp_storage = allocator.allocate(temp_storage_bytes); + hipcub::DeviceSelect::Flagged(temp_storage.get(), temp_storage_bytes, counting_itr, itr, + out_temp.mutable_data_ptr(), (int*)num_nonzeros.get(), N, stream); + if (num_nonzeros_h > 0 && self.dim() > 1){ + TensorDims dims; + for (int i=0; i>>(out_temp.mutable_data_ptr(), + dims, self.dim(), num_nonzeros_h); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } + } + if (need_to_copy) { + out.copy_(out_temp.t()); + } else { + //transpose out so it is correct size + Tensor out_ = out_temp.t(); + out.set_(out_); + } +} + +Tensor& nonzero_out_zoom(const Tensor& self, Tensor& out){ + TORCH_CHECK(self.numel() < std::numeric_limits::max(), "nonzero is not supported for tensors with more than INT_MAX elements, \ + See https://github.com/pytorch/pytorch/issues/51871"); + TORCH_CHECK(out.dtype() == at::kLong, "Expected object of scalar type ", at::kLong, " as out, but got ", out.dtype()); + TORCH_CHECK(self.device() == out.device(), "expected self and out to be on the same device, but got out on ", + out.device(), " and self on ", self.device()); + TORCH_CHECK(self.dim() <= MAX_DIMS, "nonzero is not supported for tensor with more than ", MAX_DIMS, " dimensions"); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(at::ScalarType::ComplexHalf, at::ScalarType::Bool, at::ScalarType::BFloat16, at::ScalarType::Half, + self.scalar_type(), "nonzero_zoom", + [&] {nonzero_zoom_out_impl(self, out);}); + return out; +} + +Tensor nonzero_zoom(const Tensor& self){ + Tensor out = at::detail::empty_zoom({0}, self.options().dtype(kLong)); + return at::native::nonzero_out_zoom(self, 
out); +} +} //namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/Resize.cpp b/aten/src/ATen/native/zoom/Resize.cpp new file mode 100644 index 00000000000000..da9a11971c86f3 --- /dev/null +++ b/aten/src/ATen/native/zoom/Resize.cpp @@ -0,0 +1,69 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include + +namespace at::native { + +void resize_bytes_zoom(StorageImpl* storage, size_t size_bytes) { + TORCH_CHECK(storage->resizable(), "Trying to resize storage that is not resizable"); + auto allocator = storage->allocator(); + TORCH_CHECK(allocator != nullptr, "Trying to resize storage without an allocator"); + + c10::Device device = storage->device(); + + if (size_bytes == 0) { + storage->set_data_ptr_noswap(at::DataPtr(nullptr, device)); + storage->set_nbytes(0); + return; + } + + c10::zoom::ZoomGuard guard(device.index()); + at::DataPtr data = allocator->allocate(size_bytes); + if (storage->data_ptr()) { + at::globalContext().lazyInitPrivateUse1(); + + C10_ZOOM_CHECK( + hipMemcpyAsync( + data.get(), + storage->data(), + std::min(storage->nbytes(), size_bytes), + hipMemcpyDeviceToDevice, + c10::zoom::getCurrentZoomStream())); + } + + // Destructively overwrite data_ptr + storage->set_data_ptr_noswap(std::move(data)); + storage->set_nbytes(size_bytes); +} + +const Tensor& resize_zoom_( + const Tensor& self, + IntArrayRef size, + std::optional optional_memory_format) { + if (self.has_names()) { + return resize_named_tensor_(self, size, optional_memory_format); + } + auto* self_ = self.unsafeGetTensorImpl(); + int64_t old_storage_nbytes = self_->unsafe_storage() ? self_->unsafe_storage().nbytes() : 0; + resize_impl_zoom_(self_, size, /*strides=*/c10::nullopt); + if (optional_memory_format.has_value()) { + auto memory_format = + optional_memory_format.value(); + TORCH_CHECK( + memory_format != MemoryFormat::Preserve, + "Unsupported memory format", + memory_format); + self_->empty_tensor_restride(memory_format); + } + // See Note [Enabling Deterministic Operations] + if (C10_UNLIKELY(at::globalContext().deterministicAlgorithms() && at::globalContext().deterministicFillUninitializedMemory())) { + at::native::fill_resize_deterministic_(self, old_storage_nbytes); + } + return self; +} + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/Resize.h b/aten/src/ATen/native/zoom/Resize.h new file mode 100644 index 00000000000000..01c71e3fe861ab --- /dev/null +++ b/aten/src/ATen/native/zoom/Resize.h @@ -0,0 +1,61 @@ +#pragma once + +#include +#include + +#include + +namespace at { namespace native { + +TORCH_ZOOM_API void resize_bytes_zoom(StorageImpl* storage, size_t size_bytes); + +static inline void maybe_resize_storage_zoom(TensorImpl* self, size_t new_size_bytes) { + // It does not make sense to try to resize a storage + // to hold 0 elements, and this can break + // if storage_offset is positive but + // new_size is 0, so just bail in that case + // (same comment is in Resize.h) + if (self->numel() == 0) { + return; + } + + const Storage &storage = self->unsafe_storage(); + TORCH_CHECK(storage, "Tensor: invalid null storage"); + if (new_size_bytes > storage.nbytes()) { + resize_bytes_zoom(storage.unsafeGetStorageImpl(), new_size_bytes); + } +} + +inline TensorImpl* resize_impl_zoom_( + TensorImpl* self, + IntArrayRef size, + at::OptionalIntArrayRef stride, + bool device_guard = true) { + if (self->sizes() == size && (!stride || self->strides() == stride)) { + return self; + } 
+ + // NB: We don't need to hold the device guard when calling from TH + c10::zoom::OptionalZoomGuard guard; + if (device_guard) { + guard.set_index(self->storage().device().index()); + } + + const auto itemsize = self->dtype().itemsize(); + const auto storage_offset = self->storage_offset(); + size_t storage_size = 1; + if (stride) { + self->set_sizes_and_strides(size, *stride); + storage_size = at::detail::computeStorageNbytes( + size, *stride, itemsize, storage_offset); + } else { + self->set_sizes_contiguous(size); + storage_size = at::detail::computeStorageNbytesContiguous( + size, itemsize, storage_offset); + } + maybe_resize_storage_zoom(self, storage_size); + + return self; +} + +}} \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/TensorCompare.cpp b/aten/src/ATen/native/zoom/TensorCompare.cpp new file mode 100644 index 00000000000000..21847fa0b41229 --- /dev/null +++ b/aten/src/ATen/native/zoom/TensorCompare.cpp @@ -0,0 +1,23 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include + +namespace at::native { + +namespace { + +// Composite op implementation for simplicity. This materializes the cross product of elements and test elements, +// so it is not very memory efficient, but it is fast on CUDA. +void isin_default_kernel_gpu( + const Tensor& elements, const Tensor& test_elements, bool invert, const Tensor& out) { + std::vector bc_shape(elements.dim(), 1); + bc_shape.push_back(-1); + out.copy_(invert ? elements.unsqueeze(-1).ne(test_elements.view(bc_shape)).all(-1) + : elements.unsqueeze(-1).eq(test_elements.view(bc_shape)).any(-1)); +} + +} // anonymous namespace + +REGISTER_PRIVATEUSE1_DISPATCH(isin_default_stub, &isin_default_kernel_gpu); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/TensorCompare.cu b/aten/src/ATen/native/zoom/TensorCompare.cu new file mode 100644 index 00000000000000..e92d058c9b7222 --- /dev/null +++ b/aten/src/ATen/native/zoom/TensorCompare.cu @@ -0,0 +1,133 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include + + +namespace at::native { + +namespace { + +void where_kernel_impl(TensorIterator &iter) { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kHalf, kBFloat16, kBool, iter.dtype(), "where_zoom", [&] { + gpu_kernel( + iter, + [=] GPU_LAMBDA (bool cond_val, scalar_t self_val, scalar_t other_val) -> scalar_t { + return cond_val ? 
self_val : other_val; + }); + }); +} + +void isposinf_kernel_impl(TensorIteratorBase &iter) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.input_dtype(), "isposinf_zoom", [&]() { + gpu_kernel( + iter, + [] GPU_LAMBDA (scalar_t a) -> bool { return a == std::numeric_limits::infinity(); } + ); + }); +} + +void isneginf_kernel_impl(TensorIteratorBase &iter) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.input_dtype(), "isneginf_zoom", [&]() { + gpu_kernel( + iter, + [] GPU_LAMBDA (scalar_t a) -> bool { return a == -std::numeric_limits::infinity(); } + ); + }); +} + +void clamp_kernel_impl(TensorIteratorBase& iter) { + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "clamp_zoom", [&] { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t v, scalar_t lower, scalar_t upper) -> scalar_t { + // Propagate nan, which doesn't propagate automatically for ROCm + if (at::_isnan(v)) { + return v; + } if (at::_isnan(lower)) { + return lower; + } if (at::_isnan(upper)) { + return upper; + } else { + return ::min(::max(v, lower), upper); + } + }); + }); +} + +void inline launch_clamp_scalar(TensorIteratorBase& iter, Scalar lim0, Scalar lim1, at::native::detail::ClampLimits minmax){ + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "clamp_scalar_zoom", [&] { + using opmath_t = at::opmath_type; + auto lim0_val = lim0.to(); + auto lim1_val = lim1.to(); + + gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t v) -> scalar_t { + // Propagate nan, which doesn't propagate automatically for ROCm + if (_isnan(static_cast(v))) { + return v; + } else if (minmax==at::native::detail::ClampLimits::Min){ + return ::max(static_cast(v), lim0_val); + } else if (minmax==at::native::detail::ClampLimits::Max){ + return ::min(static_cast(v), lim0_val); + } else { + return ::min(::max(static_cast(v), lim0_val), lim1_val); + } + }); + }); +} + + +void clamp_scalar_kernel_impl(TensorIteratorBase& iter, const Scalar& min, const Scalar& max) { + launch_clamp_scalar(iter, min, max, at::native::detail::ClampLimits::MinMax); +} + +void clamp_min_scalar_kernel_impl(TensorIteratorBase& iter, Scalar min) { + launch_clamp_scalar(iter, min, min, at::native::detail::ClampLimits::Min); +} + +void clamp_max_scalar_kernel_impl(TensorIteratorBase& iter, Scalar max) { + launch_clamp_scalar(iter, max, max, at::native::detail::ClampLimits::Max); +} + +} // anonymous namespace + + +REGISTER_PRIVATEUSE1_DISPATCH(where_kernel, &where_kernel_impl); +REGISTER_PRIVATEUSE1_DISPATCH(isposinf_stub, &isposinf_kernel_impl); +REGISTER_PRIVATEUSE1_DISPATCH(isneginf_stub, &isneginf_kernel_impl); +REGISTER_PRIVATEUSE1_DISPATCH(clamp_stub, &clamp_kernel_impl); +REGISTER_PRIVATEUSE1_DISPATCH(clamp_scalar_stub, &clamp_scalar_kernel_impl); +REGISTER_PRIVATEUSE1_DISPATCH(clamp_min_scalar_stub, &clamp_min_scalar_kernel_impl); +REGISTER_PRIVATEUSE1_DISPATCH(clamp_max_scalar_stub, &clamp_max_scalar_kernel_impl); + +template +__global__ void _assert_async_zoom_kernel(const scalar_t* input) { + ZOOM_KERNEL_ASSERT(input[0] != 0); +} + +__global__ void _assert_async_zoom_kernel(const c10::complex* input) { + ZOOM_KERNEL_ASSERT(input[0] != c10::complex(0, 0)); +} +__global__ void _assert_async_zoom_kernel(const c10::complex* input) { + ZOOM_KERNEL_ASSERT(input[0] != c10::complex(0, 0)); +} + +void _assert_async_zoom(const Tensor& self_tensor) { + const TensorBase &self = get_tensor_base(self_tensor); + auto n = self.numel(); + TORCH_CHECK(n != 0, "Boolean value of Tensor with no 
values is ambiguous"); + TORCH_CHECK(n < 2, "Boolean value of Tensor with more than one value is ambiguous"); + auto stream = c10::zoom::getCurrentZoomStream(); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "_assert_async_zoom", [&] { + _assert_async_zoom_kernel<<<1, 1, 0, stream>>>(self.const_data_ptr()); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + }); +} + +// TODO (tmanlaibaatar) Ignore assert msg for now +void _assert_async_msg_zoom(const Tensor& self_tensor, c10::string_view assert_msg) { + _assert_async_zoom(self_tensor); +} + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/TensorFactories.cu b/aten/src/ATen/native/zoom/TensorFactories.cu new file mode 100644 index 00000000000000..7cf9b0d7ec2417 --- /dev/null +++ b/aten/src/ATen/native/zoom/TensorFactories.cu @@ -0,0 +1,396 @@ + +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +#include +#include +#include + +namespace at::native { + +Tensor& eye_out_zoom(int64_t n, Tensor& result) { + // the default value of `m` equals to `n` + return at::native::eye_out_zoom(n, n, result); +} + +Tensor& eye_out_zoom(int64_t n, int64_t m, Tensor& result) { + TORCH_CHECK(n >= 0, "n must be greater or equal to 0, got ", n); + TORCH_CHECK(m >= 0, "m must be greater or equal to 0, got ", m); + + result.resize_({n, m}); + result.zero_(); + + int64_t sz = std::min(n, m); + int64_t stride = result.stride(0) + result.stride(1); + + Tensor diag = result.as_strided({sz}, {stride}); + diag.fill_(1); + return result; +} + +Tensor empty_zoom(IntArrayRef size, std::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt, c10::optional memory_format_opt) { + Tensor result = at::detail::zoom_empty_memory_format(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt); + // See Note [Enabling Deterministic Operations] + if (C10_UNLIKELY(at::globalContext().deterministicAlgorithms() && at::globalContext().deterministicFillUninitializedMemory())) { + fill_empty_deterministic_(result); + } + return result; +} + +Tensor _efficientzerotensor_zoom(IntArrayRef size, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { + auto device_ = device_or_default(device); + if (!device_.has_index()) { + device_.set_index(c10::zoom::current_device()); + } + auto allocator = at::native::ZeroTensorAllocator(device_); + auto dtype_ = dtype_or_default(dtype); + auto zero_ks = at::DispatchKeySet(c10::DispatchKey::PrivateUse1) | at::DispatchKeySet(c10::DispatchKey::ZeroTensor); + auto out = at::detail::empty_generic(size, &allocator, zero_ks, dtype_, c10::nullopt); + return out; +} + + +Tensor empty_strided_zoom(IntArrayRef size, IntArrayRef stride, std::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { + Tensor result = at::detail::zoom_empty_strided(size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt); + // See Note [Enabling Deterministic Operations] + if (C10_UNLIKELY(at::globalContext().deterministicAlgorithms() && at::globalContext().deterministicFillUninitializedMemory())) { + fill_empty_deterministic_(result); + } + return result; +} + 
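The `eye_out_zoom` overloads above build the identity without a dedicated kernel: zero-fill the result, then `fill_(1)` on an `as_strided` view of the diagonal taken with stride `stride(0) + stride(1)`. The standalone check below (illustration only, arbitrary sizes) shows why that stride walks exactly the main diagonal of a contiguous row-major result.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Flat offset of element (i, j) in a contiguous n x m tensor with strides {m, 1}.
static int64_t flat_offset(int64_t i, int64_t j, int64_t m) { return i * m + j; }

int main() {
  const int64_t n = 3, m = 5;
  const int64_t diag_stride = m + 1;  // == stride(0) + stride(1) for this layout
  for (int64_t k = 0; k < std::min(n, m); ++k) {
    // The k-th element of the strided diagonal view is element (k, k) of result.
    assert(k * diag_stride == flat_offset(k, k, m));
  }
  return 0;
}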
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triangle ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +namespace { +// To find the max integer that does not exceed the root of an int64_t variable, +// we could use a loop to test one bit at a time, which takes up to 31 +// iterations. This would give the accurate result, but is relatively slow and +// is an overkill for most cases where double's precision suffice. +// +// If we directly use sqrt to calculate the root, the conversion from int64_t +// to double would lose 11 bits precision. +// +// The following solution uses sqrt directly for most cases, and would only +// special handle it if there is indeed precision loss. +__device__ +inline int64_t resolve_root_int( + int64_t b, int64_t cX4, int64_t x, int32_t sign) { + int64_t bXb_cX4 = b*b - cX4; + // potential precision loss could occur here when casting int64_t (63 bits + // precision) to double (52 bits precision) + double sr = ::sqrt((double)bXb_cX4); + int64_t res = ::__double2ll_rd((-b + sign * sr)/2); + + // have to cast double to int64_t, otherwise it would only compare up to the + // precision of a double variable, ignoring the precision loss + if (bXb_cX4 != (int64_t) (sr * sr)) { + // handle precision loss by using binary search + int64_t llsr = ::__double2ll_rd(sr); + // Use the following math to reduce search space. + // Suppose z is the accurate result of sqrt(bXb_cX4) without precision loss + // let d = abs(bXb_cX4 - llsr * llsr), then we have: + // z = sqrt(bXb_cX4) <= sqrt(llsr * llsr + d) <= llsr + sqrt(d) + // z = sqrt(bXb_cX4) >= sqrt(llsr * llsr - d) >= llsr - sqrt(d) + // Hence, it is sufficient to search range [llsr - sqrt(d), llsr + sqrt(d)). + // And the true value of row would also be with in range, + // [res - sqrt(d), res + sqrt(d) + 1) + // as the denominator would only reduce the precision penalty. + int64_t diff = + ::__double2ll_ru(::sqrt(::fabs((double)(bXb_cX4 - llsr * llsr)))); + // l never exceeds (could equal to) the target row index + auto l = res > diff ? res - diff : 0; + // r is always larger than the target row index + auto r = res + diff + 1; + + // binary search for the correct answer + x <<= 1; // the loop always compares with 2x, so do it once here + while (l + 1 < r) { + auto m = (l + r) >> 1; + // for tril: + // b = 2f - 1, sign = 1, hence (2f + m - 1) * m / 2 + // for triu: + // b = -2f - 1, sign = -1, hence (2f - m + 1) * m / 2 + if (sign * (b + m) * m > x) { + r = m; + } else { + l = m; + } + } + res = l; + } + + return res; +} + +// f: the number of elements in the first row of the trapezoid. +// x: the index of the target coordinates ordered by row and then column. +// +// View the tril as a top trapezoid stacked on a bottom rectangle. Assume x +// corresponds to the coordinate (row, col) in the trapezoid, where the row and +// the col both start from 0, then we have: +// +// (f + f + row - 1) * row / 2 <= x [1] +// (f + f + row) * (row + 1) / 2 > x [2] +// +// Therefore, row is the maximum integer satisfying the following inequality: +// +// (row + 2f - 1)row <= 2x +// row^2 + (2f-1)row - 2x <= 0. [3] +// +// Based on inequality [3], we have the following coefficients for formula of +// root: +// a = 1 +// b = 2f - 1 +// c = -2x +// There are two roots, and we should use the largest integer that does not +// exceed the root on the right. 
Intuitively, it is because: +// i) the valid solution range of row is between two roots, as it is <= 0; +// ii) as we count in more rows, the total # of elements should always +// increase, hence so does the left-hand side row^2 + (2f-1)row - 2x. +// Therefore, the valid range of row lies in between the nadir point and +// the larger root on the right. +// Full proof can be derived from inequality [2]. So, we calculate the result +// coordinate as: +// +// row = floor((-b + sqrt(b^2 - 4c)) / 2) +// col = x - (f + f + row - 1) * row / 2 +__device__ +inline void get_coordinate_in_tril_trapezoid( + int64_t f, int64_t x, int64_t & row, int64_t & col) { + f <<= 1; // all statements use 2f, so only calculate it once here. + auto b = f - 1; + auto cX4 = - (x << 3); // 4 * c = 4 * (-2x) = -8x; + row = resolve_root_int(b, cX4, x, 1); + col = x - ((f + row - 1) * row >> 1); +} + +// f: the number of elements in the first row of the bottom trapezoid. +// x: the index of the target coordinates ordered by row and then column. +// +// View the triu as a top rectangle stacked on a bottom trapezoid, where the +// trapezoid is upside down. Assume x corresponds to the coordinate (row, col) +// in the bottom trapezoid, where the row and the col start from 0, then we +// have: +// +// (f + f - row + 1) * row / 2 <= x [1] +// (f + f - row) * (row + 1) / 2 > x [2] +// +// Therefore, row is the maximum integer satisfying the following inequality: +// +// (-row + 2f + 1)row <= 2x +// row^2 - (2f+1)row + 2x >= 0. [3] +// +// Based on inequality [3], we have the following coefficients for formula of +// root: +// a = 1 +// b = -1 - 2f +// c = 2x +// There are two roots, and we should use the largest integer that does not +// exceed the root on the left. Intuitively, it is because: +// i) the valid solution range of row is outside of the two roots, as it is < +// > 0; +// ii) as we count in more rows, the total # of elements should always +// increase, hence so does the left-hand side row^2 - (2f+1)row + 2x. +// Therefore, the valid range of row lies to the left of the smaller root +// on the left. +// Full proof can be derived from inequality [2]. So, we calculate the result +// coordinate as: +// +// row = floor((-b - sqrt(b^2 - 4c)) / 2) +// col = x - (f + f - row + 1) * row / 2 +__device__ +inline void get_coordinate_in_triu_trapezoid( + int64_t f, int64_t x, int64_t & row, int64_t & col) { + f <<= 1; // all statements use 2f, so only calculate it once here. 
+ auto b = -1 - f; + auto cX4 = x << 3; // 4 * c = 4 * (2x) = 8x; + row = resolve_root_int(b, cX4, x, -1); + col = x - ((f - row + 1) * row >> 1) + row; +} + +} // namespace + +template +__global__ +C10_LAUNCH_BOUNDS_1(512) +void tril_indices_kernel(scalar_t * tensor, + int64_t row_offset, + int64_t m_first_row, + int64_t col, + int64_t trapezoid_size, + int64_t tril_size) { + int64_t linear_index = blockIdx.x * blockDim.x + threadIdx.x; + + if (linear_index < tril_size) { + int64_t r, c; + if (linear_index < trapezoid_size) { + // the coordinate is within the top trapezoid + get_coordinate_in_tril_trapezoid(m_first_row, linear_index, r, c); + } else { + // the coordinate falls in the bottom rectangle + auto surplus = linear_index - trapezoid_size; + // add the height of trapezoid: m_last_row (col) - m_first_row + 1 + r = surplus / col + col - m_first_row + 1; + c = surplus % col; + } + r += row_offset; + + tensor[linear_index] = r; + tensor[linear_index + tril_size] = c; + } +} + +// Some Large test cases for the fallback binary search path is disabled by +// default to speed up CI tests and to avoid OOM error. When modifying the +// implementation, please enable them in test/test_cuda.py and make sure they +// pass on your local server. +Tensor tril_indices_zoom( + int64_t row, int64_t col, int64_t offset, std::optional dtype_opt, + std::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { + check_args(row, col, layout_opt); + + auto tril_size = get_tril_size(row, col, offset); + auto tensor = empty_zoom({2, tril_size}, dtype_opt, layout_opt, device_opt, pin_memory_opt); + + if (tril_size > 0) { + auto m_first_row = offset > 0 ? + std::min(col, 1 + offset) : // upper bounded by col + row + offset > 0; // either 0 or 1 + auto trapezoid_row_offset = std::max(0, -offset); + auto rectangle_row_offset = trapezoid_row_offset + col - m_first_row + 1; + int64_t rectangle_size = 0; + if (rectangle_row_offset < row) { + rectangle_size = (row - rectangle_row_offset) * col; + } + + dim3 dim_block = zoom::getApplyBlock(); + dim3 dim_grid; + // using tril_size instead of tensor.numel(), as each thread takes care of + // two elements in the tensor. + TORCH_CHECK( + zoom::getApplyGrid(tril_size, dim_grid, tensor.get_device()), + "unable to get dim grid"); + + AT_DISPATCH_INDEX_TYPES(tensor.scalar_type(), "tril_indices_zoom", [&] { + hipLaunchKernelGGL(( tril_indices_kernel), + dim3(dim_grid), dim3(dim_block), 0, c10::zoom::getCurrentZoomStream(), + tensor.mutable_data_ptr(), + trapezoid_row_offset, + m_first_row, + col, + tril_size - rectangle_size, + tril_size); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + }); + } + + return tensor; +} + +template +__global__ +void triu_indices_kernel(scalar_t * tensor, + int64_t col_offset, + int64_t m_first_row, + int64_t col, + int64_t rectangle_size, + int64_t triu_size) { + int64_t linear_index = blockIdx.x * blockDim.x + threadIdx.x; + + if (linear_index < triu_size) { + int64_t r, c; + if (linear_index < rectangle_size) { + // the coordinate is within the top rectangle + r = linear_index / col; + c = linear_index % col; + } else { + // the coordinate falls in the bottom trapezoid + get_coordinate_in_triu_trapezoid( + m_first_row, linear_index - rectangle_size, r, c); + r += rectangle_size / col; + } + + c += col_offset; + tensor[linear_index] = r; + tensor[linear_index + triu_size] = c; + } +} + +// Some Large test cases for the fallback binary search path is disabled by +// default to speed up CI tests and to avoid OOM error. 
When modifying the +// implementation, please enable them in test/test_cuda.py and make sure they +// pass on your local server. +Tensor triu_indices_zoom( + int64_t row, int64_t col, int64_t offset, std::optional dtype_opt, + std::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) { + check_args(row, col, layout_opt); + + auto triu_size = row * col - get_tril_size(row, col, offset - 1); + auto tensor = empty_zoom({2, triu_size}, dtype_opt, layout_opt, device_opt, pin_memory_opt); + + if (triu_size > 0) { + // # of triu elements in the first row + auto m_first_row = offset > 0 ? + std::max(col - offset, 0) : // upper bounded by col + col; + + // size of the top rectangle + int64_t rectangle_size = 0; + if (offset < 0) { + rectangle_size = std::min(row, -offset) * col; + } + + dim3 dim_block = zoom::getApplyBlock(); + dim3 dim_grid; + + // using triu_size instead of tensor.numel(), as each thread takes care of + // two elements in the tensor. + TORCH_CHECK( + zoom::getApplyGrid(triu_size, dim_grid, tensor.get_device()), + "unable to get dim grid"); + + AT_DISPATCH_INDEX_TYPES(tensor.scalar_type(), "triu_indices_zoom", [&] { + hipLaunchKernelGGL(( triu_indices_kernel), + dim3(dim_grid), dim3(dim_block), 0, c10::zoom::getCurrentZoomStream(), + tensor.mutable_data_ptr(), + std::max(0, offset), + m_first_row, + col, + rectangle_size, + triu_size); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + }); + } + + return tensor; +} + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/TensorShape.cu b/aten/src/ATen/native/zoom/TensorShape.cu new file mode 100644 index 00000000000000..5fad25d8a76179 --- /dev/null +++ b/aten/src/ATen/native/zoom/TensorShape.cu @@ -0,0 +1,833 @@ +// !!! This is a file automatically generated by hipify!!! +#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + +namespace at::native { + +namespace detail { + +// NOTE [CUDA fast path for split_with_sizes_copy.out] +// split_with_sizes_copy.out for contiguous operands has the following +// properties: +// - Each src split consists of multiple chunks that are separated by a fixed +// stride. The number of chunks and the strides are the same across all src +// splits. +// - Each dst split is the concatenation of the chunks in its corresponding src +// splits. +// - The sizes of chunks vary across splits. +// - A (src, dst) chunk pair is not guaranteed to have the +// same alignment. +// +// The following strategies are employed to optimize for this workload: +// - The entire workload is fused into a single kernel to maximize I/O +// throughput and minimize wave quantization. +// - To account for both small and large chunk sizes, a "jagged grid" is used. +// Each chunk is processed by one or more blocks depending on its size. +// - Within each chunk, the region in which writes can be vectorized is +// identified. Within this region, writes are always vectorized and reads are +// oppurtunistically vectorized. 
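To make the "jagged grid" described in the note above concrete, the sketch below derives a per-split block assignment from per-split chunk sizes by ceil-dividing each chunk by BLOCK_SIZE * BYTES_PER_THREAD, which is how the launcher later in this file counts blocks. The sizes are made up, and the index arrays here are only one way to materialize the mapping that the kernel receives as `block_idx_to_split_idx` and `blocks_cumsums`; gridDim.y then strides over chunks.

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int64_t bytes_per_block = 128 * 16;  // BLOCK_SIZE * BYTES_PER_THREAD
  // Bytes of one chunk for each of three hypothetical splits.
  const std::vector<int64_t> split_chunk_sizes = {100, 5000, 64};

  std::vector<int64_t> blocks_cumsums = {0};
  std::vector<int64_t> block_idx_to_split_idx;
  for (size_t s = 0; s < split_chunk_sizes.size(); ++s) {
    // Small chunks get one block, large chunks get several, so both coexist
    // in one launch without rounding every split up to the largest chunk.
    const int64_t blocks =
        (split_chunk_sizes[s] + bytes_per_block - 1) / bytes_per_block;
    blocks_cumsums.push_back(blocks_cumsums.back() + blocks);
    block_idx_to_split_idx.insert(block_idx_to_split_idx.end(), blocks, (int64_t)s);
  }
  // Here the splits receive 1, 3 and 1 blocks, so block_idx_to_split_idx is
  // {0, 1, 1, 1, 2} and the first grid dimension is 5.
  std::printf("blocks in x-dimension: %lld\n", (long long)blocks_cumsums.back());
  return 0;
}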
+static constexpr int64_t BLOCK_SIZE = 128; +static constexpr int64_t BYTES_PER_THREAD = 16; +static constexpr int64_t BYTES_PER_BLOCK = BYTES_PER_THREAD * BLOCK_SIZE; + +static __host__ __device__ inline int64_t div_up(int64_t a, int64_t b) { + return (a + b - 1) / b; +} + +template +__device__ inline void stream_load128(uint4& val, const T* addr) { + uint64_t low, high; + low = reinterpret_cast(addr)[0]; + high = reinterpret_cast(addr)[1]; + reinterpret_cast(&val)[0] = low; + reinterpret_cast(&val)[1] = high; +} + +template +__device__ inline void stream_store128(T* addr, const uint4& val) { + uint64_t low, high; + low = reinterpret_cast(&val)[0]; + high = reinterpret_cast(&val)[1]; + reinterpret_cast(addr)[0] = low; + reinterpret_cast(addr)[1] = high; +} + +template +static __device__ inline bool is_aligned(const void* addr) { + return reinterpret_cast(addr) % sizeof(T) == 0; +} + +template +static __device__ inline void load128(uint4& val, const char* addr) { + for (size_t i = 0; i < detail::BYTES_PER_THREAD / sizeof(T); ++i) { + reinterpret_cast(&val)[i] = reinterpret_cast(addr)[i]; + } +} + +template <> +__device__ inline void load128(uint4& val, const char* addr) { + stream_load128(val, addr); +} + +static __device__ inline void load128(uint4& val, const char* addr) { + if (is_aligned(addr)) { + load128(val, addr); + } else if (is_aligned(addr)) { + load128(val, addr); + } else if (is_aligned(addr)) { + load128(val, addr); + } else { + load128(val, addr); + } +} + +static __device__ __inline__ void get_aligned_region( + char* ptr, + const int64_t chunk_size, + const int64_t alignment, + int64_t& align_off, + int64_t& aligned_size) { + const int64_t ptr_val = reinterpret_cast(ptr); + align_off = detail::div_up(ptr_val, alignment) * alignment - ptr_val; + aligned_size = (chunk_size - align_off) / alignment * alignment; +} + +static __device__ __inline__ void copy_chunk( + char* dst, + const char* src, + int64_t chunk_size, + int64_t thread_idx, + int64_t num_threads) { + if (chunk_size < num_threads) { + if (thread_idx < chunk_size) { + dst[thread_idx] = src[thread_idx]; + } + return; + } + + // Identify the region in which writes are guaranteed to be 128-bit aligned + int64_t align_off, aligned_size; + get_aligned_region( + dst, chunk_size, detail::BYTES_PER_THREAD, align_off, aligned_size); + + for (int64_t off = align_off + thread_idx * detail::BYTES_PER_THREAD; + off < align_off + aligned_size; + off += num_threads * detail::BYTES_PER_THREAD) { + uint4 val; + // Oppurtunistically vectorize reads + load128(val, &src[off]); + stream_store128(&dst[off], val); + } + + // Handle unaligned regions + if (thread_idx < align_off && thread_idx < chunk_size) { + dst[thread_idx] = src[thread_idx]; + } + if (align_off + aligned_size + thread_idx < chunk_size) { + dst[align_off + aligned_size + thread_idx] = + src[align_off + aligned_size + thread_idx]; + } +} + +static __global__ void split_with_sizes_copy_out_contiguous_no_cast_kernel( + char** dst_base_addrs, + char** src_base_addrs, + int64_t* split_chunk_sizes, + int64_t* block_idx_to_split_idx, + int64_t* blocks_cumsums, + int64_t src_stride, + int64_t num_chunks) { + const int64_t split_idx = block_idx_to_split_idx[blockIdx.x]; + const int64_t split_blocks = + blocks_cumsums[split_idx + 1] - blocks_cumsums[split_idx]; + const int64_t split_threads = split_blocks * blockDim.x; + const int64_t split_thread_idx = + (blockIdx.x - blocks_cumsums[split_idx]) * blockDim.x + threadIdx.x; + const int64_t split_chunk_size = 
split_chunk_sizes[split_idx]; + + char* dst_base_addr = dst_base_addrs[split_idx]; + char* src_base_addr = src_base_addrs[split_idx]; + + for (int64_t i = blockIdx.y; i < num_chunks; i += gridDim.y) { + copy_chunk( + dst_base_addr + i * split_chunk_size, + src_base_addr + i * src_stride, + split_chunk_size, + split_thread_idx, + split_threads); + } +} + +// Calculate the base addr for each split. +static inline std::vector get_split_base_addrs( + const at::Tensor& tensor, + at::IntArrayRef split_sizes, + int64_t dim) { + const auto* data_ptr = static_cast(tensor.const_data_ptr()); + const auto strides = tensor.strides(); + const auto element_sz = tensor.element_size(); + int64_t off = 0; + std::vector split_base_addrs; + split_base_addrs.reserve(split_sizes.size()); + for (const auto& split_size : split_sizes) { + split_base_addrs.push_back(reinterpret_cast(data_ptr + off)); + off += split_size * strides[dim] * element_sz; + } + return split_base_addrs; +} + +static inline std::vector get_dst_addrs(at::TensorList out) { + std::vector addrs; + addrs.reserve(out.size()); + for (const auto& tensor : out) { + addrs.push_back(reinterpret_cast(tensor.data_ptr())); + } + return addrs; +} + +// Calculate the chunk size for each split in bytes. +static inline std::vector get_split_chunk_sizes( + const at::Tensor& tensor, + at::IntArrayRef split_sizes, + int64_t dim) { + const auto stride = tensor.stride(dim); + const auto element_sz = tensor.element_size(); + std::vector split_chunk_sizes; + split_chunk_sizes.reserve(split_sizes.size()); + for (const auto& split_size : split_sizes) { + split_chunk_sizes.push_back(split_size * stride * element_sz); + } + return split_chunk_sizes; +} + +// Calculate the chunk stride in bytes. This is the same for all splits. +static inline int64_t get_chunk_stride(const at::Tensor& tensor, int64_t dim) { + int64_t stride = 1; + for (int64_t d = dim; d < tensor.dim(); ++d) { + stride *= tensor.sizes()[d]; + } + return stride * tensor.element_size(); +} + +// Calculate the number of chunks. This is the same for all splits. +static inline int64_t get_num_chunks(const at::Tensor& tensor, int64_t dim) { + int64_t num_chunks = tensor.numel(); + for (int64_t d = dim; d < tensor.dim(); ++d) { + num_chunks /= tensor.sizes()[d]; + } + return num_chunks; +} + +// Pack multiple std::vector into a single zoom tensor. 
+std::pair> pack_vecs( + std::vector*> vecs, + const at::Device& device) { + int64_t numel = 0; + for (const auto* vec : vecs) { + numel += vec->size(); + } + + auto packed = at::empty( + {numel}, at::TensorOptions().dtype(at::kLong).pinned_memory(true)); + size_t offset = 0; + for (const auto* vec : vecs) { + memcpy( + packed.data_ptr() + offset, + vec->data(), + sizeof(int64_t) * vec->size()); + offset += vec->size(); + } + packed = packed.to(device, /*non_blocking=*/true); + + std::vector ptrs; + ptrs.reserve(vecs.size()); + offset = 0; + for (const auto* vec : vecs) { + ptrs.push_back(packed.data_ptr() + offset); + offset += vec->size(); + } + return std::make_pair(std::move(packed), std::move(ptrs)); +} + +static inline std::vector get_chunk_cat_out_sizes( + IntArrayRef input_tensor_sizes, + int64_t dim, + int64_t num_chunks, + int64_t chunk_size, + int64_t out_element_size) { + std::vector view_sizes = std::vector( + input_tensor_sizes.begin(), input_tensor_sizes.begin() + dim); + view_sizes.insert( + view_sizes.end(), {num_chunks, chunk_size / out_element_size}); + return view_sizes; +} + +// Copy `max_chunk_size` bytes from `src` to `dst` by `num_threads`, and pad +// zero when `src` size (i.e., actual_chunk_size) is less than `max_chunk_size`. +// Assume elements of src and dst have the same data type. +template +__device__ __inline__ void copy_chunk_with_pad( + dst_t* dst_ptr, + src_t* src_ptr, + int64_t max_chunk_size, + int64_t actual_chunk_size, + int64_t thread_idx, + int64_t num_threads) { + // Supports type cast + if (!std::is_same_v) { + const int64_t max_num_elems = max_chunk_size / sizeof(dst_t); + const int64_t actual_num_elems = actual_chunk_size / sizeof(src_t); + int64_t elem_index = thread_idx; + while (elem_index < actual_num_elems) { + dst_ptr[elem_index] = + static_cast_with_inter_type::apply(src_ptr[elem_index]); + elem_index += num_threads; + } + while (elem_index < max_num_elems) { + dst_ptr[elem_index] = static_cast_with_inter_type::apply(0); + elem_index += num_threads; + } + return; + } + char* dst = reinterpret_cast(dst_ptr); + char* src = reinterpret_cast(src_ptr); + // Fast path when the number of threads is larger than the number of bytes to + // be copied (i.e., max_chunk_size). In this case, each thread only copies 1 + // byte. For 0 <= thread_idx < actual_chunk_size, the thread copies data from + // `src`. For actual_chunk_size <= thread_idx < max_chunk_size, the thread set + // the val=0 for padding. + if (max_chunk_size < num_threads) { + char val = static_cast(0); + if (thread_idx < actual_chunk_size) { + val = src[thread_idx]; + } + if (thread_idx < max_chunk_size) { + dst[thread_idx] = val; + } + return; + } + // Split dst array into three parts: + // [dst, dst+align_off), [dst+align_off, dst+align_end), [dst+align_end, + // dst+max_chunk_size) The second part is aligned with BYTES_PER_THREAD(=16 + // bytes) to enable `stream_store128`. + int64_t align_off, aligned_size; + get_aligned_region( + dst, actual_chunk_size, BYTES_PER_THREAD, align_off, aligned_size); + int64_t align_end = align_off + aligned_size; + for (int64_t i = align_off + thread_idx * BYTES_PER_THREAD; i < align_end; + i += num_threads * BYTES_PER_THREAD) { + uint4 val; + if (is_aligned(src + i)) { + stream_load128(val, src + i); + } else { + for (size_t j = 0; j < BYTES_PER_THREAD; ++j) { + reinterpret_cast(&val)[j] = src[i + j]; + } + } + stream_store128(&dst[i], val); + } + // Copy data for the first part of dst array [dst, dst+align_off). 
+ // Check `thread_idx +static __global__ void chunk_cat_zoom_kernel( + src_t** src, + dst_t* dst, + int64_t* block_idx_to_tensor_idx, + int64_t* tensor_idx_to_start_tensor_bytes, + int64_t* start_block_idx_per_tensor_chunk, + int64_t* actual_tensor_sizes, + int64_t* pad_tensor_chunk_sizes, + int64_t* num_blocks_per_tensor_chunk, + int64_t slice_size, + int64_t chunk_size, + int64_t dst_to_src_ratio) { + const int64_t slice_idx = blockIdx.z; + const int64_t chunk_idx = blockIdx.y; + const int64_t tensor_idx = block_idx_to_tensor_idx[blockIdx.x]; + const int64_t tile_idx = + blockIdx.x - start_block_idx_per_tensor_chunk[tensor_idx]; + // Number of threads for the `tensor_idx`-th tensor chunk. + const int64_t num_threads = + num_blocks_per_tensor_chunk[tensor_idx] * BLOCK_SIZE; + const int64_t thread_idx = tile_idx * BLOCK_SIZE + threadIdx.x; + char* src_addr = reinterpret_cast(src)[tensor_idx] + + slice_idx * actual_tensor_sizes[tensor_idx] + + chunk_idx * pad_tensor_chunk_sizes[tensor_idx] / dst_to_src_ratio; + char* dst_addr = reinterpret_cast(dst) + slice_idx * slice_size + + chunk_idx * chunk_size + tensor_idx_to_start_tensor_bytes[tensor_idx]; + // Compute the actual number of bytes to copy from src. + const int64_t actual_copy_size = ::min( + pad_tensor_chunk_sizes[tensor_idx] / dst_to_src_ratio, + ::max( + (int64_t)0, + actual_tensor_sizes[tensor_idx] - + chunk_idx * pad_tensor_chunk_sizes[tensor_idx] / + dst_to_src_ratio)); + copy_chunk_with_pad( + reinterpret_cast(dst_addr), + reinterpret_cast(src_addr), + pad_tensor_chunk_sizes[tensor_idx], + actual_copy_size, + thread_idx, + num_threads); +} + +bool all_contiguous(TensorList tensors) { + bool contiguous = true; + for (const auto& t : tensors) { + contiguous &= t.is_non_overlapping_and_dense(); + } + return contiguous; +} + +// Get leading dimensions before `dim`-th dimension. +static inline int64_t get_leading_dim(at::IntArrayRef sizes, int64_t dim) { + int64_t leading_dim = 1; + if (dim > 0) { + leading_dim = c10::multiply_integers(sizes.slice(0, dim)); + } + return leading_dim; +} + +// Get trailing dimensions after `dim`-th dimension and padded size along +// `dim`-th dimension. +static inline std::pair get_pad_size( + at::IntArrayRef sizes, + int64_t dim, + int64_t num_chunks) { + int64_t trailing_numel = 1; + if (sizes.size() > (uint64_t)dim + 1) { + trailing_numel = + c10::multiply_integers(sizes.slice(dim + 1, sizes.size() - dim - 1)); + } + int64_t pad_size_along_dim = + detail::div_up(sizes[dim], num_chunks) * num_chunks; + return std::make_pair(pad_size_along_dim, trailing_numel); +} + +// Get the padded chunk size. +static inline int64_t get_chunk_size( + TensorList tensors, + int64_t dim, + int64_t num_chunks, + int64_t elem_size) { + auto num_tensors = tensors.size(); + int64_t chunk_size = 0; + for (const auto i : c10::irange(num_tensors)) { + auto [pad_size_along_dim, trailing_numel] = + get_pad_size(tensors[i].sizes(), dim, num_chunks); + const int64_t pad_tensor_chunk_size = + pad_size_along_dim * trailing_numel * elem_size / num_chunks; + chunk_size += pad_tensor_chunk_size; + } + return chunk_size; +} + +// Get metadata for chunk_cat. 
+std::tuple< + int64_t, + int64_t, + int64_t, + int64_t, + std::vector, + std::vector, + std::vector, + std::vector, + std::vector, + std::vector, + std::vector> +get_chunk_cat_metadata( + TensorList tensors, + int64_t dim, + int64_t num_chunks, + int64_t dst_elem_size, + int64_t src_elem_size) { + TORCH_CHECK( + dst_elem_size % src_elem_size == 0, + "get_chunk_cat_metadata error: only support dst_elem_size % src_elem_size == 0"); + auto num_tensors = tensors.size(); + int64_t leading_dim = get_leading_dim(tensors[0].sizes(), dim); + std::vector pad_tensor_chunk_sizes; + std::vector num_blocks_per_tensor_chunk; + std::vector start_block_idx_per_tensor_chunk{0}; + std::vector actual_tensor_sizes; + std::vector tensor_idx_to_start_tensor_bytes{0}; + std::vector srcs; + pad_tensor_chunk_sizes.reserve(num_tensors); + num_blocks_per_tensor_chunk.reserve(num_tensors); + start_block_idx_per_tensor_chunk.reserve(num_tensors + 1); + actual_tensor_sizes.reserve(num_tensors); + tensor_idx_to_start_tensor_bytes.reserve(num_tensors + 1); + srcs.reserve(num_tensors); + // block_idx_to_tensor_idx cannot be reserved since the number of blocks is + // data dependent + std::vector block_idx_to_tensor_idx; + // Inline computing `chunk_size` to avoid redundant computation + int64_t chunk_size = 0; + for (const auto i : c10::irange(num_tensors)) { + at::Tensor tensor = tensors[i]; + srcs.push_back(reinterpret_cast(tensor.data_ptr())); + auto sizes = tensor.sizes(); + auto [pad_size_along_dim, trailing_numel] = + get_pad_size(sizes, dim, num_chunks); + const int64_t pad_tensor_chunk_size = + pad_size_along_dim * trailing_numel * dst_elem_size / num_chunks; + pad_tensor_chunk_sizes.push_back(pad_tensor_chunk_size); + chunk_size += pad_tensor_chunk_size; + // Number of blocks required to process this tensor chunk. + const int64_t num_blocks = + detail::div_up(pad_tensor_chunk_size, detail::BYTES_PER_BLOCK); + num_blocks_per_tensor_chunk.push_back(num_blocks); + start_block_idx_per_tensor_chunk.push_back( + start_block_idx_per_tensor_chunk.back() + num_blocks); + block_idx_to_tensor_idx.insert( + block_idx_to_tensor_idx.end(), num_blocks, i); + tensor_idx_to_start_tensor_bytes.push_back( + tensor_idx_to_start_tensor_bytes.back() + pad_tensor_chunk_size); + actual_tensor_sizes.push_back(sizes[dim] * trailing_numel * src_elem_size); + } + const int64_t num_blocks_per_chunk = start_block_idx_per_tensor_chunk.back(); + const int64_t slice_size = num_chunks * chunk_size; + return std::make_tuple( + chunk_size, + leading_dim, + num_blocks_per_chunk, + slice_size, + srcs, + block_idx_to_tensor_idx, + tensor_idx_to_start_tensor_bytes, + start_block_idx_per_tensor_chunk, + actual_tensor_sizes, + pad_tensor_chunk_sizes, + num_blocks_per_tensor_chunk); +} + +// See [CUDA kernel for chunk_cat_cuda] +template +void _chunk_cat_out_zoom_contiguous( + TensorList tensors, + int64_t dim, + int64_t num_chunks, + Tensor& out, + int64_t dst_elem_size, + int64_t src_elem_size) { + const auto device = tensors[0].device(); + // `get_chunk_cat_metadata` must return vectors and `pack_vecs` cannot be + // moved into `get_chunk_cat_metadata`. Otherwise `packed` would point to + // vectors allocated inside `get_chunk_cat_metadata` which become out of local + // scope. 
+ auto + [chunk_size, + leading_dim, + num_blocks_per_chunk, + slice_size, + srcs, + block_idx_to_tensor_idx, + tensor_idx_to_start_tensor_bytes, + start_block_idx_per_tensor_chunk, + actual_tensor_sizes, + pad_tensor_chunk_sizes, + num_blocks_per_tensor_chunk] = + get_chunk_cat_metadata( + tensors, dim, num_chunks, dst_elem_size, src_elem_size); + auto packed = pack_vecs( + {&srcs, + &block_idx_to_tensor_idx, + &tensor_idx_to_start_tensor_bytes, + &start_block_idx_per_tensor_chunk, + &actual_tensor_sizes, + &pad_tensor_chunk_sizes, + &num_blocks_per_tensor_chunk}, + device); + std::vector view_sizes = get_chunk_cat_out_sizes( + tensors[0].sizes(), dim, num_chunks, chunk_size, dst_elem_size); + at::native::resize_output(out, view_sizes); + dim3 blocks(num_blocks_per_chunk, num_chunks, leading_dim); + dim3 threads(detail::BLOCK_SIZE, 1, 1); + hipLaunchKernelGGL(( detail::chunk_cat_zoom_kernel), + dim3(blocks), + dim3(threads), + 0, + c10::zoom::getCurrentZoomStream(), + /*srcs=*/reinterpret_cast(packed.second[0]), + reinterpret_cast(out.data_ptr()), + /*block_idx_to_tensor_idx=*/packed.second[1], + /*tensor_idx_to_start_tensor_bytes=*/packed.second[2], + /*start_block_idx_per_tensor_chunk=*/packed.second[3], + /*actual_tensor_sizes=*/packed.second[4], + /*pad_tensor_chunk_sizes=*/packed.second[5], + /*num_blocks_per_tensor_chunk=*/packed.second[6], + slice_size, + chunk_size, + dst_elem_size / src_elem_size); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); +} + +} // namespace detail + +// See [CUDA fast path for split_with_sizes_copy.out] +void split_with_sizes_copy_out_zoom_contiguous_no_cast( + const at::Tensor& self, + at::IntArrayRef split_sizes, + int64_t dim, + at::TensorList out) { + const auto device = self.device(); + const auto src_base_addrs = + detail::get_split_base_addrs(self, split_sizes, dim); + const auto dst_base_addrs = detail::get_dst_addrs(out); + const auto src_stride = detail::get_chunk_stride(self, dim); + const auto split_chunk_sizes = + detail::get_split_chunk_sizes(self, split_sizes, dim); + const auto num_chunks = detail::get_num_chunks(self, dim); + + // Calculate the number of blocks required for the first chunk across all + // splits, assuming each thread only processes BYTES_PER_THREAD bytes. + int64_t num_blocks = 0; + for (const auto& split_chunk_size : split_chunk_sizes) { + num_blocks += detail::div_up( + split_chunk_size, detail::BLOCK_SIZE * detail::BYTES_PER_THREAD); + } + + // Calculate the maximum number of blocks to launch. Only consider + // maxThreadsPerMultiProcessor as a limiting factor as the kernel uses no + // shared memory and little registers. Over-subscribe the SMs to hide I/O + // latency. + const auto num_sms = + at::zoom::getCurrentDeviceProperties()->multiProcessorCount; + const auto max_threads_per_sm = + at::zoom::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor; + const int64_t max_blocks = + num_sms * max_threads_per_sm / detail::BLOCK_SIZE * 2.0; + + // Make each thread process BYTES_PER_THREAD * iter_factor bytes to regulate + // block size. Spread iter_factor evenly between chunks_per_block and + // iters_per_chunk. 
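// Worked example (editorial, hypothetical numbers): with num_blocks = 4096,
// num_chunks = 8 and max_blocks = 1024, iter_factor = div_up(4096 * 8, 1024)
// = 32, chunks_per_block = ceil(sqrt(32)) = 6 (already <= num_chunks), and
// iters_per_chunk = div_up(32, 6) = 6, so each launched block covers 6 chunks
// with up to 6 iterations per chunk, bringing the launched grid down to
// roughly max_blocks while still covering all of the work.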
+ int64_t iter_factor = detail::div_up(num_blocks * num_chunks, max_blocks); + int64_t chunks_per_block = ::ceil(std::sqrt(iter_factor)); + chunks_per_block = ::min(chunks_per_block, num_chunks); + const int64_t iters_per_chunk = detail::div_up(iter_factor, chunks_per_block); + + // Launch a logically jagged grid of shape + // (chunk_size*, num_splits, num_chunks / chunks_per_block) + // backed by a physical grid of shape + // (sum(chunk_size), num_chunks / chunks_per_block). + // A block can find its split_idx via block_idx_to_split_idx. + std::vector block_idx_to_split_idx; + std::vector blocks_cumsums{0}; + block_idx_to_split_idx.reserve(num_blocks); + for (size_t split_idx = 0; split_idx < split_sizes.size(); ++split_idx) { + const auto blocks = detail::div_up( + split_chunk_sizes[split_idx], + detail::BLOCK_SIZE * detail::BYTES_PER_THREAD * iters_per_chunk); + block_idx_to_split_idx.insert( + block_idx_to_split_idx.end(), blocks, split_idx); + blocks_cumsums.push_back(blocks_cumsums.back() + blocks); + } + + dim3 blocks(blocks_cumsums.back(), num_chunks / chunks_per_block, 1); + dim3 threads(detail::BLOCK_SIZE, 1, 1); + + auto [_, ptrs] = detail::pack_vecs( + {&dst_base_addrs, + &src_base_addrs, + &split_chunk_sizes, + &block_idx_to_split_idx, + &blocks_cumsums}, + device); + + hipLaunchKernelGGL(( detail::split_with_sizes_copy_out_contiguous_no_cast_kernel), + dim3(blocks), + dim3(threads), + 0, + c10::zoom::getCurrentZoomStream(), + /*dst_base_addrs=*/reinterpret_cast(ptrs[0]), + /*src_base_addrs=*/reinterpret_cast(ptrs[1]), + /*split_chunk_sizes=*/ptrs[2], + /*block_idx_to_split_idx=*/ptrs[3], + /*blocks_cumsums=*/ptrs[4], + src_stride, + num_chunks); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); +} + +void split_with_sizes_copy_out_zoom( + const Tensor& self, + IntArrayRef split_sizes, + int64_t dim, + TensorList out) { + const bool is_capturing = c10::zoom::currentStreamCaptureStatusMayInitCtx() != + c10::zoom::CaptureStatus::None; + bool contiguous_no_cast = self.is_non_overlapping_and_dense(); + for (const auto& t : out) { + contiguous_no_cast &= t.is_non_overlapping_and_dense(); + contiguous_no_cast &= (t.dtype() == self.dtype()); + } + // TODO(yifu): make the fast path work for CUDA graph + if (!is_capturing && contiguous_no_cast) { + // Perform equivalent checks performed by the composite impl + if (dim < 0) { + dim = at::maybe_wrap_dim(dim, self.dim()); + } + TORCH_CHECK( + self.dim() != 0, "split expects at least a 1-dimensional tensor") + + const int64_t dim_size = self.size(dim); + int64_t split_sizes_sum = 0; + for (const auto i : c10::irange(split_sizes.size())) { + TORCH_CHECK( + split_sizes[i] >= 0, + "split_with_sizes expects split_sizes have only non-negative ", + "entries, but got split_sizes=", + split_sizes[i]); + split_sizes_sum += split_sizes[i]; + } + TORCH_CHECK( + split_sizes_sum == dim_size, + "split_with_sizes expects split_sizes to sum exactly to ", + dim_size, + " (input tensor's size at dimension ", + dim, + "), ", + "but got split_sizes=", + split_sizes); + + TORCH_CHECK( + out.size() == split_sizes.size(), + "split_with_sizes_copy_out() expected an out= argument of size ", + split_sizes.size(), + ", got size ", + out.size()); + + auto out_shape = self.sizes().vec(); + for (const auto i : c10::irange(split_sizes.size())) { + out_shape[dim] = split_sizes[i]; + if (resize_output_check(out[i], out_shape)) { + out[i].resize_(out_shape); + } + TORCH_CHECK( + out[i].dtype() == self.dtype(), + "Expected out tensor to have dtype ", + self.dtype(), + ", but got ", + 
out[i].dtype(), + " instead"); + TORCH_CHECK( + out[i].device() == self.device(), + "Expected out tensor to have device ", + self.device(), + ", but got ", + out[i].device(), + " instead"); + } + split_with_sizes_copy_out_zoom_contiguous_no_cast( + self, split_sizes, dim, out); + } else { + at::native::split_with_sizes_copy_out(self, split_sizes, dim, out); + } +} + +Tensor _chunk_cat_zoom(TensorList tensors, int64_t dim, int64_t num_chunks) { + dim = at::native::preprocess_chunk_cat_inputs(tensors, dim, num_chunks); + if (detail::all_contiguous(tensors)) { + // Return a tensor with the same dtype as input tensors + int64_t elem_size = tensors[0].element_size(); + int64_t chunk_size = + detail::get_chunk_size(tensors, dim, num_chunks, elem_size); + int64_t leading_dim = detail::get_leading_dim(tensors[0].sizes(), dim); + auto view_sizes = detail::get_chunk_cat_out_sizes( + tensors[0].sizes(), dim, num_chunks, chunk_size, elem_size); + Tensor out = + tensors[0] + .new_empty(chunk_size * num_chunks * leading_dim / elem_size) + .view(view_sizes); + // Type-agnostic copy since out and input tensors have the same type. + detail::_chunk_cat_out_zoom_contiguous( + tensors, dim, num_chunks, out, elem_size, elem_size); + return out; + } else { + return at::native::_chunk_cat(tensors, dim, num_chunks); + } +} + +Tensor& _chunk_cat_out_zoom( + TensorList tensors, + int64_t dim, + int64_t num_chunks, + Tensor& out) { + dim = at::native::preprocess_chunk_cat_inputs(tensors, dim, num_chunks); + TORCH_CHECK( + tensors[0].device() == out.device(), + "_chunk_cat_out_zoom: mismatch between input and out tensor devices"); + bool both_input_output_contiguous = + detail::all_contiguous(tensors) && out.is_non_overlapping_and_dense(); + if (both_input_output_contiguous && + (tensors[0].dtype() == at::ScalarType::BFloat16) && + (out.dtype() == at::ScalarType::Float)) { + // _chunk_cat_out_zoom_contiguous should also support other types, thanks to + // static_cast_with_inter_type. Here, we dispatch to BFloat16 in and float32 + // out since it is the only known use case. + detail::_chunk_cat_out_zoom_contiguous( + tensors, + dim, + num_chunks, + out, + out.element_size(), + tensors[0].element_size()); + } else if ( + both_input_output_contiguous && tensors[0].dtype() == out.dtype()) { + // Type-agnostic copy since out and input tensors have the same type. 
+ detail::_chunk_cat_out_zoom_contiguous( + tensors, + dim, + num_chunks, + out, + out.element_size(), + tensors[0].element_size()); + } else { + at::native::_chunk_cat_out(tensors, dim, num_chunks, out); + } + return out; +} + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/TensorShapeZoom.cpp b/aten/src/ATen/native/zoom/TensorShapeZoom.cpp new file mode 100644 index 00000000000000..b74ac6a36d482a --- /dev/null +++ b/aten/src/ATen/native/zoom/TensorShapeZoom.cpp @@ -0,0 +1,37 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +namespace at::native { + +Tensor& set_zoom_(Tensor& result) { + caffe2::TypeMeta dtype = result.dtype(); + Storage storage( + Storage::use_byte_size_t(), + 0, + at::zoom::getZoomDeviceAllocator(), + true); + result.set_(storage, 0, {0}, {}); + TORCH_INTERNAL_ASSERT(dtype == result.dtype()); + return result; +} + +Tensor& set_storage_zoom_(Tensor& result, Storage storage, int64_t storage_offset, IntArrayRef size, IntArrayRef stride) { + checkSetStorage(result, storage, storage_offset, size, stride); + + result.unsafeGetTensorImpl()->set_storage_offset(storage_offset); + at::OptionalIntArrayRef stride_opt = stride.data() != nullptr ? + at::OptionalIntArrayRef(stride) : c10::nullopt; + at::native::resize_impl_zoom_(result.unsafeGetTensorImpl(), size, stride_opt); + return result; +} + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/TensorTransformations.cu b/aten/src/ATen/native/zoom/TensorTransformations.cu new file mode 100644 index 00000000000000..fd84d2cb79a1bc --- /dev/null +++ b/aten/src/ATen/native/zoom/TensorTransformations.cu @@ -0,0 +1,154 @@ +// !!! This is a file automatically generated by hipify!!! 
+#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + +#include +#include + +namespace at::native { + +template +C10_LAUNCH_BOUNDS_2(zoom::getApplyBlockSize(), zoom::getApplyBlocksPerSM()) +__global__ void kernel_pointwise_flip_apply2( + const zoom::detail::TensorInfo in_tensor_info, + zoom::detail::TensorInfo out_tensor_info, + IndexType N, + int flip_dim, + IndexType total_dims) { + for (IndexType linear_index = blockIdx.x * blockDim.x + threadIdx.x; linear_index < N; linear_index += gridDim.x * blockDim.x) { + IndexType dst_offset = 0; + if (flip_dim == 0) { + // flip 1st dim + dst_offset = (in_tensor_info.sizes[0] - 1 - linear_index / in_tensor_info.strides[0]) * in_tensor_info.strides[0] + linear_index % in_tensor_info.strides[0]; + } + else { + // flip last dim + IndexType i = total_dims - 1; + dst_offset = linear_index / in_tensor_info.strides[0] * in_tensor_info.strides[0] + (in_tensor_info.sizes[i] - 1 - linear_index % in_tensor_info.strides[0]); + } + out_tensor_info.data[dst_offset] = in_tensor_info.data[linear_index]; + } +} + +template +C10_LAUNCH_BOUNDS_1(zoom::getApplyBlockSize()) +__global__ void flip_zoom_kernel( + scalar_t* in_tensor, + scalar_t* out_tensor, + int64_t N, + int64_t* flip_dims, + int64_t flip_dims_size, + int64_t* strides, + int64_t* strides_contiguous, + int64_t* shape, + int64_t total_dims) { + int64_t linear_index = blockIdx.x * blockDim.x + threadIdx.x; + if (linear_index >= N) { + return; + } + + int64_t cur_indices = linear_index, rem = 0, dst_offset = 0; + for (int64_t i = 0; i < total_dims; i++) { + int64_t temp = cur_indices; + cur_indices = cur_indices / strides_contiguous[i]; + rem = temp - cur_indices * strides_contiguous[i]; + // flip the indices if it is in flip_dims + for (int64_t j = 0; j < flip_dims_size; j++) { + if (i == flip_dims[j]) { + cur_indices = shape[i] - 1 - cur_indices; + } + } + dst_offset += cur_indices * strides[i]; + cur_indices = rem; + } + out_tensor[linear_index] = in_tensor[dst_offset]; +} + +template +C10_LAUNCH_BOUNDS_1(zoom::getApplyBlockSize()) +__global__ void roll_zoom_kernel( + const scalar_t* in_tensor, + scalar_t* out_tensor, + int64_t N, + int64_t roll_dim, + int64_t start, + int64_t size, + int64_t stride, + int64_t total_dims) { + int64_t linear_index = blockIdx.x * blockDim.x + threadIdx.x; + if (linear_index >= N) { + return; + } + // roll dim idx is the index of linear_index along the rolling dimension. + int64_t roll_dim_idx = linear_index % (stride * size) / stride; + // index into the source data to find appropriate value. 
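// Worked example (editorial, hypothetical numbers): rolling a dimension of
// size = 5 by shifts[0] = 2 with stride = 3 gives start = (5 - 2) % 5 = 3.
// An output element with roll_dim_idx = 4 (>= size - start = 2) reads from
// source_idx = linear_index - 2 * 3, i.e. original position 2, while
// roll_dim_idx = 1 reads from source_idx = linear_index + 3 * 3, i.e. the
// wrapped-around original position 4. Both match out[p] = in[(p - 2) mod 5].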
+ int64_t source_idx = 0; + if( roll_dim_idx >= (size - start) ) { + source_idx = linear_index - ((size - start) * stride); + } else { + source_idx = linear_index + (start * stride); + } + out_tensor[linear_index] = in_tensor[source_idx]; +} + +// Roll a tensor along a dimension +Tensor roll_zoom(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) { + if (dims.size() != 1 || shifts.size() != 1) { + return roll_common(self, shifts, dims); + } + + auto in_tensor = self; + if(!self.is_contiguous()) { + in_tensor = self.contiguous(); + } + auto out_tensor = at::empty_like(in_tensor, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + if (out_tensor.numel() == 0) { + return out_tensor; + } + const int64_t N = in_tensor.numel(); + const int64_t dim = dims[0]; + const int64_t size = in_tensor.size(dim); + int64_t start = (size - shifts[0]) % size; + // Behavior of % is different in C++ vs Python for negative numbers. This + // corrects the difference. + if( start < 0 ) start = start + size; + + dim3 dim_block = zoom::getApplyBlock(); + dim3 dim_grid; + TORCH_CHECK(zoom::getApplyGrid(N, dim_grid, in_tensor.get_device()), "unable to get dim grid"); + + auto total_dims = in_tensor.dim(); + + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, + at::ScalarType::ComplexHalf, + in_tensor.scalar_type(), "roll_zoom", + [&] { + hipLaunchKernelGGL(( roll_zoom_kernel), dim3(dim_grid), dim3(dim_block), 0, c10::zoom::getCurrentZoomStream(), + in_tensor.const_data_ptr(), out_tensor.mutable_data_ptr(), N, + dim, start, + size, + in_tensor.stride(dim), + total_dims); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + }); + + return out_tensor; +} + +} // namespace at::native diff --git a/aten/src/ATen/zoom/ATenZoomGeneral.h b/aten/src/ATen/zoom/ATenZoomGeneral.h new file mode 100644 index 00000000000000..018bfd860bbaa5 --- /dev/null +++ b/aten/src/ATen/zoom/ATenZoomGeneral.h @@ -0,0 +1,8 @@ +#pragma once + +#include +#include + +#include + +// Use TORCH_ZOOM_API or TORCH_CUDA_CU_API for exports from this folder \ No newline at end of file diff --git a/aten/src/ATen/zoom/ApplyGridUtils.cuh b/aten/src/ATen/zoom/ApplyGridUtils.cuh new file mode 100644 index 00000000000000..0ba58874c8285b --- /dev/null +++ b/aten/src/ATen/zoom/ApplyGridUtils.cuh @@ -0,0 +1,47 @@ +#include + +#include + +namespace at::zoom { + +/** + Computes ceil(a / b) +*/ +template +__host__ __device__ __forceinline__ T ATenCeilDiv(T a, T b) { + return (a + b - 1) / b; +} + +namespace { + +// Threads per block for our apply kernel +// FIXME: use occupancy calculator instead +constexpr uint32_t AT_APPLY_THREADS_PER_BLOCK = 512; +constexpr uint32_t AT_APPLY_BLOCKS_PER_SM = 4; + +template +inline bool getApplyGrid(uint64_t totalElements, dim3& grid, c10::DeviceIndex curDevice, int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK) { + if (curDevice == -1) return false; + uint64_t numel_per_thread = static_cast(max_threads_per_block) * static_cast(step); + uint64_t numBlocks = ATenCeilDiv(totalElements, numel_per_thread); + uint64_t maxGridX = at::zoom::getDeviceProperties(curDevice)->maxGridSize[0]; + if (numBlocks > maxGridX) + numBlocks = maxGridX; + grid = dim3(numBlocks); + return true; +} + +constexpr int getApplyBlocksPerSM() { + return AT_APPLY_BLOCKS_PER_SM; +} + +constexpr int getApplyBlockSize() { + return AT_APPLY_THREADS_PER_BLOCK; +} + +inline dim3 getApplyBlock(int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK) { + return dim3(max_threads_per_block); +} + +} // anonymous namespace +} // namespace 
at::zoom diff --git a/aten/src/ATen/zoom/AsmUtils.cuh b/aten/src/ATen/zoom/AsmUtils.cuh new file mode 100644 index 00000000000000..a7d6987be574b6 --- /dev/null +++ b/aten/src/ATen/zoom/AsmUtils.cuh @@ -0,0 +1,85 @@ +#pragma once +#include + +// Collection of direct PTX functions + +namespace at::zoom { + +template +struct Bitfield {}; + +template <> +struct Bitfield { + static __device__ __host__ __forceinline__ + unsigned int getBitfield(unsigned int val, int pos, int len) { + pos &= 0xff; + len &= 0xff; + + unsigned int m = (1u << len) - 1u; + return (val >> pos) & m; + } + + static __device__ __host__ __forceinline__ + unsigned int setBitfield(unsigned int val, unsigned int toInsert, int pos, int len) { + pos &= 0xff; + len &= 0xff; + + unsigned int m = (1u << len) - 1u; + toInsert &= m; + toInsert <<= pos; + m <<= pos; + + return (val & ~m) | toInsert; + } +}; + +template <> +struct Bitfield { + static __device__ __host__ __forceinline__ + uint64_t getBitfield(uint64_t val, int pos, int len) { + pos &= 0xff; + len &= 0xff; + + uint64_t m = (1u << len) - 1u; + return (val >> pos) & m; + } + + static __device__ __host__ __forceinline__ + uint64_t setBitfield(uint64_t val, uint64_t toInsert, int pos, int len) { + pos &= 0xff; + len &= 0xff; + + uint64_t m = (1u << len) - 1u; + toInsert &= m; + toInsert <<= pos; + m <<= pos; + + return (val & ~m) | toInsert; + } +}; + +__device__ __forceinline__ int getLaneId() { + return __lane_id(); +} + +__device__ __forceinline__ unsigned long long int getLaneMaskLt() { + const std::uint64_t m = (1ull << getLaneId()) - 1ull; + return m; +} + +__device__ __forceinline__ unsigned long long int getLaneMaskLe() { + std::uint64_t m = UINT64_MAX >> (sizeof(std::uint64_t) * CHAR_BIT - (getLaneId() + 1)); + return m; +} + +__device__ __forceinline__ unsigned long long int getLaneMaskGt() { + const std::uint64_t m = getLaneMaskLe(); + return m ? ~m : m; +} + +__device__ __forceinline__ unsigned long long int getLaneMaskGe() { + const std::uint64_t m = getLaneMaskLt(); + return ~m; +} + +} // namespace at::zoom \ No newline at end of file diff --git a/aten/src/ATen/zoom/Atomic.cuh b/aten/src/ATen/zoom/Atomic.cuh new file mode 100644 index 00000000000000..c4e4429cbd0eb9 --- /dev/null +++ b/aten/src/ATen/zoom/Atomic.cuh @@ -0,0 +1,457 @@ +#pragma once + +#include +#include +#include + +#include + +template +struct AtomicFPOp; + +template <> +struct AtomicFPOp { + template + inline __device__ at::Half operator() (at::Half *address, at::Half val, const func_t& func) { + unsigned int * address_as_ui = + (unsigned int *) ((char *)address - ((size_t)address & 2)); + unsigned int old = *address_as_ui; + unsigned int assumed; + + at::Half hsum; + do { + assumed = old; + hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); + hsum = func(hsum, val); + old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; + old = atomicCAS(address_as_ui, assumed, old); + } while (assumed != old); + hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); + return hsum; + } +}; + +template <> +struct AtomicFPOp { + template + inline __device__ at::BFloat16 operator() (at::BFloat16 *address, at::BFloat16 val, const func_t& func) { + unsigned int * address_as_ui = + (unsigned int *) ((char *)address - ((size_t)address & 2)); + unsigned int old = *address_as_ui; + unsigned int assumed; + + at::BFloat16 bsum; + do { + assumed = old; + bsum.x = (size_t)address & 2 ? 
(old >> 16) : (old & 0xffff); + bsum = func(bsum, val); + old = (size_t)address & 2 ? (old & 0xffff) | (bsum.x << 16) : (old & 0xffff0000) | bsum.x; + old = atomicCAS(address_as_ui, assumed, old); + } while (assumed != old); + bsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); + return bsum.x; + } +}; + +template <> +struct AtomicFPOp { + template + inline __device__ double operator() (double * address, double val, const func_t& func) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull; + unsigned long long int assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, func(val, assumed)); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __longlong_as_double(old); + } +}; + +#define ATOMIC_INTEGER_IMPL(NAME) \ +template \ +struct Atomic##NAME##IntegerImpl; \ + \ +template \ +struct Atomic##NAME##IntegerImpl { \ + template \ + inline __device__ void operator()(T *address, T val, const func_t& func) { \ + size_t offset = (size_t)address & 3; \ + uint32_t * address_as_ui = (uint32_t *)((char *)address - offset); \ + uint32_t old = *address_as_ui; \ + uint32_t shift = offset * 8; \ + uint32_t old_byte; \ + uint32_t newval; \ + uint32_t assumed; \ + \ + do { \ + assumed = old; \ + old_byte = (old >> shift) & 0xff; \ + newval = static_cast(func(val, static_cast(old_byte))); \ + newval = (old & ~(0x000000ff << shift)) | (newval << shift); \ + old = atomicCAS(address_as_ui, assumed, newval); \ + } while (assumed != old); \ + } \ +}; \ + \ +template \ +struct Atomic##NAME##IntegerImpl { \ + template \ + inline __device__ void operator()(T *address, T val, const func_t& func) { \ + size_t offset = (size_t)address & 2; \ + uint32_t * address_as_ui = (uint32_t *)((char *)address - offset); \ + bool is_32_align = offset; \ + uint32_t old = *address_as_ui; \ + uint32_t old_bytes; \ + uint32_t newval; \ + uint32_t assumed; \ + \ + do { \ + assumed = old; \ + old_bytes = is_32_align ? old >> 16 : old & 0xffff; \ + newval = static_cast(func(val, static_cast(old_bytes))); \ + newval = is_32_align ? 
(old & 0xffff) | (newval << 16) : (old & 0xffff0000) | newval; \ + old = atomicCAS(address_as_ui, assumed, newval); \ + } while (assumed != old); \ + } \ +}; \ + \ +template \ +struct Atomic##NAME##IntegerImpl { \ + template \ + inline __device__ void operator()(T *address, T val, const func_t& func) { \ + uint32_t * address_as_ui = (uint32_t *) (address); \ + uint32_t old = *address_as_ui; \ + uint32_t newval; \ + uint32_t assumed; \ + \ + do { \ + assumed = old; \ + newval = static_cast(func(val, static_cast(old))); \ + old = atomicCAS(address_as_ui, assumed, newval); \ + } while (assumed != old); \ + } \ +}; \ + \ +template \ +struct Atomic##NAME##IntegerImpl { \ + template \ + inline __device__ void operator()(T *address, T val, const func_t& func) { \ + unsigned long long * address_as_ui = (unsigned long long *) (address); \ + unsigned long long old = *address_as_ui; \ + unsigned long long newval; \ + unsigned long long assumed; \ + \ + do { \ + assumed = old; \ + newval = static_cast(func(val, static_cast(old))); \ + old = atomicCAS(address_as_ui, assumed, newval); \ + } while (assumed != old); \ + } \ +}; + + +# define GPU_ATOMIC_INTEGER(NAME, OP, DTYPE) \ +static inline __device__ void gpuAtomic##NAME(DTYPE *address, DTYPE val) { \ +Atomic##NAME##IntegerImpl()(address, \ + val, \ + [](DTYPE a, DTYPE b) { \ + return OP; \ + }); \ +} \ + +ATOMIC_INTEGER_IMPL(Add) +GPU_ATOMIC_INTEGER(Add, a || b, bool) + +// Don't instantiate gpuAtomicAdd with the macro as it seems non-standard (see int32, int64) +static inline __device__ void gpuAtomicAdd(uint8_t *address, uint8_t val) { + AtomicAddIntegerImpl()(address, + val, + [](uint8_t a, uint8_t b) { + return a + b; + }); +} + +static inline __device__ void gpuAtomicAdd(int8_t *address, int8_t val) { + AtomicAddIntegerImpl()(address, + val, + [](int8_t a, int8_t b) { + return a + b; + }); +} + +static inline __device__ void gpuAtomicAdd(int16_t *address, int16_t val) { + AtomicAddIntegerImpl()(address, + val, + [](int16_t a, int16_t b) { + return a + b; + }); +} + +static inline __device__ int32_t gpuAtomicAdd(int32_t *address, int32_t val) { + return atomicAdd(address, val); +} + +static inline __device__ void gpuAtomicAdd(int64_t *address, int64_t val) { + __atomic_fetch_add(address, val, __ATOMIC_RELAXED); +} + +static inline __device__ at::Half gpuAtomicAdd(at::Half *address, at::Half val) { + return AtomicFPOp()(address, val, + [](at::Half hsum, at::Half val) { + return hsum + val; + }); +} + +static inline __device__ at::BFloat16 gpuAtomicAdd(at::BFloat16 *address, at::BFloat16 val) { +return AtomicFPOp()(address, val, + [](at::BFloat16 bsum, at::BFloat16 val) { + return bsum + val; + }); +} + +/* Note [hip-clang differences to hcc] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * The upcoming hip-clang compiler for ROCm differs from hcc in a few details. + * It exports the __HIP__ macro, we can hence differentiate between hcc and + * hip-clang. In the below, hcc only received support for atomicAdd with double + * typing after work week 18312. hip-clang had support from the first version. + * In general, the code-visible differences between hip-clang and hcc will be + * minimal. 
+ */ + + // // This needs to be defined for the host side pass + // static inline __device__ double atomicAdd(double *address, double val) { } + + +static inline __device__ double gpuAtomicAdd(double *address, double val) { + return atomicAdd(address, val); +} + +static inline __device__ float gpuAtomicAdd(float *address, float val) { + return atomicAdd(address, val); +} + +template +static inline __device__ void gpuAtomicAdd(c10::complex *address, c10::complex val) { + gpuAtomicAdd(&address->real_, val.real_); + gpuAtomicAdd(&address->imag_, val.imag_); +} + +/* Note [gpuAtomicAdd vs atomicAdd] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * Some extensions such as torchvision call atomicAdd() + * directly and require non-library provided data type support. Only for these, we + * continue to provide atomicAdd overloads. + */ +static inline __device__ at::Half atomicAdd(at::Half *address, at::Half val) { + return gpuAtomicAdd(address, val); +} + +static inline __device__ at::BFloat16 atomicAdd(at::BFloat16 *address, at::BFloat16 val) { + return gpuAtomicAdd(address, val); +} + +static inline __device__ void atomicAdd(uint8_t *address, uint8_t val) { + gpuAtomicAdd(address, val); +} + +static inline __device__ void atomicAdd(int8_t *address, int8_t val) { + gpuAtomicAdd(address, val); +} + +static inline __device__ void atomicAdd(int16_t *address, int16_t val) { + gpuAtomicAdd(address, val); +} + +static inline __device__ void atomicAdd(int64_t *address, int64_t val) { + gpuAtomicAdd(address, val); +} + +static inline __device__ void atomicAdd(bool *address, bool val) { + gpuAtomicAdd(address, val); +} + +/* Note [explicitly non-returning atomics] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * AMD's MI100 (gfx908) provides an optimized fp32 atomicAdd, exposed via atomicAddNoRet(). + * Due to compiler limitations, callers must opt-in to guarantee the optimized instruction. + * This non-returning atomicAddNoRet cannot be used to implement the returning atomicAdd, + * therefore we need a new API 'gpuAtomicAddNoReturn'. + */ +template +static inline __device__ void gpuAtomicAddNoReturn(c10::complex *address, c10::complex val) { gpuAtomicAdd(address, val); } +static inline __device__ void gpuAtomicAddNoReturn(uint8_t *address, uint8_t val) { gpuAtomicAdd(address, val); } +static inline __device__ void gpuAtomicAddNoReturn(int8_t *address, int8_t val) { gpuAtomicAdd(address, val); } +static inline __device__ void gpuAtomicAddNoReturn(int16_t *address, int16_t val) { gpuAtomicAdd(address, val); } +static inline __device__ void gpuAtomicAddNoReturn(int32_t *address, int32_t val) { gpuAtomicAdd(address, val); } +static inline __device__ void gpuAtomicAddNoReturn(int64_t *address, int64_t val) { gpuAtomicAdd(address, val); } +static inline __device__ void gpuAtomicAddNoReturn(bool *address, bool val) { gpuAtomicAdd(address, val); } +static inline __device__ void gpuAtomicAddNoReturn(at::Half *address, at::Half val) { gpuAtomicAdd(address, val); } +static inline __device__ void gpuAtomicAddNoReturn(at::BFloat16 *address, at::BFloat16 val) { gpuAtomicAdd(address, val); } +static inline __device__ void gpuAtomicAddNoReturn(double *address, double val) { gpuAtomicAdd(address, val); } + +/* Special case fp32 atomic. */ +static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { atomicAdd(address, val); } + + +// Atomic multiplication implementation. 
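// The Half/BFloat16/double overloads below reuse the read-modify-CAS loop
// from AtomicFPOp above. As an editorial illustration (not part of the
// patch), the same pattern written on the host with std::atomic looks as
// follows; `cas_mul_low16` is a hypothetical name used only for this sketch,
// which multiplies the low 16-bit lane of a packed 32-bit word:

#include <atomic>
#include <cstdint>

inline std::uint16_t cas_mul_low16(std::atomic<std::uint32_t>& word,
                                   std::uint16_t val) {
  std::uint32_t old_word = word.load();
  std::uint32_t new_word = 0;
  do {
    // Extract the low lane, apply the op, splice it back, and leave the high
    // lane untouched -- the device code above does the same, selecting the
    // lane via ((size_t)address & 2).
    const auto old_lane = static_cast<std::uint16_t>(old_word & 0xffffu);
    const auto new_lane = static_cast<std::uint16_t>(old_lane * val);
    new_word = (old_word & 0xffff0000u) | new_lane;
    // compare_exchange_weak refreshes old_word on failure, so the loop
    // retries against the latest value, like re-reading after atomicCAS.
  } while (!word.compare_exchange_weak(old_word, new_word));
  return static_cast<std::uint16_t>(old_word & 0xffffu);
}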
+ +ATOMIC_INTEGER_IMPL(Mul) +GPU_ATOMIC_INTEGER(Mul, a * b, uint8_t) +GPU_ATOMIC_INTEGER(Mul, a * b, int8_t) +GPU_ATOMIC_INTEGER(Mul, a * b, int16_t) +GPU_ATOMIC_INTEGER(Mul, a * b, int32_t) +GPU_ATOMIC_INTEGER(Mul, a * b, int64_t) + +inline __device__ at::Half gpuAtomicMul(at::Half * address, at::Half val) { + return AtomicFPOp()(address, val, + [](at::Half bsum, at::Half val) { + return bsum * val; + }); +} + +inline __device__ at::BFloat16 gpuAtomicMul(at::BFloat16 * address, at::BFloat16 val) { + return AtomicFPOp()(address, val, + [](at::BFloat16 bsum, at::BFloat16 val) { + return bsum * val; + }); +} + +inline __device__ double gpuAtomicMul(double * address, double val) { + return AtomicFPOp()(address, val, + [](double val, unsigned long long int assumed) { + return __double_as_longlong(val * __longlong_as_double(assumed)); + }); +} + +// Dont use a templated function for this since the addition function defaults to the CUDA built-in. +inline __device__ float gpuAtomicMul (float * address, float val) { + unsigned int* address_as_ull = (unsigned int*)address; + unsigned int old = *address_as_ull; + unsigned int assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __float_as_int(val * + __int_as_float(assumed))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __int_as_float(old); +} + +// Atomic maximum implementation. + +template +__host__ __device__ T safe_max(T a, T b) { + // TODO: remove this special case for HIP when issue is fixed: + // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 + T max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max(a, b)); + return max; +} + +ATOMIC_INTEGER_IMPL(Max) +GPU_ATOMIC_INTEGER(Max, safe_max(a, b), uint8_t) +GPU_ATOMIC_INTEGER(Max, safe_max(a, b), int8_t) +GPU_ATOMIC_INTEGER(Max, safe_max(a, b), int16_t) +GPU_ATOMIC_INTEGER(Max, safe_max(a, b), int32_t) +GPU_ATOMIC_INTEGER(Max, safe_max(a, b), int64_t) + +inline __device__ at::Half gpuAtomicMax(at::Half * address, at::Half val) { + return AtomicFPOp()(address, val, + [](at::Half bsum, at::Half val) { + return safe_max(bsum, val); + }); +} + +inline __device__ at::BFloat16 gpuAtomicMax(at::BFloat16 * address, at::BFloat16 val) { + return AtomicFPOp()(address, val, + [](at::BFloat16 bsum, at::BFloat16 val) { + return safe_max(bsum, val); + }); +} + +inline __device__ double gpuAtomicMax(double * address, double val) { + return AtomicFPOp()(address, val, + [](double val, unsigned long long int assumed) { + return __double_as_longlong(safe_max(val, __longlong_as_double(assumed))); + }); +} + +// Dont use a templated function for this since the addition function defaults to the CUDA built-in. +inline __device__ float gpuAtomicMax(float * address, float val) { + unsigned int* address_as_ull = (unsigned int*)address; + unsigned int old = *address_as_ull; + unsigned int assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __float_as_int(safe_max(val, __int_as_float(assumed)))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __int_as_float(old); +} + +// Atomic minimum implementation. + +template +__host__ __device__ T safe_min(T a, T b) { + // TODO: remove this special case for HIP when issue is fixed: + // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 + T min = at::_isnan(a) ? a : (at::_isnan(b) ? 
b : std::min(a, b)); + return min; +} + +ATOMIC_INTEGER_IMPL(Min) +GPU_ATOMIC_INTEGER(Min, safe_min(a, b), uint8_t) +GPU_ATOMIC_INTEGER(Min, safe_min(a, b), int8_t) +GPU_ATOMIC_INTEGER(Min, safe_min(a, b), int16_t) +GPU_ATOMIC_INTEGER(Min, safe_min(a, b), int32_t) +GPU_ATOMIC_INTEGER(Min, safe_min(a, b), int64_t) + +inline __device__ at::Half gpuAtomicMin(at::Half * address, at::Half val) { + return AtomicFPOp()(address, val, + [](at::Half bsum, at::Half val) { + return safe_min(bsum, val); + }); +} + +inline __device__ at::BFloat16 gpuAtomicMin(at::BFloat16 * address, at::BFloat16 val) { + return AtomicFPOp()(address, val, + [](at::BFloat16 bsum, at::BFloat16 val) { + return safe_min(bsum, val); + }); +} + +inline __device__ double gpuAtomicMin(double * address, double val) { + return AtomicFPOp()(address, val, + [](double val, unsigned long long int assumed) { + return __double_as_longlong(safe_min(val, __longlong_as_double(assumed))); + }); +} + +// Dont use a templated function for this since the addition function defaults to the CUDA built-in. +inline __device__ float gpuAtomicMin(float * address, float val) { + unsigned int* address_as_ull = (unsigned int*)address; + unsigned int old = *address_as_ull; + unsigned int assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __float_as_int(safe_min(val, __int_as_float(assumed)))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __int_as_float(old); +} diff --git a/aten/src/ATen/zoom/CachingHostAllocator.cpp b/aten/src/ATen/zoom/CachingHostAllocator.cpp new file mode 100644 index 00000000000000..77d84838a83d7a --- /dev/null +++ b/aten/src/ATen/zoom/CachingHostAllocator.cpp @@ -0,0 +1,266 @@ +#include "CachingHostAllocator.h" + +#include +#include +#include +#include + +#include +#include + +namespace at::zoom { +namespace { + +// Note: cudaEventCreate when concurrently invoked from multiple threads can be +// very expensive (at least on certain device/driver combinations). Thus, we a) +// serialize event creation at a per-device level, and b) pool the events to +// avoid constantly calling cudaEventCreate/cudaEventDestroy. This results in +// significant improvements in multithreaded workloads with high allocation +// rates. +class EventPool { + public: + using Event = std::unique_ptr< + at::zoom::ZoomEvent, + std::function>; + EventPool() : pools_(c10::zoom::device_count()) {} + + Event get(DeviceIndex device) { + TORCH_INTERNAL_ASSERT(0 <= device); + TORCH_INTERNAL_ASSERT(device < static_cast(pools_.size())); + auto& pool = pools_[device]; + auto destructor = [&pool](at::zoom::ZoomEvent* event) { + std::lock_guard g(pool.mutex_); + pool.event_pool_.push_back(std::unique_ptr(event)); + }; + + // Try to acquire an event from the per-device pool. + { + std::lock_guard g(pool.mutex_); + if (!pool.event_pool_.empty()) { + auto* event = pool.event_pool_.back().release(); + pool.event_pool_.pop_back(); + return Event(event, destructor); + } + } + // otherwise, allocate a new event that will be returned to the pool on + // destruction. 
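// (Editorial note: the recycling works through the custom deleter captured
// above -- rather than destroying the ZoomEvent, it re-inserts it into the
// per-device free list, roughly:
//   [&pool](at::zoom::ZoomEvent* e) {
//     std::lock_guard<std::mutex> g(pool.mutex_);
//     pool.event_pool_.emplace_back(e);
//   }
// so an Event handed out here flows back into pool.event_pool_ once its
// last owner releases it.)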
+ return Event( + std::make_unique(hipEventDisableTiming).release(), + destructor); + } + + void empty_cache() { + for (auto& pool : pools_) { + std::lock_guard g(pool.mutex_); + pool.event_pool_.clear(); + } + } + + private: + struct PerDevicePool { + alignas(64) std::mutex mutex_; + std::vector> event_pool_; + }; + std::vector pools_; +}; + +using Block = HostBlock; + +struct ZoomCachingHostAllocatorImpl + : public CachingHostAllocatorImpl { + private: + void allocate_host_memory(size_t size, void** ptr) override { + // Pinned memory pointers allocated by any device can be directly used by + // any other device, regardless of the current device at the time of + // allocation, since we assume unified addressing. So we grab any existing + // primary context, if available. See pytorch/pytorch#21081. + at::OptionalDeviceGuard device_guard; + auto primary_ctx_device_index = + c10::zoom::getDeviceIndexWithPrimaryContext(); + if (primary_ctx_device_index.has_value()) { + device_guard.reset_device( + at::Device(at::DeviceType::PrivateUse1, *primary_ctx_device_index)); + } + + if (c10::zoom::ZoomCachingAllocator::ZoomAllocatorConfig:: + pinned_use_zoom_host_register()) { + allocWithZoomHostRegister(ptr, size); + } else { + // Use hipHostMalloc for allocating pinned memory (global lock in driver) + C10_ZOOM_CHECK(hipHostMalloc(ptr, size, hipHostMallocDefault)); + } + } + + void free_block(Block* block) override { + if (c10::zoom::ZoomCachingAllocator::ZoomAllocatorConfig:: + pinned_use_zoom_host_register()) { + void* ptr = block->ptr_; + C10_ZOOM_CHECK(hipHostUnregister(ptr)); + free(ptr); + } else { + C10_ZOOM_CHECK(hipHostFree(block->ptr_)); + } + } + + void record_stream( + std::optional>& events, + c10::zoom::ZoomStream stream) override { + auto event = create_event_internal(stream.device_index()); + event->record(stream); + events->push_back(std::move(event)); + } + + bool query_event(EventPool::Event& event) override { + hipError_t err = hipEventQuery(*event); + if (err == hipErrorNotReady) { + (void)hipGetLastError(); // clear CUDA error + return false; + } else if (err != hipSuccess) { + C10_ZOOM_CHECK(err); + } + return true; + } + + EventPool::Event create_event_internal(DeviceIndex idx) { + // Leak the event pool to avoid shutdown issue. + static auto* event_pool = new EventPool(); + return event_pool->get(idx); + } + + TaskThreadPool* getThreadPool() { + static TaskThreadPool* pool = new TaskThreadPool( + c10::zoom::ZoomCachingAllocator::ZoomAllocatorConfig:: + pinned_max_register_threads()); + return pool; + } + + void mapPagesForRegister( + const void* ptr, + size_t size, + size_t i, + size_t numThreads, + size_t pageSize) { + uintptr_t start = (uintptr_t)ptr + (size * i / numThreads); + uintptr_t end = (uintptr_t)start + (size / numThreads); + if (i == (numThreads - 1)) { + end = (uintptr_t)ptr + size; + } + + // pre-fault/map the pages by setting the first byte of the page + uintptr_t alignedStart = + (((uintptr_t)start + pageSize - 1) & ~(pageSize - 1)); + for (uintptr_t p = alignedStart; p < ((uintptr_t)end); p += pageSize) { + memset((void*)p, 0, 1); + } + } + + void registerPages(const void* ptr, size_t size) { + C10_ZOOM_CHECK( + hipHostRegister((void*)ptr, (size_t)size, hipHostRegisterDefault)); + + // If host and device pointer don't match, give a warning and exit + void* devptr; + C10_ZOOM_CHECK(hipHostGetDevicePointer(&devptr, (void*)ptr, 0)); + TORCH_CHECK( + (void*)devptr == (void*)ptr, + "Host and device pointer dont match with hipHostRegister. 
" + "Please dont use this feature by setting " + "PYTORCH_ZOOM_ALLOC_CONF=use_zoom_host_register:False (default)", + ""); + } + + void allocWithZoomHostRegister(void** ptr, size_t roundSize) { + // Here we do regular allocation, pre-fault/map the pages, and then do + // cudaHostRegister with GPU mapping flags to lock the pages, so we + // can minimize the cost for the cuda global lock. + *ptr = malloc(roundSize); + + // Parallelize the mapping/registering of pages to reduce wall time + size_t pageSize = (1 << 12); // 4kB pages + size_t numMapThreads = c10::zoom::ZoomCachingAllocator:: + ZoomAllocatorConfig::pinned_num_register_threads(); + if ((numMapThreads > 1) && (roundSize >= (pageSize * numMapThreads))) { + // parallelize the mapping of pages with a threadpool + auto* pool = getThreadPool(); + std::vector> promises; + std::vector> futures; + promises.reserve(numMapThreads); + futures.reserve(numMapThreads); + + for (size_t i = 0; i < numMapThreads; i++) { + promises.emplace_back(); + futures.push_back(promises[i].get_future()); + auto task = [this, + i, + ptr, + roundSize, + numMapThreads, + pageSize, + &promises]() mutable { + mapPagesForRegister( + *ptr, + roundSize, + i, // thread task-id + numMapThreads, + pageSize); + // set the promise when mapping pages are done + promises[i].set_value(); + }; + pool->run(task); + } + for (auto& future : futures) { + future.wait(); + } + } else { + // Map pages in the same thread + mapPagesForRegister(*ptr, roundSize, 0, 1, pageSize); + } + + // Register the mapped pages using cudaHostRegister + registerPages(*ptr, roundSize); + } +}; + +void raw_local_deleter(void* ptr); + +struct ZoomCachingHostAllocator final + : public CachingHostAllocatorInterface { + at::DataPtr allocate(size_t size) override { + auto ptr_and_ctx = impl_->allocate(size); + return { + ptr_and_ctx.first, + ptr_and_ctx.second, + &raw_local_deleter, + at::DeviceType::CPU}; + } +}; + +ZoomCachingHostAllocator caching_host_allocator; + +static inline ZoomCachingHostAllocator& getZoomCachingHostAllocator() { + return caching_host_allocator; +} + +void raw_local_deleter(void* ptr) { + getZoomCachingHostAllocator().free(ptr); +} + +} // anonymous namespace + +bool CachingHostAllocator_recordEvent( + void* ptr, + void* ctx, + c10::zoom::ZoomStream stream) { + return getZoomCachingHostAllocator().record_event(ptr, ctx, stream); +} + +// Releases cached pinned memory allocations via cudaHostFree +void CachingHostAllocator_emptyCache() { + getZoomCachingHostAllocator().empty_cache(); +} + +at::Allocator* getCachingHostAllocator() { + return &getZoomCachingHostAllocator(); +} + +} // namespace at::zoom \ No newline at end of file diff --git a/aten/src/ATen/zoom/CachingHostAllocator.h b/aten/src/ATen/zoom/CachingHostAllocator.h new file mode 100644 index 00000000000000..f9dfab67591052 --- /dev/null +++ b/aten/src/ATen/zoom/CachingHostAllocator.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace at::zoom { + +// +// A caching allocator for CUDA host allocations (pinned memory). +// +// This provides a drop-in replacement for THCudaHostAllocator, which re-uses +// freed pinned (page-locked) memory allocations. This avoids device +// synchronizations due to cudaFreeHost calls. +// +// To ensure correct behavior, THCCachingHostAllocator_recordEvent must be +// called anytime a pointer from this allocator is used in a cudaMemcpyAsync +// call between host and device, and passed the corresponding context from the +// allocation. 
This is currently invoked by at::native::copy_kernel_cuda. +// +TORCH_ZOOM_API c10::Allocator* getCachingHostAllocator(); + +// Records an event in the specified stream. The allocation corresponding to the +// input `ptr`/`ctx` will not be re-used until the event has occurred. +TORCH_ZOOM_API bool CachingHostAllocator_recordEvent( + void* ptr, + void* ctx, + c10::zoom::ZoomStream stream); + +// Releases cached pinned memory allocations via cudaHostFree +TORCH_ZOOM_API void CachingHostAllocator_emptyCache(); + +inline TORCH_ZOOM_API at::DataPtr HostAlloc(size_t size) { + return getCachingHostAllocator()->allocate(size); +} + +} // namespace at::zoom \ No newline at end of file diff --git a/aten/src/ATen/zoom/DeviceUtils.cuh b/aten/src/ATen/zoom/DeviceUtils.cuh new file mode 100644 index 00000000000000..951d761d0b8533 --- /dev/null +++ b/aten/src/ATen/zoom/DeviceUtils.cuh @@ -0,0 +1,75 @@ +#pragma once + +#include +#include +#include + +__device__ __forceinline__ unsigned int ACTIVE_MASK() +{ +// will be ignored anyway + return 0xffffffff; +} + +__device__ __forceinline__ void WARP_SYNC(unsigned mask = 0xffffffff) { + +} + + +__device__ __forceinline__ unsigned long long int WARP_BALLOT(int predicate) +{ +return __ballot(predicate); +} + + +template +__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) +{ + return __shfl_xor(value, laneMask, width); +} + +template +__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width = warpSize, unsigned int mask = 0xffffffff) +{ + return __shfl(value, srcLane, width); +} + +template +__device__ __forceinline__ T WARP_SHFL_UP(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) +{ + return __shfl_up(value, delta, width); +} + +template +__device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) +{ + return __shfl_down(value, delta, width); +} + +template<> +__device__ __forceinline__ int64_t WARP_SHFL_DOWN(int64_t value, unsigned int delta, int width , unsigned int mask) +{ + //(HIP doesn't support int64_t). 
Trick from https://devblogs.nvidia.com/faster-parallel-reductions-kepler/ + int2 a = *reinterpret_cast(&value); + a.x = __shfl_down(a.x, delta); + a.y = __shfl_down(a.y, delta); + return *reinterpret_cast(&a); +} + +template<> +__device__ __forceinline__ c10::Half WARP_SHFL_DOWN(c10::Half value, unsigned int delta, int width, unsigned int mask) +{ + return c10::Half(WARP_SHFL_DOWN(value.x, delta, width, mask), c10::Half::from_bits_t{}); +} + +template +__device__ __forceinline__ c10::complex WARP_SHFL_DOWN(c10::complex value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) +{ + return c10::complex( + __shfl_down(value.real_, delta, width), + __shfl_down(value.imag_, delta, width)); +} + +template +__device__ __forceinline__ T doLdg(const T* p) { + return *p; +} \ No newline at end of file diff --git a/aten/src/ATen/zoom/EmptyTensor.cpp b/aten/src/ATen/zoom/EmptyTensor.cpp new file mode 100644 index 00000000000000..087962f699033e --- /dev/null +++ b/aten/src/ATen/zoom/EmptyTensor.cpp @@ -0,0 +1,71 @@ +#include +#include +#include +#include +#include +#include + +namespace at::detail { + + TensorBase zoom_empty_generic(IntArrayRef size, ScalarType dtype, std::optional device_opt, std::optional memory_format_opt) { + at::globalContext().lazyInitPrivateUse1(); + const auto device = device_or_default(device_opt); + TORCH_INTERNAL_ASSERT(device.is_privateuseone()); + const DeviceGuard device_guard(device); + auto* allocator = at::zoom::getZoomDeviceAllocator(); + constexpr c10::DispatchKeySet zoom_dks(c10::DispatchKey::PrivateUse1); + return at::detail::empty_generic( + size, allocator, zoom_dks, dtype, memory_format_opt); + } + + TensorBase zoom_empty_memory_format(IntArrayRef size, ::std::optional dtype_opt, ::std::optional layout_opt, ::std::optional device_opt, ::std::optional pin_memory_opt, ::std::optional memory_format_opt) { + TORCH_CHECK(!pin_memory_opt.has_value() || !*pin_memory_opt, "Only dense CPU tensors can be pinned"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided); + + const auto dtype = dtype_or_default(dtype_opt); + return zoom_empty_generic(size, dtype, device_opt, memory_format_opt); + } + + TensorBase empty_zoom(IntArrayRef size, const TensorOptions &options) { + return zoom_empty_memory_format(size, + optTypeMetaToScalarType(options.dtype_opt()), + options.layout_opt(), + options.device_opt(), + options.pinned_memory_opt(), + options.memory_format_opt()); + } + + + TensorBase zoom_empty_strided_generic(IntArrayRef size, IntArrayRef stride, ScalarType dtype, ::std::optional device_opt) { + at::globalContext().lazyInitPrivateUse1(); + const auto device = device_or_default(device_opt); + TORCH_INTERNAL_ASSERT(device.is_privateuseone()); + const DeviceGuard device_guard(device); + auto* allocator = at::zoom::getZoomDeviceAllocator(); + constexpr c10::DispatchKeySet zoom_dks(c10::DispatchKey::PrivateUse1); + return at::detail::empty_strided_generic( + size, stride, allocator, zoom_dks, dtype); + } + + TensorBase zoom_empty_strided(IntArrayRef size, IntArrayRef stride, ::std::optional dtype_opt, ::std::optional layout_opt, ::std::optional device_opt, ::std::optional pin_memory_opt){ + TORCH_CHECK(!pin_memory_opt.has_value() || !*pin_memory_opt, "Only dense CPU tensors can be pinned"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided); + + const auto dtype = dtype_or_default(dtype_opt); + return zoom_empty_strided_generic(size, stride, dtype, device_opt); + } + + TensorBase 
empty_strided_zoom( + IntArrayRef size, + IntArrayRef stride, + const TensorOptions &options) { + return zoom_empty_strided( + size, + stride, + optTypeMetaToScalarType(options.dtype_opt()), + options.layout_opt(), + options.device_opt(), + options.pinned_memory_opt()); +} + +} \ No newline at end of file diff --git a/aten/src/ATen/zoom/EmptyTensor.h b/aten/src/ATen/zoom/EmptyTensor.h new file mode 100644 index 00000000000000..59ac131c5b13c6 --- /dev/null +++ b/aten/src/ATen/zoom/EmptyTensor.h @@ -0,0 +1,14 @@ +#pragma once +#include + +namespace at::detail { + + TensorBase zoom_empty_generic(IntArrayRef size, ScalarType dtype, std::optional device, std::optional memory_format); + TensorBase zoom_empty_memory_format(IntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional memory_format); // {"schema": "aten::empty.memory_format(SymInt[] size, *, ScalarTy + TORCH_ZOOM_API TensorBase empty_zoom(IntArrayRef size, const TensorOptions &options); + + TensorBase zoom_empty_strided_generic(IntArrayRef size, IntArrayRef stride, ScalarType dtype, ::std::optional device_opt); + TensorBase zoom_empty_strided(IntArrayRef size, IntArrayRef stride, ::std::optional dtype_opt, ::std::optional layout_opt, ::std::optional device_opt, ::std::optional pin_memory_opt); // {"schema": "aten::empty_strided(SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> TensorBase", "dispatch": "True", "default": "False"} + TORCH_ZOOM_API TensorBase empty_strided_zoom(IntArrayRef size, IntArrayRef stride, const TensorOptions &options); + +} \ No newline at end of file diff --git a/aten/src/ATen/zoom/HIPConfig.h b/aten/src/ATen/zoom/HIPConfig.h new file mode 100644 index 00000000000000..017177b4ed597b --- /dev/null +++ b/aten/src/ATen/zoom/HIPConfig.h @@ -0,0 +1,9 @@ +#define AT_ROCM_ENABLED() true +#define AT_MAGMA_ENABLED() false + +// disabled for now because we're testing on an old hipsparselt +#ifdef HIPSPARSELT_ENABLED +#define AT_HIPSPARSELT_ENABLED() true +#else +#define AT_HIPSPARSELT_ENABLED() false +#endif \ No newline at end of file diff --git a/aten/src/ATen/zoom/HIPGraph.cpp b/aten/src/ATen/zoom/HIPGraph.cpp new file mode 100644 index 00000000000000..49079ed083042f --- /dev/null +++ b/aten/src/ATen/zoom/HIPGraph.cpp @@ -0,0 +1,317 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace at::zoom { + +static bool _hip_graphs_debug = false; +constexpr int kSynchronizeBusyWaitMillis = 10; + +MempoolId_t graph_pool_handle() { + // uuid count starts at 1. 0 is reserved to mean "wasn't set by graph_pool_handle". + static std::atomic uid{1}; + // Sets just the second value, to distinguish it from MempoolId_ts created from + // cudaStreamGetCaptureInfo id_s in capture_begin. + return {0, uid++}; +} + + +// Get the expected id of a capture sequence so that we can call beginAllocateStreamToPool +// before starting a graph capture +CaptureId_t capture_sequence_id() { + // id starts at 1: + // Ensures uuid count starts at 1. 0 is reserved to mean "not set by cudaStreamGetCaptureInfo". + // (But how do we know GetCaptureInfo never sets id_ to 0? Because that's the current behavior, + // and I asked cuda devs to keep it that way, and they agreed.) 
+ static std::atomic uuid{1}; + return uuid++; +} + +/** + * Note [CUDA Graph Wrapper Class] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * Q: Why do we need graph capture and launch bindings in Pytorch? + * Why can't they live in a user extension, for example? + * + * A1: Convenience. + * A2: To ensure valid numerics on replay, some native CUDA ops (like RNG ops with + * CPU statefulness) need cooperation from the capture and replay bindings + * (see Note [CUDA Graph-safe RNG states] in ZoomGeneratorImpl.h). + * + * We can't expect users to know about this cooperation. If users write capture + * bindings naively in an extension, they likely won't interact with the native + * ops properly. Their graphs would yield invalid numerics on replay. + */ + +/** + * Note [Interaction with CUDA graph capture] in ZoomCachingAllocator.cpp + * describes memory management for captures. + */ + +std::atomic HIPGraph::pending_event_queries = 0; + +// Track any outstanding event queries that could happen e.g., in a NCCL watchdog so that they +// can be resolved before the capture begins. Note that event queries are not allowed during a +// graph capture in the default capture mode. +void HIPGraph::inc_pending_event_queries() { + pending_event_queries++; +} + +void HIPGraph::dec_pending_event_queries() { + TORCH_INTERNAL_ASSERT(pending_event_queries > 0, + "Attempted to decrement the number of outstanding events to be queried, but it was <= 0."); + pending_event_queries--; +} + +int HIPGraph::num_pending_event_queries() { + return pending_event_queries; +} + +HIPGraph::HIPGraph() + // CUDAStreams may not be default-constructed. + : capture_stream_(c10::zoom::getCurrentZoomStream()) { +} + +void HIPGraph::register_generator_state( + c10::intrusive_ptr state) { + captured_generator_states_[std::move(state)] = 0; +} + +void HIPGraph::register_generator_state(const at::Generator& generator) { + c10::intrusive_ptr zoom_gen = + dynamic_intrusive_pointer_cast( + generator.getIntrusivePtr()); + zoom_gen->register_graph(this); +} + +void HIPGraph::capture_begin(MempoolId_t pool/*=0*/, hipStreamCaptureMode capture_mode) { + TORCH_CHECK(!has_graph_exec_, + "This HIPGraph instance already owns a captured graph. " + "To capture a new graph, create a new instance."); + + // default generator is always registered + auto* gen = get_generator_or_default( + c10::nullopt, zoom::detail::getDefaultZoomGenerator()); + gen->register_graph(this); + + for (auto& [generator_state, wholegraph_increments] : + captured_generator_states_) { + generator_state->capture_prologue(); + } + + auto stream = c10::zoom::getCurrentZoomStream(); + + TORCH_CHECK(stream != c10::zoom::getDefaultZoomStream(), + "HIP graphs must be captured on a non-default stream. " + "(However, after capture, it's ok to replay them on the " + "default stream.)"); + + capture_stream_ = stream; + capture_dev_ = c10::zoom::current_device(); + + id_ = capture_sequence_id(); + + if (pool.first != 0 || pool.second != 0) { + // Either value being nonzero means the user supplied a pool to share. + // But only one should be nonzero. + // If pool was created by another graph's capture_begin, first should be nonzero. + // If pool was created by graph_pool_handle, second should be nonzero. + TORCH_INTERNAL_ASSERT(!(pool.first && pool.second)); + mempool_id_ = pool; + } else { + // User did not ask us to share a mempool. Use our own id_ as our mempool_id_. + // Sets just the first value, to distinguish it from MempoolId_ts created by graph_pool_handle(). 
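// For example, the first capture in a process typically ends up with
// mempool_id_ == {1, 0}, whereas graph_pool_handle() above hands out ids of
// the form {0, 1}, {0, 2}, ...; either field being nonzero is enough for the
// pool-sharing check earlier in this function to tell the two kinds apart.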
+ mempool_id_ = {id_, 0}; + } + + // Addendum: beginAllocateStreamToPool is now called before cudaStreamBeginCapture to prevent an + // autograd thread's free() call triggering an invalid cudaEventRecord in the caching allocator + // due to the capture status being updated _after_ a capture had already started. + c10::zoom::ZoomCachingAllocator::beginAllocateToPool(capture_dev_, mempool_id_, [this](hipStream_t stream) { + hipStreamCaptureStatus status; + CaptureId_t stream_capture_id; + C10_ZOOM_CHECK(hipStreamGetCaptureInfo(stream, &status, &stream_capture_id)); + return status == hipStreamCaptureStatus::hipStreamCaptureStatusActive && stream_capture_id == capture_id_; + }); + + // At this point, any NCCL watchdogs should be aware that we are in capture mode + // and therefore should not enqueue any additional work that could be event-queried. + // We still must wait on any existing work that has not been cleaned up. + while (num_pending_event_queries()) { + TORCH_WARN_ONCE("Waiting for pending NCCL work to finish before starting graph capture."); + std::this_thread::sleep_for( + std::chrono::milliseconds(kSynchronizeBusyWaitMillis)); + } + + // cudaStreamCaptureModeGlobal is the most conservative option to + // prevent potentially unsafe CUDA API calls during capture. See + // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85 + C10_ZOOM_CHECK(hipStreamBeginCapture(capture_stream_, capture_mode)); + + hipStreamCaptureStatus status; + C10_ZOOM_CHECK(hipStreamGetCaptureInfo(stream, &status, &capture_id_)); + TORCH_INTERNAL_ASSERT(status == hipStreamCaptureStatus::hipStreamCaptureStatusActive); + + TORCH_INTERNAL_ASSERT(id_ > 0); +} + +void HIPGraph::capture_end() { + auto stream = c10::zoom::getCurrentZoomStream(); + + TORCH_CHECK(stream == capture_stream_, + "Capture must end on the same stream it began on."); + + C10_ZOOM_CHECK(hipStreamEndCapture(capture_stream_, &graph_)); + + c10::zoom::ZoomCachingAllocator::endAllocateToPool(capture_dev_, mempool_id_); + + TORCH_CHECK(graph_ != NULL, "Invalid capture."); + has_graph_ = true; + + // In typical graph usage some tensors (e.g. the tensors used for graph IO) are not freed + // between replays. + // If Pytorch compiles and runs with a CUDA 11.4+ toolkit, there's a chance the allocator backend + // is cudaMallocAsync. + // cudaMallocAsync is generally graph-safe, but if some tensors are not freed between replays, + // the graph's internal bookkeeping requires that we instantiate with + // cudaGraphInstantiateFlagAutoFreeOnLaunch. 
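As an end-to-end usage sketch of the capture_begin/capture_end/replay sequence implemented here: the tensor x is hypothetical, fill_ is chosen only because this minimal backend ships a fill kernel, and c10::zoom::getStreamFromPool/ZoomStreamGuard are assumed to mirror their CUDA counterparts.

    at::zoom::HIPGraph graph;
    auto opts = at::TensorOptions().dtype(at::kFloat).device(c10::DeviceType::PrivateUse1);
    at::Tensor x = at::empty({8}, opts);
    c10::zoom::ZoomStream side_stream = c10::zoom::getStreamFromPool();
    {
      c10::zoom::ZoomStreamGuard guard(side_stream);  // capture may not use the default stream
      graph.capture_begin();
      x.fill_(1.0f);           // recorded into the graph rather than executed eagerly
      graph.capture_end();
    }
    graph.replay();            // executes the captured fill on the current stream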
See + // cudaGraphLaunch + // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1g1accfe1da0c605a577c22d9751a09597 + // cudaGraphInstantiateWithFlags + // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga2c652a24ba93e52b99a47bec0888233 + + // Trailing NULL, NULL, 0 arguments were recommended by Cuda driver people, + // who prefer not to report error message through these arguments moving forward + // (they prefer return value, or errors on api calls internal to the capture) + + C10_ZOOM_CHECK(hipGraphInstantiate(&graph_exec_, graph_, NULL, NULL, 0)); + + + has_graph_exec_ = true; + + for (auto& [generator_state, wholegraph_increments] : + captured_generator_states_) { + wholegraph_increments = generator_state->capture_epilogue(); + } + + size_t numHIPGraphNodes = 0; + C10_ZOOM_CHECK(hipGraphGetNodes(graph_, NULL, &numHIPGraphNodes)); + if (numHIPGraphNodes == 0) { + TORCH_WARN("The HIP Graph is empty. This usually means that the graph was ", + "attempted to be captured on wrong device or stream."); + } + + // check if debug path is set + if (!_hip_graphs_debug) { + // Now that we've instantiated graph_ into graph_exec_, + // we don't need graph_ anymore. + C10_ZOOM_CHECK(hipGraphDestroy(graph_)); + has_graph_ = false; + } else { + TORCH_WARN("DEBUG: TORCH_HIPGRAPHS_DEBUG_PATH detected. graph_ will not be freed until debug_dump is called."); + } +} + +void HIPGraph::replay() { + TORCH_CHECK(has_graph_exec_, + "Called HIPGraph::replay without a preceding successful capture."); + + c10::OptionalDeviceGuard device_guard{capture_stream_.device()}; + + for (auto& [generator_state, wholegraph_increments] : + captured_generator_states_) { + generator_state->replay_prologue(wholegraph_increments); + } + // graph_exec_ may be replayed in any stream. + C10_ZOOM_CHECK(hipGraphLaunch(graph_exec_, c10::zoom::getCurrentZoomStream())); + +// cuda does this sync for certain versions, we're ignoring it here +// int version; +// C10_ZOOM_CHECK(cudaDriverGetVersion(&version)); +// if (version < 11040) { +// // Workaround for bug in libcuda.so that causes replayed graphs with +// // certain topologies to be corrupted (kernels elided, internal syncs +// // ignored) when replayed back to back without a sync in between. +// // The bug is fixed in CUDA 11.4+. +// C10_ZOOM_CHECK(cudaDeviceSynchronize()); +// } +} + +void HIPGraph::enable_debug_mode() { + _hip_graphs_debug = true; +} + +void HIPGraph::debug_dump(const std::string& debug_path) { + if (_hip_graphs_debug) { + TORCH_WARN("DEBUG: calling debug_dump()"); + if (has_graph_) { + TORCH_WARN("DEBUG: calling hipGraphDebugDotPrint() with ", debug_path); + C10_ZOOM_CHECK_WARN(hipGraphDebugDotPrint(graph_, debug_path.c_str(), 1<<10)); // most verbose output + C10_ZOOM_CHECK(hipGraphDestroy(graph_)); + } + } else { + // TODO (Arham): technically false right now, need to add this functionality to the Zoom PyBind module + TORCH_WARN("HIP Graphs debug not enabled, set with torch._C._zoom_enable_graphs_debug_mode"); + } + +} + +void HIPGraph::reset() { + // I'd prefer these checks throw exceptions, not print warnings, + // but the destructor calls reset(), and at least one CI build + // refuses to compile with a throwing destructor. + // + // Instead of calling reset() in the destructor to clean up, I could + // call reset() in the __del__ method of a thin Python wrapper, + // in which case reset would be allowed to throw exceptions. 
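Relatedly, a small sketch of the debug path mentioned above; the dump file name is hypothetical, and enable_debug_mode() has to be called before capture_end() so that graph_ is kept alive for hipGraphDebugDotPrint:

    at::zoom::HIPGraph graph;
    graph.enable_debug_mode();                 // keep graph_ after instantiation instead of destroying it
    // ... capture_begin / captured work / capture_end on a non-default stream ...
    graph.debug_dump("/tmp/zoom_graph.dot");   // writes a DOT file, then frees graph_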
+ // But Stackoverflow does not like user-defined __del__. + // __del__ prevents Graph instances from EVER being garbage collected + // if they participate in a reference cycle. + // And exceptions thrown in __del__ only print a warning anyway. + // + // Calling reset() in the C++ destructor, with warnings instead of exceptions + // if calls fail, is the compromise we chose. + // + // If capture_begin, the capture, or capture_end failed at some point, this HIPGraph, the generator, + // and the allocator could end up in all kinds of weird states depending where failure occurred. + // If the user catches the failure exception in a script, or is running in REPL or (god forbid) + // a Jupyter notebook, I don't see an easy way for reset() to gracefully fix all such possible error states. + if (has_graph_ || has_graph_exec_) { + // notifyCaptureDestroy may throw. How should we handle this? + c10::zoom::ZoomCachingAllocator::releasePool(capture_dev_, mempool_id_); + } + if (has_graph_) { + C10_ZOOM_CHECK_WARN(hipGraphDestroy(graph_)); + has_graph_ = false; + } + if (has_graph_exec_) { + C10_ZOOM_CHECK_WARN(hipGraphExecDestroy(graph_exec_)); + has_graph_exec_ = false; + } +} + +// Returns an id another graph's capture_begin can use to share the same memory pool as this graph. +MempoolId_t HIPGraph::pool() { +TORCH_CHECK(has_graph_exec_, + "Called HIPGraph::pool() without a preceding successful capture."); + return mempool_id_; +} + +HIPGraph::~HIPGraph() { + for (auto& [generator_state, wholegraph_increments] : + captured_generator_states_) { + generator_state->unregister_graph(this); + } + reset(); +} + +} // namespace at::zoom \ No newline at end of file diff --git a/aten/src/ATen/zoom/HIPGraph.h b/aten/src/ATen/zoom/HIPGraph.h new file mode 100644 index 00000000000000..7bea7814fe344c --- /dev/null +++ b/aten/src/ATen/zoom/HIPGraph.h @@ -0,0 +1,96 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace at { + +struct Generator; +struct ZoomGeneratorImpl; +struct ZoomGeneratorState; + +using MempoolId_t = c10::zoom::MempoolId_t; +using CaptureId_t = c10::zoom::CaptureId_t; + +namespace zoom { + +// Standalone way to get a unique mempool id usable as a pool=... 
argument +// to HIPGraph::capture_begin +TORCH_ZOOM_API MempoolId_t graph_pool_handle(); + +struct TORCH_ZOOM_API HIPGraph { + HIPGraph(); + ~HIPGraph(); + + static void inc_pending_event_queries(); + static void dec_pending_event_queries(); + static int num_pending_event_queries(); + // See Note [Explicit Registration of Generators to the CUDA Graph] + void register_generator_state(c10::intrusive_ptr state); + void register_generator_state(const at::Generator& generator); + void capture_begin( + MempoolId_t pool = {0, 0}, + hipStreamCaptureMode capture_mode = hipStreamCaptureModeGlobal); + void capture_end(); + void replay(); + void reset(); + MempoolId_t pool(); + void enable_debug_mode(); + void debug_dump(const std::string& debug_path); + + protected: + hipGraph_t graph_ = NULL; + hipGraphExec_t graph_exec_ = NULL; + + static std::atomic pending_event_queries; + + // internal states so reset() can do its best cleaning up + // Set to true in capture_end if hipStreamEndCapture succeeded + // Set back to false soon after, when graph_ is consumed by hipGraphInstantiate + // to create graph_exec_, then graph_ is deleted + bool has_graph_ = false; + // Set to true in capture_end if hipGraphInstantiate succeeded + bool has_graph_exec_ = false; + + // uuid of this instance's current capture, used to + // specify the pool. + CaptureId_t id_; + + // the ID assigned by hip during graph capture, + // used to identify when a stream is participating in capture + CaptureId_t capture_id_ = -1; + + // uuid used to request a particular private mempool from CUDACachingAllocator. + // By default, this will be set to {id_, 0}. + // + // If capture_begin is called with "pool=other_graph.pool()", this graph's mempool_id_ + // will be set to the other graph's mempool_id_, and therefore share a mempool with the + // other graph. + // + // If capture_begin is called with "pool=handle" where "handle" came from graph_pool_handle(), + // it will share a mempool with any other captures that used "pool=handle". + // + // Sharing a mempool across graphs saves memory, and it's safe if you + // know you'll replay those graphs in the same order you captured them. + MempoolId_t mempool_id_; + + // Stream on which capture began + c10::zoom::ZoomStream capture_stream_; + + // multiple generator states and their wholegraph_increments in this graph + // that are managed by the CUDA Graph + ska::flat_hash_map, uint64_t> + captured_generator_states_; + + // Device where capture occurred. Right now, for simplicity, we require all ops + // in a capture to run on the same device, but this is a limitation of HIPGraph, + // not CUDA itself. We can straightforwardly modify HIPGraph to support multi-device + // captures if needed. + int capture_dev_; +}; + +} // namespace cuda +} // namespace at \ No newline at end of file diff --git a/aten/src/ATen/zoom/HIPGraphsUtils.hpp b/aten/src/ATen/zoom/HIPGraphsUtils.hpp new file mode 100644 index 00000000000000..1f9a227f5e5492 --- /dev/null +++ b/aten/src/ATen/zoom/HIPGraphsUtils.hpp @@ -0,0 +1,41 @@ +#pragma once + +#include +#include +#include +// #include " +#include +#include +#include +#include +#include + +// c10/cuda/CUDAGraphsC10Utils.h has utils used by both c10 and aten. +// This file adds utils used by aten only. + +namespace at::zoom { + +using CaptureId_t = c10::zoom::CaptureId_t; +using CaptureStatus = c10::zoom::CaptureStatus; + +// Use this version where you don't want to create a CUDA context if none exists. 
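For illustration, the typical way a backend routine uses the helpers below to refuse to run while a capture is active (the routine name is hypothetical):

    void hypothetical_blocking_routine() {
      // Event queries and device-wide syncs are unsafe inside a graph capture.
      at::zoom::assertNotCapturing("hypothetical_blocking_routine, which synchronizes the device,");
      C10_ZOOM_CHECK(hipDeviceSynchronize());
    }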
+inline CaptureStatus currentStreamCaptureStatus() { + // don't create a context if we don't have to + if (c10::zoom::hasPrimaryContext(c10::zoom::current_device())) { + return c10::zoom::currentStreamCaptureStatusMayInitCtx(); + } else { + return CaptureStatus::None; + } +} + +inline void assertNotCapturing(std::string attempt) { + auto status = currentStreamCaptureStatus(); + TORCH_CHECK(status == CaptureStatus::None, + attempt, + " during HIP graph capture. If you need this call to be captured, " + "please file an issue. " + "Current hipStreamCaptureStatus: ", + status); +} + +} // namespace at::zoom \ No newline at end of file diff --git a/aten/src/ATen/zoom/HIPUtils.h b/aten/src/ATen/zoom/HIPUtils.h new file mode 100644 index 00000000000000..4461619e00cd96 --- /dev/null +++ b/aten/src/ATen/zoom/HIPUtils.h @@ -0,0 +1,20 @@ +#pragma once + +#include + +namespace at::zoom { + +// Check if every tensor in a list of tensors matches the current +// device. +inline bool check_device(ArrayRef ts) { + if (ts.empty()) { + return true; + } + Device curDevice = Device(kPrivateUse1, c10::zoom::current_device()); + for (const Tensor& t : ts) { + if (t.device() != curDevice) return false; + } + return true; +} + +} // namespace at::zoom \ No newline at end of file diff --git a/aten/src/ATen/zoom/NumericLimits.cuh b/aten/src/ATen/zoom/NumericLimits.cuh new file mode 100644 index 00000000000000..8b5d6b5932ee01 --- /dev/null +++ b/aten/src/ATen/zoom/NumericLimits.cuh @@ -0,0 +1,121 @@ +#pragma once + +#include +#include +#include +#include + +// NumericLimits.cuh is a holder for numeric limits definitions of commonly used +// types. This header is very specific to ROCm HIP and may be removed in the future. +// This header is derived from the legacy THCNumerics.cuh. + +// The lower_bound and upper_bound constants are same as lowest and max for +// integral types, but are -inf and +inf for floating point types. They are +// useful in implementing min, max, etc. + +namespace at { + +template +struct numeric_limits { +}; + +// WARNING: the following at::numeric_limits definitions are there only to support +// HIP compilation for the moment. Use std::numeric_limits if you are not +// compiling for ROCm. +// from @colesbury: "The functions on numeric_limits aren't marked with +// __device__ which is why they don't work with ROCm. CUDA allows them +// because they're constexpr." + +namespace { + // ROCm doesn't like INFINITY too. 
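As a usage sketch, a reduction-style kernel might initialize its accumulator from the specializations that follow; the kernel name and launch configuration are hypothetical.

    template <typename scalar_t>
    __global__ void init_max_reduction_buffer(scalar_t* out, int64_t n) {
      int64_t i = blockIdx.x * static_cast<int64_t>(blockDim.x) + threadIdx.x;
      if (i < n) {
        // lower_bound() is -inf for floating-point types and the minimum value for integral types.
        out[i] = at::numeric_limits<scalar_t>::lower_bound();
      }
    }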
+ constexpr double inf = INFINITY; +} + +template <> +struct numeric_limits { + static inline __host__ __device__ bool lowest() { return false; } + static inline __host__ __device__ bool max() { return true; } + static inline __host__ __device__ bool lower_bound() { return false; } + static inline __host__ __device__ bool upper_bound() { return true; } +}; + +template <> +struct numeric_limits { + static inline __host__ __device__ uint8_t lowest() { return 0; } + static inline __host__ __device__ uint8_t max() { return UINT8_MAX; } + static inline __host__ __device__ uint8_t lower_bound() { return 0; } + static inline __host__ __device__ uint8_t upper_bound() { return UINT8_MAX; } +}; + +template <> +struct numeric_limits { + static inline __host__ __device__ int8_t lowest() { return INT8_MIN; } + static inline __host__ __device__ int8_t max() { return INT8_MAX; } + static inline __host__ __device__ int8_t lower_bound() { return INT8_MIN; } + static inline __host__ __device__ int8_t upper_bound() { return INT8_MAX; } +}; + +template <> +struct numeric_limits { + static inline __host__ __device__ int16_t lowest() { return INT16_MIN; } + static inline __host__ __device__ int16_t max() { return INT16_MAX; } + static inline __host__ __device__ int16_t lower_bound() { return INT16_MIN; } + static inline __host__ __device__ int16_t upper_bound() { return INT16_MAX; } +}; + +template <> +struct numeric_limits { + static inline __host__ __device__ int32_t lowest() { return INT32_MIN; } + static inline __host__ __device__ int32_t max() { return INT32_MAX; } + static inline __host__ __device__ int32_t lower_bound() { return INT32_MIN; } + static inline __host__ __device__ int32_t upper_bound() { return INT32_MAX; } +}; + +template <> +struct numeric_limits { +#ifdef _MSC_VER + static inline __host__ __device__ int64_t lowest() { return _I64_MIN; } + static inline __host__ __device__ int64_t max() { return _I64_MAX; } + static inline __host__ __device__ int64_t lower_bound() { return _I64_MIN; } + static inline __host__ __device__ int64_t upper_bound() { return _I64_MAX; } +#else + static inline __host__ __device__ int64_t lowest() { return INT64_MIN; } + static inline __host__ __device__ int64_t max() { return INT64_MAX; } + static inline __host__ __device__ int64_t lower_bound() { return INT64_MIN; } + static inline __host__ __device__ int64_t upper_bound() { return INT64_MAX; } +#endif +}; + +template <> +struct numeric_limits { + static inline __host__ __device__ at::Half lowest() { return at::Half(0xFBFF, at::Half::from_bits()); } + static inline __host__ __device__ at::Half max() { return at::Half(0x7BFF, at::Half::from_bits()); } + static inline __host__ __device__ at::Half lower_bound() { return at::Half(0xFC00, at::Half::from_bits()); } + static inline __host__ __device__ at::Half upper_bound() { return at::Half(0x7C00, at::Half::from_bits()); } +}; + +template <> +struct numeric_limits { + static inline __host__ __device__ at::BFloat16 lowest() { return at::BFloat16(0xFF7F, at::BFloat16::from_bits()); } + static inline __host__ __device__ at::BFloat16 max() { return at::BFloat16(0x7F7F, at::BFloat16::from_bits()); } + static inline __host__ __device__ at::BFloat16 lower_bound() { return at::BFloat16(0xFF80, at::BFloat16::from_bits()); } + static inline __host__ __device__ at::BFloat16 upper_bound() { return at::BFloat16(0x7F80, at::BFloat16::from_bits()); } +}; + +template <> +struct numeric_limits { + static inline __host__ __device__ float lowest() { return -FLT_MAX; } + static inline 
__host__ __device__ float max() { return FLT_MAX; } + static inline __host__ __device__ float lower_bound() { return -static_cast(inf); } + static inline __host__ __device__ float upper_bound() { return static_cast(inf); } +}; + +template <> +struct numeric_limits { + static inline __host__ __device__ double lowest() { return -DBL_MAX; } + static inline __host__ __device__ double max() { return DBL_MAX; } + static inline __host__ __device__ double lower_bound() { return -inf; } + static inline __host__ __device__ double upper_bound() { return inf; } +}; + +} // namespace at diff --git a/aten/src/ATen/zoom/PeerToPeerAccess.cpp b/aten/src/ATen/zoom/PeerToPeerAccess.cpp new file mode 100644 index 00000000000000..b5c3b8eda00565 --- /dev/null +++ b/aten/src/ATen/zoom/PeerToPeerAccess.cpp @@ -0,0 +1,59 @@ +#include + +#include +#include +#include +#include + +#include + +namespace at::zoom { + +static std::vector p2pAccessEnabled_; +static int64_t num_devices_ = -1; + +namespace detail { + +void init_p2p_access_cache(int64_t num_devices) { + // p2pAccessEnabled records if p2p copies are allowed between pairs of + // devices. Values include "1" (copy allowed), "0" (copy not allowed), and + // "-1" (unknown). + // Currently the max number of gpus in P2P group is 8, so if there are more + // we enable P2P in groups of 8 + p2pAccessEnabled_.clear(); + p2pAccessEnabled_.resize(num_devices * num_devices, -1); + num_devices_ = num_devices; + + for (const auto i : c10::irange(num_devices)) { + p2pAccessEnabled_[i * num_devices + i] = 1; + } +} + +} // namespace detail + +bool get_p2p_access(int dev, int dev_to_access) { + at::globalContext().lazyInitPrivateUse1(); + + TORCH_CHECK(dev >= 0 || dev < num_devices_, + dev, " is not a device"); + TORCH_CHECK(dev_to_access >= 0 || dev_to_access < num_devices_, + dev_to_access, " is not a device"); + TORCH_INTERNAL_ASSERT(num_devices_ >= 0, "p2p access cache not initialized"); + + auto &cache = p2pAccessEnabled_[dev * num_devices_ + dev_to_access]; + + if (cache != -1) { + return cache; + } + + int result; + C10_ZOOM_CHECK(hipDeviceCanAccessPeer(&result, dev, dev_to_access)); + cache = result ? 
1 : 0; + if (cache) { + c10::zoom::ZoomCachingAllocator::enablePeerAccess(dev, dev_to_access); + } + + return cache; +} + +} // namespace at::zoom::detail \ No newline at end of file diff --git a/aten/src/ATen/zoom/PeerToPeerAccess.h b/aten/src/ATen/zoom/PeerToPeerAccess.h new file mode 100644 index 00000000000000..b299e48862024c --- /dev/null +++ b/aten/src/ATen/zoom/PeerToPeerAccess.h @@ -0,0 +1,12 @@ +#include +#include +#include + +namespace at::zoom { +namespace detail { +void init_p2p_access_cache(int64_t num_devices); +} + +TORCH_ZOOM_API bool get_p2p_access(int source_dev, int dest_dev); + +} // namespace at::zoom \ No newline at end of file diff --git a/aten/src/ATen/zoom/PhiloxHIPState.h b/aten/src/ATen/zoom/PhiloxHIPState.h new file mode 100644 index 00000000000000..58a6bb5199fe58 --- /dev/null +++ b/aten/src/ATen/zoom/PhiloxHIPState.h @@ -0,0 +1,5 @@ +#pragma once + +#include + +#include \ No newline at end of file diff --git a/aten/src/ATen/zoom/PhiloxUtils.hpp b/aten/src/ATen/zoom/PhiloxUtils.hpp new file mode 100644 index 00000000000000..ba2afd230f2c90 --- /dev/null +++ b/aten/src/ATen/zoom/PhiloxUtils.hpp @@ -0,0 +1,4 @@ +#pragma once + +#include +#include \ No newline at end of file diff --git a/aten/src/ATen/zoom/PinnedMemoryAllocator.cpp b/aten/src/ATen/zoom/PinnedMemoryAllocator.cpp new file mode 100644 index 00000000000000..5b9ed21a971a23 --- /dev/null +++ b/aten/src/ATen/zoom/PinnedMemoryAllocator.cpp @@ -0,0 +1,32 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { + +bool is_pinned_zoom(const Tensor& self, std::optional device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_privateuseone()); + // TODO: unhook this + return detail::getZoomHooks().isPinnedPtr(self.storage().data()); +} + +Tensor _pin_memory_zoom(const Tensor& self, std::optional device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_privateuseone()); + auto* allocator = at::zoom::getPinnedMemoryAllocator(); + auto storage = Storage( + Storage::use_byte_size_t(), + detail::computeStorageNbytes( + self.sizes(), self.strides(), self.dtype().itemsize()), + allocator, + /*resizable=*/false); + auto tensor = at::cpu::empty({0}, self.options()).set_(storage, 0, self.sizes(), self.strides()); + tensor.copy_(self); + return tensor; +} + + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/zoom/PinnedMemoryAllocator.h b/aten/src/ATen/zoom/PinnedMemoryAllocator.h new file mode 100644 index 00000000000000..2c52bead795996 --- /dev/null +++ b/aten/src/ATen/zoom/PinnedMemoryAllocator.h @@ -0,0 +1,11 @@ +#pragma once + +#include +#include + +namespace at::zoom { + +inline TORCH_ZOOM_API at::Allocator* getPinnedMemoryAllocator() { + return getCachingHostAllocator(); +} +} // namespace at::zoom \ No newline at end of file diff --git a/aten/src/ATen/zoom/ScanUtils.cuh b/aten/src/ATen/zoom/ScanUtils.cuh new file mode 100644 index 00000000000000..d1a3558a42a6c1 --- /dev/null +++ b/aten/src/ATen/zoom/ScanUtils.cuh @@ -0,0 +1,72 @@ +#pragma once + +#include +#include +#include +#include + +// Collection of in-kernel scan / prefix sum utilities + +namespace at::zoom { + +// Inclusive prefix sum for binary vars using intra-warp voting + +// shared memory +template +__device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFunction binop) { + // Within-warp, we use warp voting. 
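To show how these block-level scans are typically consumed, a rough stream-compaction kernel is sketched below; the kernel and buffer names are hypothetical, and blockDim.x is assumed to be a multiple of the warp size with at most 32 warps per block.

    struct AddInts {
      __device__ int operator()(int a, int b) const { return a + b; }
    };

    __global__ void compact_nonzero(const float* in, float* out, int* per_block_count, int n) {
      __shared__ int warp_sums[32];                    // one slot per warp
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      bool keep = (i < n) && (in[i] != 0.0f);
      int pos = 0, carry = 0;
      at::zoom::exclusiveBinaryPrefixScan<int, true>(warp_sums, keep, &pos, &carry, AddInts{});
      if (keep) {
        out[blockIdx.x * blockDim.x + pos] = in[i];    // compacted position within this block
      }
      if (threadIdx.x == 0) {
        per_block_count[blockIdx.x] = carry;           // number of kept elements in this block
      }
    }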
+ unsigned long long int vote = WARP_BALLOT(in); + T index = __popcll(getLaneMaskLe() & vote); + T carry = __popcll(vote); + + int warp = threadIdx.x / C10_WARP_SIZE; + + // Per each warp, write out a value + if (getLaneId() == 0) { + smem[warp] = carry; + } + + __syncthreads(); + + // Sum across warps in one thread. This appears to be faster than a + // warp shuffle scan for CC 3.0+ + if (threadIdx.x == 0) { + int current = 0; + for (int i = 0; i < blockDim.x / C10_WARP_SIZE; ++i) { + T v = smem[i]; + smem[i] = binop(smem[i], current); + current = binop(current, v); + } + } + + __syncthreads(); + + // load the carry from the preceding warp + if (warp >= 1) { + index = binop(index, smem[warp - 1]); + } + + *out = index; + + if (KillWARDependency) { + __syncthreads(); + } +} + +// Exclusive prefix sum for binary vars using intra-warp voting + +// shared memory +template +__device__ void exclusiveBinaryPrefixScan(T* smem, bool in, T* out, T* carry, BinaryFunction binop) { + inclusiveBinaryPrefixScan(smem, in, out, binop); + + // Inclusive to exclusive + *out -= (T) in; + + // The outgoing carry for all threads is the last warp's sum + *carry = smem[at::ceil_div(blockDim.x, C10_WARP_SIZE) - 1]; + + if (KillWARDependency) { + __syncthreads(); + } +} + +} // namespace at::zoom \ No newline at end of file diff --git a/aten/src/ATen/zoom/ThrustAllocator.h b/aten/src/ATen/zoom/ThrustAllocator.h new file mode 100644 index 00000000000000..17ba84d64f2222 --- /dev/null +++ b/aten/src/ATen/zoom/ThrustAllocator.h @@ -0,0 +1,23 @@ +#pragma once + +#include +#include + +namespace at::zoom { + +/// Allocator for Thrust to re-route its internal device allocations +/// to the THC allocator +class ThrustAllocator { +public: + typedef char value_type; + + char* allocate(std::ptrdiff_t size) { + return static_cast(c10::zoom::ZoomCachingAllocator::raw_alloc(size)); + } + + void deallocate(char* p, size_t size) { + c10::zoom::ZoomCachingAllocator::raw_delete(p); + } +}; + +} // namespace at::zoom \ No newline at end of file diff --git a/aten/src/ATen/zoom/ZoomApplyUtils.cuh b/aten/src/ATen/zoom/ZoomApplyUtils.cuh new file mode 100644 index 00000000000000..dcb91d124a11d0 --- /dev/null +++ b/aten/src/ATen/zoom/ZoomApplyUtils.cuh @@ -0,0 +1,537 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +// +// This file contains pointwise operation functions and kernels that +// work on both contiguous and non-contiguous tensor arguments of +// arbitrary (up to MAX_CUTORCH_DIMS) dimensioned arguments without +// copying or temporary storage. +// + +/* + NOTE [ CUDA_tensor_applyN helpers ] + + The following CUDA_tensor_applyN (where N currently can be 1, 2, 3, or 4) + functions apply a pointwise operator to N tensor(s). + + The calling convention is + + 1. The template arguments should be, sequentially, + - First N typename args specify the scalar types of each of the N tensors. + - (Optional) `int step` arg specifies the number of elements processed + together at the same time. + Default is 1. + - A usually omitted (i.e., inferred) typename arg specifies the type of the + function/functor applied on `N * step` values in each iteration of each + CUDA thread. + 2. The arguments should be, sequentially, + - N tensors + - op: a function/functor that processes `N * step` values at the same time. 
+ - If `step == 1`, it must have signature + `void(*)(scalar1_t&, scalar2_t&, ..., scalarN_t&)`, where + `scalar*_t`s are the first N typename template args, and the inputs + are the `N` values from the `N` tensors retrieved at a common index. + - Otherwise, it must must have signature + void(*)(int n, scalar1_t&, scalar1_t&, ..., scalar1_t&, // repeat `step` times + scalar2_t&, scalar2_t&, ..., scalar2_t&, // repeat `step` times + ..., + scalarN_t&, scalarN_t&, ..., scalarN_t&) // repeat `step` times + Different from `step == 1` case, it processes `N * step` values taken + from `step` common indices. Moreover, the first input `n` represents the + number of valid indices (it will always have `0 < n <= step`). It will + almost always be `step`, but at the boundary we may not have full `step` + elements and `n` can be a lesser value. + + E.g., if `step == 4` and `N == 2`, `op` could be + + [](int n, scalar1_t &u1, scalar1_t &u2, scalar1_t &u3, scalar1_t &u4, + scalar2_t &v1, scalar2_t &v2, scalar2_t &v3, scalar2_t &v4) { + // Only process u1, ..., un and v1, ..., vn. + // So if `n == 3`, `u4` and `v4` need not to be considered. + } + + In both cases, the references can actually be const, but at least one of + them should be non-const in order to write the output. + - (Optional, but recommended) N TensorArgType args that specify for each + tensor whether `op` reads AND writes ] (i.e., TensorArgType::ReadWrite), + or only reads (i.e., TensorArgType::ReadOnly). + Default is TensorArgType::ReadWrite for first Tensor, and + TensorArgType::ReadOnly for the rest. + + E.g., + + to compute a = b^2 for a and b of same dtype, we can call + + Zoom_tensor_apply2( + a, b, + [] __device__ (scalar &a_val, const scalar &b_val) { a_val = b_val * b_val; } + ); + + to work on 2 values at the same time, we can call + + Zoom_tensor_apply2( + a, b, + [] __device__ (int n, scalar1 &a_val1, scalar1 &a_val2, + const scalar2 &b_val1, const scalar2 &b_val2) { + // call special vectorized op here, or just do elementwise and enjoy unrolling... + // if n == 1, only process a_val1 and b_val1 + } + ); +*/ + +namespace at::zoom { + +// TODO: combine with TensorArg? So far that's been for debugging, and this is functional... +enum class TensorArgType { ReadWrite, ReadOnly }; + +namespace { + +// Rearrange dimensions for pointwise operations so that strides are in +// decreasing order as much as possible, so that kernels have better memory +// access patterns. +// +// For example, consider a binary operation on two "transposed" 2-dim tensors: +// sizes: 256 512 +// aInfo->strides: 1 256 +// bInfo->strides: 1 256 +// +// Given this, each concurrent memory access inside kernelPointwiseApply2() is +// exactly 256 elements apart, resulting in poor performance. +// +// This function exchanges dimensions so that memory access is contiguous: +// sizes: 512 256 +// aInfo->strides: 256 1 +// bInfo->strides: 256 1 +// +// (Actually, it becomes even better because now collapseDims() can turn each +// input into one contiguous array.) +// +// In general, given M (<=4) TensorInfo's with N dimensions, we can view each +// strides[i] (0 <= i < N) as an M-tuple. Given each pair i < j, we exchange +// strides[i] and [j] if +// (1) strides[i][k] < strides[j][k] for some k (0 <= k < M) +// (exchanging them will benefit input #k), and +// (2) strides[i][k] <= strieds[j][k] for all k +// (exchanging them will not make any input worse). 
+template +inline void rearrangeDims(detail::TensorInfo* aInfo, + detail::TensorInfo* bInfo = nullptr, + detail::TensorInfo* cInfo = nullptr, + detail::TensorInfo* dInfo = nullptr) { + int numInfos = 1; + int dims = aInfo->dims; + IndexType *sizes[4] = { aInfo->sizes, }; + IndexType *strides[4] = { aInfo->strides, }; + + if (bInfo != nullptr) { + ++numInfos; + if (bInfo->dims != dims) return; + sizes[1] = bInfo->sizes; + strides[1] = bInfo->strides; + } + + if (cInfo != nullptr) { + ++numInfos; + if (cInfo->dims != dims) return; + sizes[2] = cInfo->sizes; + strides[2] = cInfo->strides; + } + + if (dInfo != nullptr) { + ++numInfos; + if (dInfo->dims != dims) return; + sizes[3] = dInfo->sizes; + strides[3] = dInfo->strides; + } + + // Bail out if sizes do not match: we are using "deprecated pointwise + // behavior" among tensors of different shapes but same number of elements. + for (int i = 1; i < numInfos; ++i) { + for (int j = 0; j < dims; ++j) { + if (sizes[i][j] != sizes[0][j]) return; + } + } + + for (int i = 0; i < dims - 1; ++i) { + // No need to consider dimensions of size 1. + if (sizes[0][i] == 1) continue; + + for (int j = i + 1; j < dims; ++j) { + if (sizes[0][j] == 1) continue; + + // Compare the relative sizes of strides between dim #i and dim #j. + bool hasIncreasingStrides = false; + bool hasDecreasingStrides = false; + + for (int k = 0; k < numInfos; k++) { + IndexType stride_i = strides[k][i]; + IndexType stride_j = strides[k][j]; + if (stride_i < stride_j) { + hasIncreasingStrides = true; + } else if (stride_i > stride_j) { + hasDecreasingStrides = true; + } + } + + if (hasIncreasingStrides && !hasDecreasingStrides) { + for (int k = 0; k < numInfos; k++) { + IndexType size = sizes[k][i]; + sizes[k][i] = sizes[k][j]; + sizes[k][j] = size; + + IndexType stride = strides[k][i]; + strides[k][i] = strides[k][j]; + strides[k][j] = stride; + } + } + } + } +} + +// The `remaining_steps` argument is used to support Op that operates on +// multiple elements at the same time. Generally, the strategy of ApplyOpN is to +// 1. Initialize `remaining_steps = step`, where `step` is the template arg of +// CUDA_tensor_applyN helpers. The input arg `n` to `apply()` represents the +// number of elements in bound for this call. It will almost always equal to +// `step` except at boundaries. +// 2. If `remaining_steps > 0` convert the current linearIndex to offset (if in +// bound), and recursively call `ApplyOpN` with `remaining_steps - 1`. +// 3. At `remaining_steps = 0`, +// if `step = 1`, call `op(tensor1_val, tensor2_val, ...)`; +// if `step > 1`, call `op(n, tensor1_val1, tensor1_val2, ..., tesor1_valstep, +// tensor2_val1, tensor2_val2, ..., tesor2_valstep, +// ... +// tensorN_val1, tensorN_val2, ..., tesorN_valstep);` +// +// See NOTE [ CUDA_tensor_applyN helpers ] above for how Op may look like. + +template +struct ApplyOp1 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, const Op &op, int n, + IndexType linearIndex, Offsets... aOffsets) { + // Convert `linearIndex` into an offset of `a` + const IndexType aOffset = sizeof...(Offsets) < n ? + detail::IndexToOffset::get(linearIndex, a) : 0; + + ApplyOp1::apply( + a, op, n, linearIndex + 1, aOffsets..., aOffset + ); +} +}; + +// Specialize `step=1` case (i.e., `remaining_steps=0` and `len(Offsets)=1`). +// We don't need to pass in how many elements need to processed in this case. 
+template +struct ApplyOp1 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, const Op &op, + int n, IndexType linearIndex, Offset offset) { + op(a.data[offset]); +} +}; + +template +struct ApplyOp1 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, const Op &op, int n, + IndexType linearIndex, Offsets... offsets) { + op(n, a.data[offsets]...); +} +}; + +template + +C10_LAUNCH_BOUNDS_2(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM) + +__global__ void kernelPointwiseApply1(detail::TensorInfo a, + IndexType totalElements, const Op op) { + for (IndexType linearIndex = (blockIdx.x * blockDim.x + threadIdx.x) * step; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x * step) { + ApplyOp1::apply( + a, op, ::min(step, static_cast(totalElements - linearIndex)), linearIndex); + } +} + + +template +struct ApplyOp2 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, + detail::TensorInfo &b, + const Op &op, int64_t n, IndexType linearIndex, + Offsets... aOffsets, Offsets... bOffsets) { + // Convert `linearIndex` into an offset of `a` + const IndexType aOffset = static_cast(sizeof...(Offsets)) < n ? + detail::IndexToOffset::get(linearIndex, a) : 0; + + // Convert `linearIndex` into an offset of `b` + const IndexType bOffset = static_cast(sizeof...(Offsets)) < n ? + detail::IndexToOffset::get(linearIndex, b) : 0; + + ApplyOp2::apply( + a, b, op, n, linearIndex + 1, aOffsets..., aOffset, bOffsets..., bOffset + ); +} +}; + +// Specialize `step=1` case (i.e., `remaining_steps=0` and `len(Offsets)=1`). +// We don't need to pass in how many elements need to processed in this case. +template +struct ApplyOp2 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, + detail::TensorInfo &b, + const Op &op, int /*n*/, IndexType /*linearIndex*/, + Offset aOffset, Offset bOffset) { + op(a.data[aOffset], b.data[bOffset]); +} +}; + +template +struct ApplyOp2 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, + detail::TensorInfo &b, + const Op &op, int n, IndexType linearIndex, + Offsets... aOffsets, Offsets... 
bOffsets) { + op(n, a.data[aOffsets]..., b.data[bOffsets]...); +} +}; + +template + +C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm) + +__global__ void +kernelPointwiseApply2(detail::TensorInfo a, + detail::TensorInfo b, + IndexType totalElements, + const Op op) { + for (IndexType linearIndex = (blockIdx.x * blockDim.x + threadIdx.x) * step; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x * step) { + ApplyOp2::apply( + a, b, op, ::min(step, static_cast(totalElements - linearIndex)), + linearIndex); + } +} + +} // anonymous namespace + +template +inline bool Zoom_tensor_apply2(at::TensorBase a, + at::TensorBase b, + const Op op, + TensorArgType aType = TensorArgType::ReadWrite, + TensorArgType bType = TensorArgType::ReadOnly) { + TORCH_CHECK(a.device().is_privateuseone() && b.device().is_privateuseone(), + "Zoom_tensor_apply2: Expected tensors to have Zoom DeviceType, but got " + "tensors with type ", a.device().type(), " and ", b.device().type()); + int64_t totalElements = a.numel(); + + if (totalElements != b.numel()) { + return false; + } + + if (a.dim() > MAX_TENSORINFO_DIMS || + b.dim() > MAX_TENSORINFO_DIMS) { + return false; + } + + if (a.numel() == 0) { + // Empty tensor; do nothing + return true; + } + const dim3 block = getApplyBlock(max_threads_per_block); + + dim3 grid; + auto curDevice = c10::zoom::current_device(); + if (curDevice == -1) return false; + if (!getApplyGrid(totalElements, grid, curDevice, max_threads_per_block)) { + return false; + } + + /* + Expands readable/writable tensors whose indices may be "overlapped." + This ensures that each element of the tensor is operated on once and only + once. + */ + TensorBase oldA; + TensorBase oldB; + + if (aType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(a)) { + // Must perform in contiguous space + oldA = std::exchange(a, a.contiguous()); + } + if (bType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(b)) { + // Must perform in contiguous space + oldB = std::exchange(b, b.contiguous()); + } + + // It is possible that the tensor dimensions are able to be collapsed, + // and thus we can reduce the actual code complexity of the copy by + // exploiting this knowledge statically, since the div/mod is the + // most expensive part of the operation, more so than memory accesses. + // For instance, when copying a non-contiguous to a contiguous tensor + // (or vice versa), the contiguous tensor can be collapsed to one + // dimension, and the loop to translate the linear index to the array + // index can be similarly collapsed. That is what this unrolling is for. 
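Putting the pieces together, a caller might invoke this helper as follows; a and b are hypothetical float tensors on the zoom device with equal numel, and the device lambda assumes the usual extended-lambda support from the HIP compiler:

    bool ok = at::zoom::Zoom_tensor_apply2<float, float>(
        a, b,
        [] __device__ (float& a_val, const float& b_val) { a_val = b_val * b_val; });
    TORCH_CHECK(ok, "tensors were too large or shaped unsuitably for Zoom_tensor_apply2");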
+ +#define HANDLE_CASE(TYPE, A, B) \ + kernelPointwiseApply2 \ + <<>>( \ + aInfo, bInfo, static_cast(totalElements), op); \ + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + +#define HANDLE_B_CASE(TYPE, A, B) { \ + switch (B) { \ + case 1: \ + HANDLE_CASE(TYPE, A, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, A, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, A, -1); \ + break; \ + } \ +} + +#define HANDLE_A_CASE(TYPE, A, B) { \ + switch (A) { \ + case 1: \ + HANDLE_B_CASE(TYPE, 1, B); \ + break; \ + case 2: \ + HANDLE_B_CASE(TYPE, 2, B); \ + break; \ + default: \ + HANDLE_B_CASE(TYPE, -1, B); \ + break; \ + } \ +} + + if (detail::canUse32BitIndexMath(a) && + detail::canUse32BitIndexMath(b)) { + detail::TensorInfo aInfo = + detail::getTensorInfo(a); + + detail::TensorInfo bInfo = + detail::getTensorInfo(b); + rearrangeDims(&aInfo, &bInfo); + aInfo.collapseDims(); + bInfo.collapseDims(); + + HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims); + } else { + detail::TensorInfo aInfo = + detail::getTensorInfo(a); + + detail::TensorInfo bInfo = + detail::getTensorInfo(b); + rearrangeDims(&aInfo, &bInfo); + aInfo.collapseDims(); + bInfo.collapseDims(); + + /* + Only instantiates the all 1D special case and the fallback all nD case for + large (64-bit indexed) tensors to reduce compilation time. + */ + if (aInfo.dims == 1 && bInfo.dims == 1) { + HANDLE_CASE(uint64_t, 1, 1); + } else { + HANDLE_CASE(uint64_t, -1, -1); + } + } +#undef HANDLE_CASE +#undef HANDLE_B_CASE +#undef HANDLE_A_CASE + + if (oldA.defined()) { + at::native::copy_ignoring_overlaps(oldA, a); + } + + if (oldB.defined()) { + at::native::copy_ignoring_overlaps(oldB, b); + } + + return true; +} + +/* Provides default step = 1 to Zoom_tensor_apply2. */ +template +inline bool Zoom_tensor_apply2(const at::TensorBase &a, + const at::TensorBase &b, + const Op op, + TensorArgType aType = TensorArgType::ReadWrite, + TensorArgType bType = TensorArgType::ReadOnly) { + return Zoom_tensor_apply2(a, b, op, aType, bType); +} + +} // namespace at::zoom \ No newline at end of file diff --git a/aten/src/ATen/zoom/ZoomContext.cpp b/aten/src/ATen/zoom/ZoomContext.cpp new file mode 100644 index 00000000000000..3182fafed7493f --- /dev/null +++ b/aten/src/ATen/zoom/ZoomContext.cpp @@ -0,0 +1,69 @@ +#include +#include +#include + +// #include +#include +#include +#include + +namespace at::zoom { + +namespace { + +DeviceIndex num_gpus = -1; +c10::once_flag init_flag; +std::deque device_flags; +std::vector device_properties; + +void initZoomContextVectors() { + num_gpus = c10::zoom::device_count(); + device_flags.resize(num_gpus); + device_properties.resize(num_gpus); +} + +void initDeviceProperty(DeviceIndex device_index) { + hipDeviceProp_t device_prop; + C10_ZOOM_CHECK(hipGetDeviceProperties(&device_prop, device_index)); + device_properties[device_index] = device_prop; +} + +} // anonymous namespace + +// We need this function to force the linking against torch_cuda(_cpp) on Windows. +// If you need to modify this function, please specify a new function and apply +// the changes according to https://github.com/pytorch/pytorch/pull/34288. +// Related issue: https://github.com/pytorch/pytorch/issues/31611. 
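For reference, a short sketch of how launch heuristics typically consume the device-info helpers defined below; the 256-thread cap is an arbitrary illustrative choice:

    hipDeviceProp_t* prop = at::zoom::getCurrentDeviceProperties();
    int max_threads = std::min(256, prop->maxThreadsPerBlock);
    int warp = at::zoom::warp_size();                // 32 or 64 depending on the architecture
    int num_warps = at::ceil_div(max_threads, warp);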
+/* Device info */ +int warp_size() { + return getCurrentDeviceProperties()->warpSize; +} + +hipDeviceProp_t* getCurrentDeviceProperties() { + auto device = c10::zoom::current_device(); + return getDeviceProperties(device); +} + +hipDeviceProp_t* getDeviceProperties(c10::DeviceIndex device) { + c10::call_once(init_flag, initZoomContextVectors); + if (device == -1) device = c10::zoom::current_device(); + AT_ASSERT(device >= 0 && device < num_gpus, "device=", device, ", num_gpus=", num_gpus); + c10::call_once(device_flags[device], initDeviceProperty, device); + return &device_properties[device]; +} + +bool canDeviceAccessPeer(c10::DeviceIndex device, c10::DeviceIndex peer_device) { + c10::call_once(init_flag, initZoomContextVectors); + if (device == -1) device = c10::zoom::current_device(); + AT_ASSERT(device >= 0 && device < num_gpus, "device=", device, ", num_gpus=", num_gpus); + AT_ASSERT(peer_device >= 0 && peer_device < num_gpus, "peer_device=", peer_device, ", num_gpus=", num_gpus); + int can_access = 0; + C10_ZOOM_CHECK(hipDeviceCanAccessPeer(&can_access, device, peer_device)); + return can_access != 0; +} + +Allocator* getZoomDeviceAllocator() { + return c10::zoom::ZoomCachingAllocator::get(); +} + +} // namespace at::zoom \ No newline at end of file diff --git a/aten/src/ATen/zoom/ZoomContext.h b/aten/src/ATen/zoom/ZoomContext.h new file mode 100644 index 00000000000000..98a36bee8b4fd7 --- /dev/null +++ b/aten/src/ATen/zoom/ZoomContext.h @@ -0,0 +1,9 @@ +#pragma once + +#include + +// Preserved for BC, as many files depend on these includes +#include +#include +#include +#include \ No newline at end of file diff --git a/aten/src/ATen/zoom/ZoomContextLight.h b/aten/src/ATen/zoom/ZoomContextLight.h new file mode 100644 index 00000000000000..44a82879f05267 --- /dev/null +++ b/aten/src/ATen/zoom/ZoomContextLight.h @@ -0,0 +1,85 @@ +#pragma once +// Light-weight version of ZoomContext.h with fewer transitive includes +#define DISABLE_HIPBLASLT + +#include + +#include +#include +#include + +#include +#include +#include +#ifndef DISABLE_HIPBLASLT +#include +#include +#endif + +namespace c10 { +struct Allocator; +} + +namespace at::zoom { + +/* +A common CUDA interface for ATen. + +This interface is distinct from CUDAHooks, which defines an interface that links +to both CPU-only and CUDA builds. That interface is intended for runtime +dispatch and should be used from files that are included in both CPU-only and +CUDA builds. + +CUDAContext, on the other hand, should be preferred by files only included in +CUDA builds. It is intended to expose CUDA functionality in a consistent +manner. + +This means there is some overlap between the CUDAContext and CUDAHooks, but +the choice of which to use is simple: use CUDAContext when in a CUDA-only file, +use CUDAHooks otherwise. + +Note that CUDAContext simply defines an interface with no associated class. +It is expected that the modules whose functions compose this interface will +manage their own state. There is only a single CUDA context/state. +*/ + +/** + * DEPRECATED: use device_count() instead + */ +inline int64_t getNumGPUs() { + return c10::zoom::device_count(); +} + +/** + * CUDA is available if we compiled with CUDA, and there are one or more + * devices. If we compiled with CUDA but there is a driver problem, etc., + * this function will report CUDA is not available (rather than raise an error.) 
+ */ +inline bool is_available() { + return c10::zoom::device_count() > 0; +} + +TORCH_ZOOM_API hipDeviceProp_t* getCurrentDeviceProperties(); + +TORCH_ZOOM_API int warp_size(); + +TORCH_ZOOM_API hipDeviceProp_t* getDeviceProperties(c10::DeviceIndex device); + +TORCH_ZOOM_API bool canDeviceAccessPeer( + c10::DeviceIndex device, + c10::DeviceIndex peer_device); + +TORCH_ZOOM_API c10::Allocator* getZoomDeviceAllocator(); + +TORCH_ZOOM_API hipsparseHandle_t getCurrentHIPSparseHandle(); +TORCH_ZOOM_API hipblasHandle_t getCurrentHIPBlasHandle(); +#ifndef DISABLE_HIPBLASLT +TORCH_ZOOM_API hipblasLtHandle_t getCurrentHIPBlasLtHandle(); +#endif + + +#if defined(hipsolverVersionMajor) +TORCH_ZOOM_API hipsolverDnHandle_t getCurrentHIPSolverDnHandle(); +#endif + +} // namespace at::zoom \ No newline at end of file diff --git a/aten/src/ATen/zoom/ZoomDataType.h b/aten/src/ATen/zoom/ZoomDataType.h new file mode 100644 index 00000000000000..41186e419bea1e --- /dev/null +++ b/aten/src/ATen/zoom/ZoomDataType.h @@ -0,0 +1,97 @@ +#pragma once + +#include + +#include +#include +#include + +namespace at::zoom { + +template +hipDataType getHIPDataType() { + TORCH_INTERNAL_ASSERT(false, "Cannot convert type ", typeid(scalar_t).name(), " to hipDataType.") +} + +template<> inline hipDataType getHIPDataType() { + return HIP_R_16F; +} +template<> inline hipDataType getHIPDataType() { + return HIP_R_32F; +} +template<> inline hipDataType getHIPDataType() { + return HIP_R_64F; +} +template<> inline hipDataType getHIPDataType>() { + return HIP_C_16F; +} +template<> inline hipDataType getHIPDataType>() { + return HIP_C_32F; +} +template<> inline hipDataType getHIPDataType>() { + return HIP_C_64F; +} + +template<> inline hipDataType getHIPDataType() { + return HIP_R_8U; +} +template<> inline hipDataType getHIPDataType() { + return HIP_R_8I; +} +template<> inline hipDataType getHIPDataType() { + return HIP_R_32I; +} + +template<> inline hipDataType getHIPDataType() { + return HIP_R_16I; +} +template<> inline hipDataType getHIPDataType() { + return HIP_R_64I; +} +template<> inline hipDataType getHIPDataType() { + return HIP_R_16BF; +} + +inline hipDataType ScalarTypeToHIPDataType(const c10::ScalarType& scalar_type) { + switch (scalar_type) { + case c10::ScalarType::Byte: + return HIP_R_8U; + case c10::ScalarType::Char: + return HIP_R_8I; + case c10::ScalarType::Int: + return HIP_R_32I; + case c10::ScalarType::Half: + return HIP_R_16F; + case c10::ScalarType::Float: + return HIP_R_32F; + case c10::ScalarType::Double: + return HIP_R_64F; + case c10::ScalarType::ComplexHalf: + return HIP_C_16F; + case c10::ScalarType::ComplexFloat: + return HIP_C_32F; + case c10::ScalarType::ComplexDouble: + return HIP_C_64F; + case c10::ScalarType::Short: + return HIP_R_16I; + case c10::ScalarType::Long: + return HIP_R_64I; + case c10::ScalarType::BFloat16: + return HIP_R_16BF; +#if defined(HIP_NEW_TYPE_ENUMS) + case c10::ScalarType::Float8_e4m3fnuz: + return HIP_R_8F_E4M3_FNUZ; + case c10::ScalarType::Float8_e5m2fnuz: + return HIP_R_8F_E5M2_FNUZ; +#else + case c10::ScalarType::Float8_e4m3fnuz: + return static_cast(1000); + case c10::ScalarType::Float8_e5m2fnuz: + return static_cast(1001); +#endif + default: + TORCH_INTERNAL_ASSERT(false, "Cannot convert ScalarType ", scalar_type, " to hipDataType.") + } +} + +} // namespace at::zoom \ No newline at end of file diff --git a/aten/src/ATen/zoom/ZoomDevice.h b/aten/src/ATen/zoom/ZoomDevice.h new file mode 100644 index 00000000000000..e7ac4e781cba91 --- /dev/null +++ 
b/aten/src/ATen/zoom/ZoomDevice.h @@ -0,0 +1,17 @@ +#pragma once + +#include + +#include + +namespace at::zoom { + +inline Device getDeviceFromPtr(void* ptr) { + hipPointerAttribute_t attr{}; + + C10_ZOOM_CHECK(hipPointerGetAttributes(&attr, ptr)); + + return {c10::DeviceType::PrivateUse1, static_cast(attr.device)}; +} + +} // namespace at::zoom \ No newline at end of file diff --git a/aten/src/ATen/zoom/ZoomEvent.h b/aten/src/ATen/zoom/ZoomEvent.h new file mode 100644 index 00000000000000..dfb0557e6fba0e --- /dev/null +++ b/aten/src/ATen/zoom/ZoomEvent.h @@ -0,0 +1,213 @@ +#pragma once + +// #include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace at::zoom { + +/* +* CUDAEvents are movable not copyable wrappers around CUDA's events. +* +* CUDAEvents are constructed lazily when first recorded unless it is +* reconstructed from a cudaIpcEventHandle_t. The event has a device, and this +* device is acquired from the first recording stream. However, if reconstructed +* from a handle, the device should be explicitly specified; or if ipc_handle() is +* called before the event is ever recorded, it will use the current device. +* Later streams that record the event must match this device. +*/ +struct TORCH_ZOOM_API ZoomEvent { + // Constructors + // Default value for `flags` is specified below - it's cudaEventDisableTiming + ZoomEvent() noexcept = default; + ZoomEvent(unsigned int flags) noexcept : flags_{flags} {} + + ZoomEvent( + DeviceIndex device_index, const hipIpcEventHandle_t* handle) { + device_index_ = device_index; + c10::zoom::ZoomGuard guard(device_index_); + + C10_ZOOM_CHECK(hipIpcOpenEventHandle(&event_, *handle)); + is_created_ = true; + } + + // Note: event destruction done on creating device to avoid creating a + // CUDA context on other devices. + ~ZoomEvent() { + try { + if (is_created_) { + c10::zoom::ZoomGuard guard(device_index_); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_deletion(DeviceType::PrivateUse1, reinterpret_cast(event_)); + } + C10_ZOOM_CHECK(hipEventDestroy(event_)); + } + } catch (...) 
{ /* No throw */ } + } + + ZoomEvent(const ZoomEvent&) = delete; + ZoomEvent& operator=(const ZoomEvent&) = delete; + + ZoomEvent(ZoomEvent&& other) noexcept { moveHelper(std::move(other)); } + ZoomEvent& operator=(ZoomEvent&& other) noexcept { + if (this != &other) { + moveHelper(std::move(other)); + } + return *this; + } + + operator hipEvent_t() const { return event(); } + + // Less than operator (to allow use in sets) + friend bool operator<(const ZoomEvent& left, const ZoomEvent& right) { + return left.event_ < right.event_; + } + + optional device() const { + if (is_created_) { + return at::Device(DeviceType::PrivateUse1, device_index_); + } else { + return {}; + } + } + + bool isCreated() const { return is_created_; } + DeviceIndex device_index() const {return device_index_;} + hipEvent_t event() const { return event_; } + + // Note: hipEventQuery can be safely called from any device + bool query() const { + if (!is_created_) { + return true; + } + + hipError_t err = hipEventQuery(event_); + if (err == hipSuccess) { + return true; + } else if (err != hipErrorNotReady) { + C10_ZOOM_CHECK(err); + } else { + // ignore and clear the error if not ready + (void)hipGetLastError(); + } + + return false; + } + + void record() { record(c10::zoom::getCurrentZoomStream()); } + + void recordOnce(const c10::zoom::ZoomStream& stream) { + if (!was_recorded_) record(stream); + } + + // Note: hipEventRecord must be called on the same device as the event. + void record(const c10::zoom::ZoomStream& stream) { + if (!is_created_) { + createEvent(stream.device_index()); + } + + TORCH_CHECK(device_index_ == stream.device_index(), "Event device ", device_index_, + " does not match recording stream's device ", stream.device_index(), "."); + c10::zoom::ZoomGuard guard(device_index_); + C10_ZOOM_CHECK(hipEventRecord(event_, stream)); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_record(DeviceType::PrivateUse1, + reinterpret_cast(event_), + reinterpret_cast(stream.stream()) + ); + } + was_recorded_ = true; + } + + // Note: hipStreamWaitEvent must be called on the same device as the stream. + // The event has no actual GPU resources associated with it. + void block(const c10::zoom::ZoomStream& stream) { + if (is_created_) { + c10::zoom::ZoomGuard guard(stream.device_index()); + C10_ZOOM_CHECK(hipStreamWaitEvent(stream, event_, 0)); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_wait(DeviceType::PrivateUse1, + reinterpret_cast(event_), + reinterpret_cast(stream.stream()) + ); + } + } + } + + // Note: hipEventElapsedTime can be safely called from any device + float elapsed_time(const ZoomEvent& other) const { + TORCH_CHECK(is_created_ && other.isCreated(), + "Both events must be recorded before calculating elapsed time."); + float time_ms = 0; + // We do not strictly have to set the device index to the same as our event, + // but if we don't and the current device is not initialized, it will + // create a new hip context, which will consume a lot of memory. 
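As a usage sketch of this event wrapper, timing a region on the current zoom stream looks like the following; the event flags must allow timing, so the default hipEventDisableTiming is overridden, and the kernels being timed are elided:

    at::zoom::ZoomEvent start(hipEventDefault), stop(hipEventDefault);
    c10::zoom::ZoomStream stream = c10::zoom::getCurrentZoomStream();
    start.record(stream);
    // ... enqueue the kernels to be timed on `stream` ...
    stop.record(stream);
    stop.synchronize();                       // block until the recorded region has completed
    float ms = start.elapsed_time(stop);      // hipEventElapsedTime under the hood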
+ c10::zoom::ZoomGuard guard(device_index_); + // raise hipErrorNotReady if either event is recorded but not yet completed + C10_ZOOM_CHECK(hipEventElapsedTime(&time_ms, event_, other.event_)); + return time_ms; + } + + // Note: hipEventSynchronize can be safely called from any device + void synchronize() const { + if (is_created_) { + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_synchronization(DeviceType::PrivateUse1, reinterpret_cast(event_)); + } + C10_ZOOM_CHECK(hipEventSynchronize(event_)); + } + } + + // Note: hipIpcGetEventHandle must be called on the same device as the event + void ipc_handle(hipIpcEventHandle_t * handle) { + if (!is_created_) { + // this ZoomEvent object was initially constructed from flags but event_ + // is not created yet. + createEvent(c10::zoom::getCurrentZoomStream().device_index()); + } + c10::zoom::ZoomGuard guard(device_index_); + C10_ZOOM_CHECK(hipIpcGetEventHandle(handle, event_)); + } + +private: + unsigned int flags_ = hipEventDisableTiming; + bool is_created_ = false; + bool was_recorded_ = false; + DeviceIndex device_index_ = -1; + hipEvent_t event_{}; + + void createEvent(DeviceIndex device_index) { + device_index_ = device_index; + c10::zoom::ZoomGuard guard(device_index_); + C10_ZOOM_CHECK(hipEventCreateWithFlags(&event_, flags_)); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_creation(DeviceType::PrivateUse1, reinterpret_cast(event_)); + } + is_created_ = true; + } + + void moveHelper(ZoomEvent&& other) { + std::swap(flags_, other.flags_); + std::swap(is_created_, other.is_created_); + std::swap(was_recorded_, other.was_recorded_); + std::swap(device_index_, other.device_index_); + std::swap(event_, other.event_); + } +}; + +} // namespace at::zoom \ No newline at end of file diff --git a/aten/src/ATen/zoom/ZoomGeneratorImpl.cpp b/aten/src/ATen/zoom/ZoomGeneratorImpl.cpp new file mode 100644 index 00000000000000..d0b9a5a963db95 --- /dev/null +++ b/aten/src/ATen/zoom/ZoomGeneratorImpl.cpp @@ -0,0 +1,512 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace zoom::detail { + +namespace { + +// Ensures we only call cudaGetDeviceCount only once. +static c10::once_flag num_gpu_init_flag; + +// Total number of gpus in the system. +static int64_t num_gpus; + +// Ensures default_gens_zoom is initialized once. +static std::deque zoom_gens_init_flag; + +// Default, global CUDA generators, one per GPU. +static std::vector default_gens_zoom; + +/* + * Populates the global variables related to CUDA generators + * Warning: this function must only be called once! + */ +static void initZoomGenVector() { + num_gpus = c10::zoom::device_count(); + zoom_gens_init_flag.resize(num_gpus); + default_gens_zoom.resize(num_gpus); +} + +} // anonymous namespace + +/** + * PyTorch maintains a collection of default generators that get + * initialized once. The purpose of these default generators is to + * maintain a global running state of the pseudo random number generation, + * when a user does not explicitly mention any generator. + * getDefaultZoomGenerator gets the default generator for a particular + * cuda device. 
+ */ +const Generator& getDefaultZoomGenerator(DeviceIndex device_index) { + c10::call_once(num_gpu_init_flag, initZoomGenVector); + DeviceIndex idx = device_index; + if (idx == -1) { + idx = c10::zoom::current_device(); + } else { + TORCH_CHECK(idx >= 0 && idx < num_gpus); + } + c10::call_once(zoom_gens_init_flag[idx], [&] { + default_gens_zoom[idx] = make_generator(idx); + default_gens_zoom[idx].seed(); + }); + return default_gens_zoom[idx]; +} + +// register to PrivateUse1 +REGISTER_GENERATOR_PRIVATEUSE1(getDefaultZoomGenerator); + +/** + * Utility to create a ZoomGeneratorImpl. Returns a shared_ptr + */ +Generator createZoomGenerator(DeviceIndex device_index) { + c10::call_once(num_gpu_init_flag, initZoomGenVector); + DeviceIndex idx = device_index; + if (idx == -1) { + idx = c10::zoom::current_device(); + } + TORCH_CHECK(idx >= 0 && idx < num_gpus, "The device_index is invalid."); + auto gen = make_generator(idx); + auto zoom_gen = check_generator(gen); + zoom_gen->set_current_seed(default_rng_seed_val); + zoom_gen->set_philox_offset_per_thread(0); + return gen; +} + +} // namespace zoom::detail + +/** + * Creates a clone of this CUDA Generator State. + */ +c10::intrusive_ptr ZoomGeneratorState::clone() { + return make_intrusive( + seed_, philox_offset_per_thread_, offset_intragraph_); +} + +/** + * Function to increase the internal offset based on the specified increment. + */ +void ZoomGeneratorState::increase(uint64_t increment) { + // Rounds increment up to the nearest multiple of 4 to meet alignment + // requirements. + // see Note [Why enforce RNG offset % 4 == 0?] + increment = ((increment + 3) / 4) * 4; + // Handling different behaviors based on whether capturing is active. + if (at::zoom::currentStreamCaptureStatus() != at::zoom::CaptureStatus::None) { + // Ensures that the state is actually capturing. + TORCH_CHECK( + capturing_, + "Attempt to increase offset for a Zoom generator not in capture mode."); + // Ensures the offset is a multiple of 4 + // see Note [Why enforce RNG offset % 4 == 0?] + TORCH_INTERNAL_ASSERT( + offset_intragraph_ % 4 == 0, "RNG offset must be a multiple of 4."); + // Ensures the increment does not cause overflow. + TORCH_INTERNAL_ASSERT( + offset_intragraph_ <= std::numeric_limits::max() - increment, + "Increment causes overflow in the offset value."); + offset_intragraph_ += increment; + } else { + // Checks that the increment is expected outside graph capturing. + TORCH_CHECK( + !capturing_, + "Offset increment outside graph capture encountered unexpectedly."); + // Ensures the offset is a multiple of 4 + // see Note [Why enforce RNG offset % 4 == 0?] + TORCH_INTERNAL_ASSERT( + philox_offset_per_thread_ % 4 == 0, + "RNG offset must be a multiple of 4."); + philox_offset_per_thread_ += increment; + } +} + +/** + * Registers this state to a CUDA graph to manage within the graph. + */ +void ZoomGeneratorState::register_graph(zoom::HIPGraph* graph) { + // Ensures that the RNG state is not currently being captured. + at::zoom::assertNotCapturing( + "Cannot register the state during capturing stage."); + + // If this is the first graph to be registered, allocate memory for the seed + // and offset on the GPU. + if (registered_graphs_.empty()) { + auto options = at::TensorOptions().device(DeviceType::PrivateUse1).dtype(at::kLong); + seed_extragraph_ = at::empty({1}, options); + offset_extragraph_ = at::empty({1}, options); + } + + // Insert the graph into the set of registered graphs if it's not already + // registered. 
+ if (registered_graphs_.find(graph) == registered_graphs_.end()) { + registered_graphs_.insert(graph); + } +} + +/** + * Unregisters a CUDA graph from the RNG state. + */ +void ZoomGeneratorState::unregister_graph(zoom::HIPGraph* graph) { + // Ensures that the RNG state is not currently being captured. + at::zoom::assertNotCapturing( + "Cannot unregister the state during capturing stage."); + // Verify the graph was previously registered. + TORCH_CHECK( + registered_graphs_.find(graph) != registered_graphs_.end(), + "The graph should be registered to the state"); + + // Remove the graph from the set of registered graphs. + registered_graphs_.erase(graph); + + // If no more graphs are registered, deallocate the GPU memory for the seed + // and offset. + if (registered_graphs_.empty()) { + seed_extragraph_.reset(); + offset_extragraph_.reset(); + } +} + +/** + * Note [Explicit Registration of Generators to the CUDA Graph] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * Ideally, it would be more user-friendly if the state could be exchanged and generators + * could be registered with the CUDA graph implicitly. However, resetting GPU tensors during + * the capture stage causes these reset operations to be recorded within the CUDA graph. + * This behavior is undesirable because we do not want these tensors to be reset during + * the replay stage of the graph. + * + * As of now, there is no available method to perform a CUDA operation during the graph's + * recording phase without having that operation be included in the CUDA graph. + * This limitation necessitates explicit user action to register generators with the graph. + * By requiring users to manually register their generators, we can ensure that state resets + * (capture_prologue) only occur before the graph capture begins, thus avoiding unintended + * resets during the replay of the graph. See https://github.com/pytorch/pytorch/pull/114068. + */ + +/** + * Performs the prologue steps for capturing a CUDA graph state. + * This method is intended to reset graph-related state variables before capturing begins. + */ +void ZoomGeneratorState::capture_prologue() { + capturing_ = true; + offset_intragraph_ = 0; + seed_extragraph_.fill_(int64_t(seed_)); + offset_extragraph_.fill_(int64_t(0)); +} + +/** + * Ends the capturing phase and resets related variables, returning the whole + * graph increment. + */ +uint64_t ZoomGeneratorState::capture_epilogue() { + capturing_ = false; + return offset_intragraph_; +} + +/** + * Prepares the state for replay by setting initial state tensors and applying + * total increment. + */ +void ZoomGeneratorState::replay_prologue(uint64_t wholegraph_increment) { + // Ensures the generator is not in capturing mode. + at::zoom::assertNotCapturing( + "Cannot prepare for replay during capturing stage."); + seed_extragraph_.fill_(int64_t(seed_)); + offset_extragraph_.fill_(int64_t(philox_offset_per_thread_)); + // Applies the total increment achieved during previous captures to update the + // offset. + increase(wholegraph_increment); +} + +/** + * Note [Why enforce RNG offset % 4 == 0?] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * Curand philox does allow offsets that aren't a multiple of 4. + * But jit kernels don't use curand, they use a custom "Philox" class (see + * torch/csrc/jit/tensorexpr/cuda_random.h or + * torch/csrc/jit/codegen/cuda/runtime/random_numbers.cu). 
+ * The "Philox" constructor computes offset/4 (a uint64_t division) to locate its + * internal start in its virtual bitstream viewed as 128-bit chunks, then, when called + * in a thread, returns one 32-bit chunk at a time from that start in the bitstream. + * In other words, if the incoming offset is not a multiple of 4, each thread + * might repeat some previously-generated 32-bit values in the bitstream. See + * https://github.com/pytorch/pytorch/pull/50169. + */ + +/** + * ZoomGeneratorImpl class implementation + */ +ZoomGeneratorImpl::ZoomGeneratorImpl(DeviceIndex device_index) + : c10::GeneratorImpl{Device(DeviceType::PrivateUse1, device_index), + DispatchKeySet(c10::DispatchKey::PrivateUse1)} { + at::zoom::assertNotCapturing("Cannot construct a new ZoomGeneratorImpl"); + state_ = make_intrusive(); + no_reset_rnn_state_.clear(); +} + +ZoomGeneratorImpl::ZoomGeneratorImpl( + DeviceIndex device_index, + c10::intrusive_ptr state) + : c10:: + GeneratorImpl{Device(DeviceType::PrivateUse1, device_index), DispatchKeySet(c10::DispatchKey::PrivateUse1)}, + state_(std::move(state)) { + no_reset_rnn_state_.clear(); +} + +/** + * Sets the seed to be used by curandStatePhilox4_32_10 + * Resets the philox_offset_per_thread_ to 0 + * + * See Note [Acquire lock when using random generators] + */ +void ZoomGeneratorImpl::set_current_seed(uint64_t seed) { + at::zoom::assertNotCapturing( + "Cannot call ZoomGeneratorImpl::set_current_seed"); + state_->seed_ = seed; + state_->philox_offset_per_thread_ = 0; + no_reset_rnn_state_.clear(); +} + +/** + * Sets the offset to be used by curandStatePhilox4_32_10 + * + * See Note [Acquire lock when using random generators] + */ +void ZoomGeneratorImpl::set_offset(uint64_t offset) { + at::zoom::assertNotCapturing("Cannot call ZoomGeneratorImpl::set_offset"); + // the set function checks if the offset is a multiple of 4. + set_philox_offset_per_thread(offset); + no_reset_rnn_state_.clear(); +} + +/** + * Gets the current offset of ZoomGeneratorImpl. + */ +uint64_t ZoomGeneratorImpl::get_offset() const { + // Debatable if get_offset() should be allowed in captured regions. + // Conservatively disallow it for now. + at::zoom::assertNotCapturing("Cannot call ZoomGeneratorImpl::get_offset"); + return state_->philox_offset_per_thread_; +} + +/** + * Gets the current seed of ZoomGeneratorImpl. + */ +uint64_t ZoomGeneratorImpl::current_seed() const { + // Debatable if current_seed() should be allowed in captured regions. + // Conservatively disallow it for now. + at::zoom::assertNotCapturing("Cannot call ZoomGeneratorImpl::current_seed"); + return state_->seed_; +} + +/** + * Gets a nondeterministic random number from /dev/urandom or time, + * seeds the CPUGeneratorImpl with it and then returns that number. + * + * FIXME: You can move this function to Generator.cpp if the algorithm + * in getNonDeterministicRandom is unified for both CPU and CUDA + */ +uint64_t ZoomGeneratorImpl::seed() { + at::zoom::assertNotCapturing("Cannot call ZoomGeneratorImpl::seed"); + auto random = c10::detail::getNonDeterministicRandom(true); + this->set_current_seed(random); + return random; +} + +/** + * Gets the current internal state of ZoomGeneratorImpl. The internal + * state is returned as a CPU byte tensor. + */ +c10::intrusive_ptr ZoomGeneratorImpl::get_state() const { + // The RNG state comprises the seed, and an offset used for Philox. 
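+ // Byte layout of the returned CPU byte tensor (16 bytes total), mirroring the
+ // memcpy calls below:
+ //   bytes [0, 8)  -> seed          (uint64_t)
+ //   bytes [8, 16) -> philox offset (stored as int64_t)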
+ static const size_t seed_size = sizeof(uint64_t); + static const size_t offset_size = sizeof(int64_t); + static const size_t total_size = seed_size + offset_size; + + auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt); + auto rng_state = state_tensor.data_ptr(); + auto current_seed = this->current_seed(); + auto offset = static_cast(this->philox_offset_per_thread()); // Note that old THCGeneratorState had offset as std::atomic + memcpy(rng_state, ¤t_seed, seed_size); + memcpy(rng_state + seed_size, &offset, offset_size); + + return state_tensor.getIntrusivePtr(); +} + +/** + * Sets the internal state of ZoomGeneratorImpl. The new internal state + * must be a strided CPU byte tensor and have appropriate size. See + * comments of ZoomGeneratorImpl::state for information about the layout + * and size of the internal state. + */ +void ZoomGeneratorImpl::set_state(const c10::TensorImpl& new_state) { + at::zoom::assertNotCapturing( + "Please ensure to utilize the ZoomGeneratorImpl::set_state_index method during capturing."); + static const size_t seed_size = sizeof(uint64_t); + static const size_t offset_size = sizeof(int64_t); + static const size_t total_size = seed_size + offset_size; + + detail::check_rng_state(new_state); + + bool no_philox_seed = false; + auto new_state_size = new_state.numel(); + if (new_state_size == total_size - offset_size) { + no_philox_seed = true; + } else { + TORCH_CHECK(new_state_size == total_size, "RNG state is wrong size"); + } + + uint64_t input_seed = 0; + auto new_rng_state = new_state.data_dtype_initialized(); + memcpy(&input_seed, new_rng_state, seed_size); + this->set_current_seed(input_seed); + int64_t philox_offset = 0; + if (!no_philox_seed) { + memcpy(&philox_offset, new_rng_state + seed_size, offset_size); + } + this->set_philox_offset_per_thread(static_cast(philox_offset)); +} + +/** + * Sets the generator's current state to + * This function allows switching between different registered states of + * the generator. + */ +void ZoomGeneratorImpl::graphsafe_set_state( + const c10::intrusive_ptr& gen) { + c10::intrusive_ptr zoom_gen = + dynamic_intrusive_pointer_cast(gen); + TORCH_CHECK(zoom_gen, "Expected a Zoom Generator"); + state_ = zoom_gen->state_; +} + +/** + * Get the GeneratorImpl that point to current state_ + */ +c10::intrusive_ptr ZoomGeneratorImpl::graphsafe_get_state() + const { + auto gen = make_intrusive(device().index(), state_); + return gen; +} + +/** + * Sets the philox_offset_per_thread_ to be used by curandStatePhilox4_32_10 + * + * See Note [Acquire lock when using random generators] + */ +void ZoomGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) { + // see Note [Why enforce RNG offset % 4 == 0?] + TORCH_CHECK(offset % 4 == 0, "offset must be a multiple of 4"); + state_->philox_offset_per_thread_ = offset; +} + +/** + * Gets the current philox_offset_per_thread_ of ZoomGeneratorImpl. + */ +uint64_t ZoomGeneratorImpl::philox_offset_per_thread() const { + return state_->philox_offset_per_thread_; +} + +/** + * Registers this state to a CUDA graph to manage within the graph. + */ +void ZoomGeneratorImpl::register_graph(zoom::HIPGraph* graph) { + graph->register_generator_state(state_); + state_->register_graph(graph); +} + +/** + * Unregisters a CUDA graph from the RNG state. 
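+ * Delegates to the underlying ZoomGeneratorState, which removes the graph from
+ * its registered set and frees the extragraph seed/offset tensors once no
+ * graphs remain registered.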
+ */ +void ZoomGeneratorImpl::unregister_graph(zoom::HIPGraph* graph) { + state_->unregister_graph(graph); +} + +/** + * Gets the seed and philox offset value to be used in + * curandStatePhilox4_32_10, in an opaque PhiloxHIPState that's safe + * and can be used non-divergently in callers whether CUDA graph + * capture is underway or not. See + * Note [CUDA Graph-safe RNG states] + * + * Each kernel using philox has to sensibly increment offset + * for future users of philox. So it gets the "old" value for + * itself (before add), and tells subsequent users which offset + * they should use, since only the kernel knows how many randoms + * it intends to generate. + * + * Increment should be at least the number of curand() random numbers used in + * each thread. It is the user's responsibility to make sure the increment + * for philox is never smaller than the number of curand() calls. Increment + * value > the number of curand() calls won't harm but anything less would mean + * that you would be reusing random values from previous calls. + * + * See Note [Acquire lock when using random generators] + */ +PhiloxHIPState ZoomGeneratorImpl::philox_hip_state(uint64_t increment) { + if (at::zoom::currentStreamCaptureStatus() != at::zoom::CaptureStatus::None) { + uint32_t offset = state_->offset_intragraph_; + state_->increase(increment); + return PhiloxHIPState( + state_->seed_extragraph_.data_ptr(), + state_->offset_extragraph_.data_ptr(), + offset); + } else { + uint64_t offset = state_->philox_offset_per_thread_; + state_->increase(increment); + return PhiloxHIPState(state_->seed_, offset); + } +} + +/** + * Temporarily accommodates call sites that use philox_engine_inputs. + * Allows incremental refactor of call sites to use philox_hip_state. + */ +std::pair ZoomGeneratorImpl::philox_engine_inputs( + uint64_t increment) { + at::zoom::assertNotCapturing( + "Refactor this op to use ZoomGeneratorImpl::philox_hip_state. Cannot call ZoomGeneratorImpl::philox_engine_inputs"); + uint64_t offset = state_->philox_offset_per_thread_; + state_->increase(increment); + return std::make_pair(state_->seed_, offset); +} + +/* + * Gets the DeviceType of ZoomGeneratorImpl. + * Used for type checking during run time. + */ +DeviceType ZoomGeneratorImpl::device_type() { + return DeviceType::PrivateUse1; +} + +/** + * Public clone method implementation + * + * See Note [Acquire lock when using random generators] + */ +std::shared_ptr ZoomGeneratorImpl::clone() const { + return std::shared_ptr(this->clone_impl()); +} + +/** + * Private clone method implementation + * + * See Note [Acquire lock when using random generators] + */ +ZoomGeneratorImpl* ZoomGeneratorImpl::clone_impl() const { + at::zoom::assertNotCapturing("Cannot call ZoomGeneratorImpl::clone_impl"); + auto gen = new ZoomGeneratorImpl(this->device().index(), state_->clone()); + return gen; +} + +} // namespace at \ No newline at end of file diff --git a/aten/src/ATen/zoom/ZoomGeneratorImpl.h b/aten/src/ATen/zoom/ZoomGeneratorImpl.h new file mode 100644 index 00000000000000..106432ec428fa2 --- /dev/null +++ b/aten/src/ATen/zoom/ZoomGeneratorImpl.h @@ -0,0 +1,181 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +namespace at { + +namespace zoom { +struct HIPGraph; +} + +/** + * Note [CUDA Graph-safe RNG states] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * Strategy: + * ~~~~~~~~~ + * (It helps to look at + * cuda/detail/PhiloxCudaStateRaw.cuh and + * cuda/detail/UnpackRaw.cuh + * while you read this.) 
+ * + * A CUDA graph containing multiple RNG ops behaves like a + * single giant kernel from the perspective of ops external + * to the graph. During graph capture, logic in ZoomGeneratorImpl + * records the total of all offset increments that occur in the + * graphed region, and records the final total as the offset for + * the entire graph. + * + * When the graph reruns, the logic that reruns it + * increments this device's CUDA generator's offset + * by that total. + * + * Meanwhile, within the graph, at capture time, instead of + * populating PhiloxCudaStates with the uint64_t offset pulled + * directly from the global state, PhiloxHIPState uses a pointer + * to a one-element stream-local int64_t device tensor + * holding an initial offset value, and a uint64_t holding an + * intra-graph offset. (The intra-graph offset starts from zero + * when capture begins.) In each consumer kernel, + * at::zoom::philox::unpack computes the offset to use for this kernel + * as intra-graph offset + *initial offset. + * + * When the graph reruns, the logic that reruns it first + * fill_s the initial offset tensor with this device's + * CUDA generator's current offset. + * + * The control flow above ensures graphed execution is bitwise + * identical to eager execution as long as RNG ops are enqueued + * from a single thread, even if RNG ops and graphs containing + * RNG ops are enqueued and run simultaneously on multiple streams. + * + * Usage: + * ~~~~~~ + * PhiloxHIPState in this file, and unpack() in + * cuda/CUDAGraphsUtils.cuh allow non-divergent use of + * ZoomGeneratorImpl whether graph capture is underway or not. + * + * Each PhiloxHIPState instance should be used for one and only one + * consumer kernel. + * + * Example (see e.g. native/cuda/Dropout.cu): + * + * #include + * #include + * + * __global__ void kernel(..., PhiloxHIPState philox_args) { + * auto seeds = at::zoom::philox::unpack(philox_args); + * IndexType idx = blockIdx.x * blockDim.x + threadIdx.x; + * curandStatePhilox4_32_10_t state; + * curand_init(std::get<0>(seeds), // seed + * idx, // per-thread subsequence + * std::get<1>(seeds), // offset in subsequence + * &state); + * ... + * } + * + * host_caller(...) { + * PhiloxHIPState rng_engine_inputs; + * { + * // See Note [Acquire lock when using random generators] + * std::lock_guard lock(gen->mutex_); + * + * // gen could be HostState or DevState here! No divergent code needed! 
+ * rng_engine_inputs = gen->philox_hip_state(offset_increment); + * } + * kernel<<<...>>>(..., rng_engine_inputs); + * } + * + */ + +struct ZoomGeneratorState : public c10::intrusive_ptr_target { + uint64_t seed_; + uint64_t philox_offset_per_thread_; + uint32_t offset_intragraph_; + bool capturing_{}; + std::unordered_set registered_graphs_; + at::TensorBase seed_extragraph_{}; + at::TensorBase offset_extragraph_{}; + + ZoomGeneratorState( + uint64_t seed = default_rng_seed_val, + uint64_t philox_offset_per_thread = 0, + uint32_t offset_intragraph = 0) + : seed_(seed), + philox_offset_per_thread_(philox_offset_per_thread), + offset_intragraph_(offset_intragraph) {} + + void increase(uint64_t increment); + + void register_graph(zoom::HIPGraph* graph); + void unregister_graph(zoom::HIPGraph* graph); + + void capture_prologue(); + // capture_epilogue returns the wholegraph_increment + uint64_t capture_epilogue(); + void replay_prologue(uint64_t wholegraph_increment); + c10::intrusive_ptr clone(); +}; + +struct TORCH_ZOOM_API ZoomGeneratorImpl : public c10::GeneratorImpl { + // Constructors + ZoomGeneratorImpl(DeviceIndex device_index = -1); + ZoomGeneratorImpl( + DeviceIndex device_index, + c10::intrusive_ptr state_); + ~ZoomGeneratorImpl() override = default; + + // ZoomGeneratorImpl methods + std::shared_ptr clone() const; + void set_current_seed(uint64_t seed) override; + void set_offset(uint64_t offset) override; + uint64_t get_offset() const override; + uint64_t current_seed() const override; + uint64_t seed() override; + void set_state(const c10::TensorImpl& new_state) override; + c10::intrusive_ptr get_state() const override; + void graphsafe_set_state( + const c10::intrusive_ptr& state) override; + c10::intrusive_ptr graphsafe_get_state() const override; + + void set_philox_offset_per_thread(uint64_t offset); + uint64_t philox_offset_per_thread() const; + + void register_graph(zoom::HIPGraph* graph); + void unregister_graph(zoom::HIPGraph* graph); + + // Generates a PhiloxHIPState with a specified increment, and increment + // current state + PhiloxHIPState philox_hip_state(uint64_t increment); + + bool reset_rnn_state() { + return !no_reset_rnn_state_.test_and_set(); + } + + // Temporarily accommodates call sites that use philox_engine_inputs. + // Allows incremental refactor of call sites to use philox_hip_state. + std::pair philox_engine_inputs(uint64_t increment); + + static c10::DeviceType device_type(); + + private: + ZoomGeneratorImpl* clone_impl() const override; + + c10::intrusive_ptr state_; + std::atomic_flag no_reset_rnn_state_; +}; + +namespace zoom::detail { + +TORCH_ZOOM_API const Generator& getDefaultZoomGenerator( + DeviceIndex device_index = -1); +TORCH_ZOOM_API Generator createZoomGenerator(DeviceIndex device_index = -1); + +} // namespace zoom::detail +} // namespace at \ No newline at end of file diff --git a/aten/src/ATen/zoom/cub-RadixSortKeys.cu b/aten/src/ATen/zoom/cub-RadixSortKeys.cu new file mode 100644 index 00000000000000..a18326a1daacee --- /dev/null +++ b/aten/src/ATen/zoom/cub-RadixSortKeys.cu @@ -0,0 +1,59 @@ +// !!! This is a file automatically generated by hipify!!! 
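+// This translation unit holds the explicit instantiations of radix_sort_keys
+// declared in cub.h. A call-site sketch (hypothetical caller code; the device
+// pointers and element count are assumed to exist already):
+//   at::zoom::hipcub::radix_sort_keys<int64_t>(
+//       keys_in, keys_out, n,
+//       /*descending=*/false, /*begin_bit=*/0, /*end_bit=*/sizeof(int64_t) * 8);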
+#define TORCH_ASSERT_NO_OPERATORS +#include +#include + +namespace at::zoom::hipcub { + +template +void radix_sort_keys( + const key_t* keys_in, + key_t* keys_out, + int64_t n, + bool descending, + int64_t begin_bit, + int64_t end_bit) { + TORCH_CHECK( + n <= std::numeric_limits::max(), + "cub sort does not support sorting more than INT_MAX elements"); + using key_t_ = typename detail::hip_type::type; + + const key_t_* keys_in_ = reinterpret_cast(keys_in); + key_t_* keys_out_ = reinterpret_cast(keys_out); + + if (descending) { + HIPCUB_WRAPPER( + NO_ROCM(at_zoom_detail)::hipcub::DeviceRadixSort::SortKeysDescending, + keys_in_, + keys_out_, + n, + begin_bit, + end_bit, + c10::zoom::getCurrentZoomStream()); + } else { + HIPCUB_WRAPPER( + NO_ROCM(at_zoom_detail)::hipcub::DeviceRadixSort::SortKeys, + keys_in_, + keys_out_, + n, + begin_bit, + end_bit, + c10::zoom::getCurrentZoomStream()); + } +} + +#define AT_INSTATIATE_CUB_TEMPLATES(scalar_t, ScalarType) \ + template void radix_sort_keys( \ + const scalar_t* keys_in, \ + scalar_t* keys_out, \ + int64_t n, \ + bool descending, \ + int64_t begin_bit, \ + int64_t end_bit); + +AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, AT_INSTATIATE_CUB_TEMPLATES) +AT_INSTATIATE_CUB_TEMPLATES(uint16_t, UInt16) +AT_INSTATIATE_CUB_TEMPLATES(uint32_t, UInt32) +AT_INSTATIATE_CUB_TEMPLATES(uint64_t, UInt64) + +} // namespace at::zoom::hipcub diff --git a/aten/src/ATen/zoom/cub-RadixSortPairs.cu b/aten/src/ATen/zoom/cub-RadixSortPairs.cu new file mode 100644 index 00000000000000..ef81eb365f1c9b --- /dev/null +++ b/aten/src/ATen/zoom/cub-RadixSortPairs.cu @@ -0,0 +1,86 @@ +// !!! This is a file automatically generated by hipify!!! +#define TORCH_ASSERT_NO_OPERATORS +#include +#include + +namespace at::zoom::hipcub::detail { + +template +void radix_sort_pairs_impl( + const key_t* keys_in, + key_t* keys_out, + const OpaqueType* values_in, + OpaqueType* values_out, + int64_t n, + bool descending, + int64_t begin_bit, + int64_t end_bit) { + TORCH_CHECK( + n <= std::numeric_limits::max(), + "cub sort does not support sorting more than INT_MAX elements"); + using key_t_ = typename detail::hip_type::type; + + auto allocator = c10::zoom::ZoomCachingAllocator::get(); + c10::DataPtr keys_out_owner; + + if (keys_out == nullptr) { + keys_out_owner = allocator->allocate(n * sizeof(key_t)); + keys_out = reinterpret_cast(keys_out_owner.get()); + } + + const key_t_* keys_in_ = reinterpret_cast(keys_in); + key_t_* keys_out_ = reinterpret_cast(keys_out); + + if (descending) { + HIPCUB_WRAPPER( + NO_ROCM(at_zoom_detail)::hipcub::DeviceRadixSort::SortPairsDescending, + keys_in_, + keys_out_, + values_in, + values_out, + n, + begin_bit, + end_bit, + c10::zoom::getCurrentZoomStream()); + } else { + HIPCUB_WRAPPER( + NO_ROCM(at_zoom_detail)::hipcub::DeviceRadixSort::SortPairs, + keys_in_, + keys_out_, + values_in, + values_out, + n, + begin_bit, + end_bit, + c10::zoom::getCurrentZoomStream()); + } +} + +#define AT_INSTANTIATE_SORT_PAIRS(key_t, value_size) \ + template void radix_sort_pairs_impl( \ + const key_t* keys_in, \ + key_t* keys_out, \ + const OpaqueType* values_in, \ + OpaqueType* values_out, \ + int64_t n, \ + bool descending, \ + int64_t begin_bit, \ + int64_t end_bit); + +AT_INSTANTIATE_SORT_PAIRS(int32_t, 1) +AT_INSTANTIATE_SORT_PAIRS(int32_t, 2) +AT_INSTANTIATE_SORT_PAIRS(int32_t, 4) +AT_INSTANTIATE_SORT_PAIRS(int64_t, 1) +AT_INSTANTIATE_SORT_PAIRS(int64_t, 2) +AT_INSTANTIATE_SORT_PAIRS(int64_t, 4) + +#define AT_INSTANTIATE_SORT_PAIRS_8(scalar_t, ScalarType) \ + 
AT_INSTANTIATE_SORT_PAIRS(scalar_t, 8) + +AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, AT_INSTANTIATE_SORT_PAIRS_8) +AT_INSTANTIATE_SORT_PAIRS(uint16_t, 8) +AT_INSTANTIATE_SORT_PAIRS(uint32_t, 8) +AT_INSTANTIATE_SORT_PAIRS(uint64_t, 8) +AT_INSTANTIATE_SORT_PAIRS(c10::BFloat16, 8) + +} // namespace at::zoom::hipcub::detail diff --git a/aten/src/ATen/zoom/cub.cu b/aten/src/ATen/zoom/cub.cu new file mode 100644 index 00000000000000..f00caf3675f20a --- /dev/null +++ b/aten/src/ATen/zoom/cub.cu @@ -0,0 +1,51 @@ +// !!! This is a file automatically generated by hipify!!! +#define TORCH_ASSERT_NO_OPERATORS +#include +#include + +namespace at::zoom::hipcub { + +namespace { +template +struct SumOp { + __device__ scalar_t operator () (scalar_t a, scalar_t b) const { + return a + b; + } +}; +} + +template +void inclusive_sum_truncating(const input_t *input, output_t *output, int64_t num_items) { + using NO_ROCM(at_zoom_detail)::hipcub::Sum; + inclusive_scan(input, output, Sum{}, num_items); +} + +template void inclusive_sum_truncating(const int32_t *input, int32_t *output, int64_t num_items); +template void inclusive_sum_truncating(const int64_t *input, int64_t *output, int64_t num_items); +template void inclusive_sum_truncating(const int32_t *input, int64_t *output, int64_t num_items); + +template +void exclusive_sum_in_common_type(const input_t *input, output_t *output, int64_t num_items) { + using scalar_t = std::common_type_t; + exclusive_scan(input, output, SumOp{}, scalar_t(0), num_items); +} + +template void exclusive_sum_in_common_type(const int32_t *input, int32_t *output, int64_t num_items); +template void exclusive_sum_in_common_type(const int64_t *input, int64_t *output, int64_t num_items); + +namespace { +struct CountMaskOp { + __device__ int64_t operator() (const uint8_t &x) const { + return x != 0; + } +}; +} + +void mask_exclusive_sum(const uint8_t *mask, int64_t *output_idx, int64_t n) { + CountMaskOp op{}; + auto iter = NO_ROCM(at_zoom_detail)::hipcub::TransformInputIterator< + bool, decltype(op), decltype(mask)>(mask, op); + exclusive_scan(iter, output_idx, SumOp{}, int64_t{0}, n); +} + +} // namespace at::zoom::hipcub diff --git a/aten/src/ATen/zoom/cub.cuh b/aten/src/ATen/zoom/cub.cuh new file mode 100644 index 00000000000000..331f98301eca4e --- /dev/null +++ b/aten/src/ATen/zoom/cub.cuh @@ -0,0 +1,284 @@ +// !!! This is a file automatically generated by hipify!!! +#include +#pragma once +#include + +#include +#include +#include +#include + +#include + +#if USE_GLOBAL_CUB_WRAPPED_NAMESPACE() + +#include + +#else + +// include cub in a safe manner, see: +// https://github.com/pytorch/pytorch/pull/55292 +#undef CUB_NS_POSTFIX //undef to avoid redefinition warnings +#undef CUB_NS_PREFIX +#undef CUB_NS_QUALIFIER +#define CUB_NS_PREFIX namespace at_zoom_detail { +#define CUB_NS_POSTFIX } +#define CUB_NS_QUALIFIER ::at_zoom_detail::hipcub +#include +#undef CUB_NS_POSTFIX +#undef CUB_NS_PREFIX +#undef CUB_NS_QUALIFIER + +#endif + +#include +#include +#include + +// handle the temporary storage and 'twice' calls for cub API +#define HIPCUB_WRAPPER(func, ...) 
do { \ + size_t temp_storage_bytes = 0; \ + func(nullptr, temp_storage_bytes, __VA_ARGS__); \ + auto& caching_allocator = *::c10::zoom::ZoomCachingAllocator::get(); \ + auto temp_storage = caching_allocator.allocate(temp_storage_bytes); \ + func(temp_storage.get(), temp_storage_bytes, __VA_ARGS__); \ + C10_ZOOM_CHECK(hipGetLastError()); \ +} while (false) + +#define NO_ROCM(x) +#define ROCM_HIPCUB(x) ::hipcub + + +// backport https://github.com/NVIDIA/cub/pull/306 for c10::BFloat16 + +template <> +struct ROCM_HIPCUB(cub)::FpLimits +{ + static __host__ __device__ __forceinline__ c10::BFloat16 Max() { + unsigned short max_word = 0x7F7F; + return reinterpret_cast(max_word); + } + + static __host__ __device__ __forceinline__ c10::BFloat16 Lowest() { + unsigned short lowest_word = 0xFF7F; + return reinterpret_cast(lowest_word); + } +}; + +template <> +struct ROCM_HIPCUB(cub)::NumericTraits: + ROCM_HIPCUB(cub)::BaseTraits {}; + + + +namespace at::zoom::hipcub { + +namespace detail { + +template +struct hip_type { + using type = T; +}; +template<> +struct hip_type { + using type = __half; +}; + +template<> +struct hip_type { + using type = hip_bfloat16; +}; + + +} // namespace detail + +template +inline void segmented_sort_pairs( + const key_t *keys_in, key_t *keys_out, + const value_t *values_in, value_t *values_out, + int64_t num_elements, int64_t num_segments, + OffsetIteratorT begin_offsets, OffsetIteratorT end_offsets, + bool descending=false, int64_t begin_bit=0, int64_t end_bit=sizeof(key_t)*8 +) { + TORCH_CHECK(num_elements <= std::numeric_limits::max(), + "cub sort does not support sorting more than INT_MAX elements"); + TORCH_CHECK(num_segments <= std::numeric_limits::max(), + "cub sort does not support sorting more than INT_MAX elements"); + using key_t_ = typename detail::hip_type::type; + + auto allocator = c10::zoom::ZoomCachingAllocator::get(); + c10::DataPtr keys_out_owner; + + if (keys_out == nullptr) { + keys_out_owner = allocator->allocate(num_elements * sizeof(key_t)); + keys_out = reinterpret_cast(keys_out_owner.get()); + } + + const key_t_ *keys_in_ = reinterpret_cast(keys_in); + key_t_ *keys_out_ = reinterpret_cast(keys_out); + + if (descending) { + HIPCUB_WRAPPER(NO_ROCM(at_zoom_detail)::hipcub::DeviceSegmentedRadixSort::SortPairsDescending, + keys_in_, keys_out_, values_in, values_out, + num_elements, num_segments, begin_offsets, end_offsets, + begin_bit, end_bit, c10::zoom::getCurrentZoomStream()); + } else { + HIPCUB_WRAPPER(NO_ROCM(at_zoom_detail)::hipcub::DeviceSegmentedRadixSort::SortPairs, + keys_in_, keys_out_, values_in, values_out, + num_elements, num_segments, begin_offsets, end_offsets, + begin_bit, end_bit, c10::zoom::getCurrentZoomStream()); + } +} + +#if CUB_SUPPORTS_UNIQUE_BY_KEY() +template +inline void unique_by_key( + KeysInputIteratorT keys_in, ValuesInputIteratorT values_in, + KeysOutputIteratorT keys_out, ValuesOutputIteratorT values_out, + NumSelectedIteratorT num_selected, int64_t num_input_items) +{ + // TODO: use thrust::discard_iterator to handle null keys_out when https://github.com/NVIDIA/cub/issues/406 is fixed. 
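+ // Until that is available: DeviceSelect::UniqueByKey always needs a real
+ // output range for the keys, so when the caller passes keys_out == nullptr we
+ // allocate a scratch key buffer from the caching allocator and discard it
+ // once the call returns.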
+ constexpr bool null_keys_out = std::is_same::value; + using KeyT = typename std::iterator_traits::value_type; + using RealKeysOutputIteratorT = typename std::conditional::type; + RealKeysOutputIteratorT keys_out_; + auto allocator = c10::zoom::ZoomCachingAllocator::get(); + c10::DataPtr keys_out_owner; + if constexpr (null_keys_out) { + keys_out_owner = allocator->allocate(num_input_items * sizeof(KeyT)); + keys_out_ = static_cast(keys_out_owner.get()); + } else { + keys_out_ = keys_out; + } + HIPCUB_WRAPPER(NO_ROCM(at_zoom_detail)::hipcub::DeviceSelect::UniqueByKey, + keys_in, values_in, keys_out_, values_out, num_selected, num_input_items, c10::zoom::getCurrentZoomStream()); +} +#endif + +namespace impl { + +template +C10_LAUNCH_BOUNDS_1(1) +__global__ void transform_vals(InputIteratorT1 a, InputIteratorT2 b, OutputIteratorT out, ScanOpT scan_op){ + // NOTE: out here not the final scan output, but an intermediate of the accumulation type. + using acc_t = typename std::iterator_traits::value_type; + *out = scan_op(static_cast(*a), static_cast(*b)); +} + +#if !CUB_SUPPORTS_FUTURE_VALUE() +template +struct chained_iterator { + using iterator_category = std::random_access_iterator_tag; + using difference_type = std::ptrdiff_t; + using value_type = ValueT; + using pointer = ValueT*; + using reference = ValueT&; + + InputIteratorT iter; + ValueT *first; + difference_type offset = 0; + + __device__ ValueT operator[](difference_type i) { + i += offset; + if (i == 0) { + return *first; + } else { + return ValueT(iter[i - 1]); + } + } + __device__ chained_iterator operator+(difference_type i) { + return chained_iterator{iter, first, i}; + } + __device__ ValueT operator*() { + return (*this)[0]; + } +}; +#endif + +// even though cub is supposed to support tensors with int_max elements, in reality it doesn't, +// so split at int_max/2 +constexpr int max_cub_size = std::numeric_limits::max() / 2 + 1; // 2**30 +} + +// non synchronizing cub call +// even though cub is supposed to support tensors with int_max elements, in reality it doesn't, +// so split at int_max/2 +template +inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT scan_op, int64_t num_items) { + //For ROCm, use hipCUB chained iterators + HIPCUB_WRAPPER(NO_ROCM(detail)::hipcub::DeviceScan::InclusiveScan, + input, + output, + scan_op, + num_items, + c10::zoom::getCurrentZoomStream()); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); +} + +template +inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT scan_op, InitValueT init_value, int64_t num_items) { + //For ROCm, use hipCUB chained iterators + HIPCUB_WRAPPER(NO_ROCM(detail)::hipcub::DeviceScan::ExclusiveScan, + input, + output, + scan_op, + init_value, + num_items, + c10::zoom::getCurrentZoomStream()); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + +} + +#if CUB_SUPPORTS_SCAN_BY_KEY() + +template +inline void inclusive_sum_by_key(KeysInputIteratorT keys, ValuesInputIteratorT input, ValuesOutputIteratorT output, int64_t num_items) { + TORCH_CHECK(num_items <= std::numeric_limits::max(), + "cub InclusiveSumByKey does not support more than INT_MAX elements"); + HIPCUB_WRAPPER(at_zoom_detail::hipcub::DeviceScan::InclusiveSumByKey, + keys, input, output, num_items, at_zoom_detail::hipcub::Equality(), c10::zoom::getCurrentZoomStream()); +} + +template +inline void inclusive_scan_by_key(KeysInputIteratorT keys, ValuesInputIteratorT input, ValuesOutputIteratorT output, ScanOpT scan_op, int64_t num_items) { + TORCH_CHECK(num_items <= std::numeric_limits::max(), + 
"cub InclusiveSumByKey does not support more than INT_MAX elements"); + HIPCUB_WRAPPER(at_zoom_detail::hipcub::DeviceScan::InclusiveScanByKey, + keys, input, output, scan_op, num_items, at_zoom_detail::hipcub::Equality(), c10::zoom::getCurrentZoomStream()); +} + +#endif + +template +void unique(InputIteratorT input, OutputIteratorT output, + NumSelectedIteratorT num_selected_out, int64_t num_items) { + TORCH_CHECK(num_items <= std::numeric_limits::max(), + "cub unique does not support more than INT_MAX elements"); + HIPCUB_WRAPPER(NO_ROCM(at_zoom_detail)::hipcub::DeviceSelect::Unique, + input, output, num_selected_out, num_items, c10::zoom::getCurrentZoomStream()); +} + +template +void run_length_encode(InputIteratorT input, OutputIteratorT output, CountsOutputIteratorT counts_out, + LengthOutputIteratorT length_out, int64_t num_items) { + TORCH_CHECK(num_items <= std::numeric_limits::max(), + "cub run_length_encode does not support more than INT_MAX elements"); + HIPCUB_WRAPPER( + NO_ROCM(at_zoom_detail)::hipcub::DeviceRunLengthEncode::Encode, + input, output, counts_out, length_out, num_items, + c10::zoom::getCurrentZoomStream()); +} + +template +void reduce(InputIteratorT input, OutputIteratorT output, int64_t num_items, ReductionOpT op, T init) { + TORCH_CHECK(num_items <= std::numeric_limits::max(), + "cub reduce does not support more than INT_MAX elements"); + HIPCUB_WRAPPER( + NO_ROCM(at_zoom_detail)::hipcub::DeviceReduce::Reduce, + input, output, num_items, op, init, + c10::zoom::getCurrentZoomStream()); + +} + +} // namespace at::zoom::hipcub diff --git a/aten/src/ATen/zoom/cub.h b/aten/src/ATen/zoom/cub.h new file mode 100644 index 00000000000000..c38b12526cfc6a --- /dev/null +++ b/aten/src/ATen/zoom/cub.h @@ -0,0 +1,88 @@ +// !!! This is a file automatically generated by hipify!!! +#pragma once +#include +#include +#include + +// NOTE: These templates are intentionally not defined in this header, +// which aviods re-compiling them for each translation unit. If you get +// a link error, you need to add an explicit instantiation for your +// types in cub.cu + +namespace at::zoom::hipcub { + +inline int get_num_bits(uint64_t max_key) { + int num_bits = 1; + while (max_key > 1) { + max_key >>= 1; + num_bits++; + } + return num_bits; +} + +namespace detail { + +// radix_sort_pairs doesn't interact with value_t other than to copy +// the data, so we can save template instantiations by reinterpreting +// it as an opaque type. +template struct alignas(N) OpaqueType { char data[N]; }; + +template +void radix_sort_pairs_impl( + const key_t *keys_in, key_t *keys_out, + const OpaqueType *values_in, OpaqueType *values_out, + int64_t n, bool descending, int64_t begin_bit, int64_t end_bit); + +} // namespace detail + +template +void radix_sort_pairs( + const key_t *keys_in, key_t *keys_out, + const value_t *values_in, value_t *values_out, + int64_t n, bool descending=false, int64_t begin_bit=0, int64_t end_bit=sizeof(key_t)*8) { + static_assert(std::is_trivially_copyable::value || + AT_ROCM_ENABLED(), // ROCm incorrectly fails this check for vector types + "radix_sort_pairs value type must be trivially copyable"); + // Make value type opaque, so all inputs of a certain size use the same template instantiation + using opaque_t = detail::OpaqueType; + static_assert(sizeof(value_t) <= 8 && (sizeof(value_t) & (sizeof(value_t) - 1)) == 0, + "This size of value_t is not instantiated. 
Please instantiate it in cub.cu" + " and modify this check."); + static_assert(sizeof(value_t) == alignof(value_t), "Expected value_t to be size-aligned"); + detail::radix_sort_pairs_impl( + keys_in, keys_out, + reinterpret_cast(values_in), + reinterpret_cast(values_out), + n, descending, begin_bit, end_bit); +} + +template +void radix_sort_keys( + const key_t *keys_in, key_t *keys_out, + int64_t n, bool descending=false, int64_t begin_bit=0, int64_t end_bit=sizeof(key_t)*8); + +// NOTE: Intermediate sums will be truncated to input_t precision +template +void inclusive_sum_truncating(const input_t *input, output_t *output, int64_t n); + +template +void inclusive_sum(const scalar_t *input, scalar_t *output, int64_t n) { + return inclusive_sum_truncating(input, output, n); +} + +// NOTE: Sums are done is common_type +template +void exclusive_sum_in_common_type(const input_t *input, output_t *output, int64_t n); + +template +void exclusive_sum(const scalar_t *input, scalar_t *output, int64_t n) { + return exclusive_sum_in_common_type(input, output, n); +} + +void mask_exclusive_sum(const uint8_t *mask, int64_t *output_idx, int64_t n); +inline void mask_exclusive_sum(const bool *mask, int64_t *output_idx, int64_t n) { + return mask_exclusive_sum( + reinterpret_cast(mask), output_idx, n); +} + +} // namespace at::zoom::hipcub diff --git a/aten/src/ATen/zoom/cub_definitions.cuh b/aten/src/ATen/zoom/cub_definitions.cuh new file mode 100644 index 00000000000000..c199557279519d --- /dev/null +++ b/aten/src/ATen/zoom/cub_definitions.cuh @@ -0,0 +1,27 @@ +// !!! This is a file automatically generated by hipify!!! +#pragma once + +#define CUB_VERSION 0 + +#define CUB_SUPPORTS_NV_BFLOAT16() false + +// cub support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in: +// https://github.com/NVIDIA/cub/pull/326 +// CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake +// starting from CUDA 11.5 +#if defined(CUB_WRAPPED_NAMESPACE) || defined(THRUST_CUB_WRAPPED_NAMESPACE) +#define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() true +#else +#define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false +#endif + + +#define CUB_SUPPORTS_UNIQUE_BY_KEY() false + + +#define CUB_SUPPORTS_SCAN_BY_KEY() 0 + + + +#define CUB_SUPPORTS_FUTURE_VALUE() false + diff --git a/aten/src/ATen/zoom/detail/DeviceThreadHandles.h b/aten/src/ATen/zoom/detail/DeviceThreadHandles.h new file mode 100644 index 00000000000000..1b7ba32607499c --- /dev/null +++ b/aten/src/ATen/zoom/detail/DeviceThreadHandles.h @@ -0,0 +1,151 @@ +// Some stateful GPU libraries, such as cuDNN, cuBLAS, use handles to store states. +// These handles are tied to device, and these libraries requires/recommends not to +// share handles across host threads. +// +// These libraries recommend using one handle per host thread. We may not want to do +// this because threads are relatively light-weight, but creating and destroying +// handles is expensive (destroying the handle causes synchronizations). DataParallel, +// for example, creates new threads for each forward pass. +// +// This file implements a handle pool mechanism. The handle pool returns handles on +// demand as threads request them. If all existing handles in the pool are in use, +// it creates a new one. As threads terminate, they release handles back into the pool. +// In this way, the handle pool never creates more handles than the high-water mark of +// active threads, so it's efficient with DataParallel. 
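+//
+// A usage sketch (hypothetical handle type and create/destroy functions, for
+// illustration only):
+//   using BlasPool = DeviceThreadHandlePool<hipblasHandle_t,
+//                                           createBlasHandle,
+//                                           destroyBlasHandle>;
+//   static auto pool = std::make_shared<BlasPool>();
+//   // one PoolWindow per thread; its destructor returns handles to the pool
+//   thread_local std::unique_ptr<BlasPool::PoolWindow> window(pool->newPoolWindow());
+//   hipblasHandle_t handle = window->reserve(device_index);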
+ +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace at::zoom { namespace { + +template +struct DeviceThreadHandlePool : public std::enable_shared_from_this> { + + struct Handle { + Handle_t handle; + Handle(bool create = false) : handle(nullptr) + { + if(create) Create(&handle); + } + // std::vector.emplace() and push_back() may route through temporaries and call + // copy/move constructors along the way. If this is the case, we don't want + // the destructors of temporaries to call cudnnDestroy on the handle. + // We can achieve safety (for the narrow case of stashing within std::vectors) + // by making Handle moveable but not copyable, and transferring handle ownership + // to the latest constructed object. This is not a substitute for full-blown + // reference counting, but reference counting may be overkill here. + // Another alternative is to wrap the saved Handles in unique_ptrs, i.e., + // unordered_map>> created_handles; + Handle(const Handle& rhs) = delete; + // Following https://stackoverflow.com/questions/3279543/what-is-the-copy-and-swap-idiom + Handle(Handle&& rhs) : Handle() { std::swap(handle, rhs.handle); } + // operator= takes argument by value + Handle& operator=(Handle rhs) { std::swap(handle, rhs.handle); return *this; } + ~Handle() { + if(handle) Destroy(handle); + } + }; + + std::mutex mutex; + + // Handles are lazily created as different threads request them, + // but are never destroyed until the end of the process. + // The maximum number of handles this process will create for each device is equal + // to the high-water mark of the number of concurrently active threads that request + // handles for that device. + // When threads terminate, they release their handles back into the pool for reuse. + // Otherwise, new handles would be created every time new threads were spawned, + // resulting in poor performance for Python modules that repeatedly or frequently + // spawned new sets of threads (like DataParallel, which creates a new set of threads + // for each forward pass). + // + // To prevent potential deadlocks, we explicitly choose not to cap the number + // of handles that are created per device. + // Example of danger: If we cap the max handles at 4, and 5 threads are sharing a device, + // only 4 can make forward progress at any time. The other 4 will not release their + // handles until they exit, so the fifth cannot make progress until then. This is + // not a problem...UNLESS all 5 threads attempt some sort of synchronization at an + // intermediate point (ie, before any of them have exited). We have no way to anticipate + // or enforce that user threads will not attempt such intermediate synchronization. + // The only way to ensure safety is to avoid imposing a cap on the number of handles. + std::unordered_map> created_handles; + std::unordered_map> available_handles; + + // PoolWindow lazily creates and caches the handles that a particular thread is using, + // so in the common case handle access doesn't incur either handle creation or a mutex lock. + class PoolWindow + { + public: + PoolWindow(std::shared_ptr parent): weak_parent(std::move(parent)) {} + ~PoolWindow(){ release(); } + + Handle_t reserve(int device) + { + // If this thread already has a handle for this device, return it + if(my_handles.find(device) != my_handles.end()) + return my_handles[device]; + + // otherwise, either grab a handle from the pool if one is available, + // or if not, create a new one. 
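+ // Pin the pool through the weak_ptr before touching shared state; the
+ // TORCH_CHECK below guards against a window outliving its pool during
+ // program termination.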
+ auto parent = weak_parent.lock(); + TORCH_CHECK(parent, "Cannot create handle during program termination"); + std::lock_guard guard(parent->mutex); + + if(parent->available_handles[device].size() > 0) + { + my_handles[device] = parent->available_handles[device].back(); + parent->available_handles[device].pop_back(); + } + else + { + // In local testing, I do observe that emplace_back sometimes routes through temporaries + // that incur move-constructor and destructor calls. See comments in Handle above. + parent->created_handles[device].emplace_back(true /*create*/); + my_handles[device] = parent->created_handles[device].back().handle; + } + + return my_handles[device]; + } + + private: + // Stores the per-device handles currently owned by this thread + std::unordered_map my_handles; + + std::weak_ptr weak_parent; + + // Called by the destructor. Releases this thread's handles back into the pool. + void release() { + if(my_handles.size() > 0) { + auto parent = weak_parent.lock(); + if (!parent) { + // If this thread exits after atexit handlers have completed, the + // cuda context itself may be invalid, so we must leak the handles. + return; + } + + std::lock_guard guard(parent->mutex); + for(auto d_h : my_handles) + parent->available_handles[d_h.first].push_back(d_h.second); + } + } + }; + + // Warning: + // If you want to change this function, be aware that this function will be called + // by multiple threads and there is no mutex guarding the call of this function, so + // make sure your implementation is thread-safe. + PoolWindow *newPoolWindow() { + // The returned pointer will be owned by a thread local variable + // so that different threads does not share the same PoolWindow. + return new PoolWindow(this->shared_from_this()); + } +}; + +}} // namespace at::zoom::detail:: diff --git a/aten/src/ATen/zoom/detail/IndexUtils.cu b/aten/src/ATen/zoom/detail/IndexUtils.cu new file mode 100644 index 00000000000000..7e643871c6031d --- /dev/null +++ b/aten/src/ATen/zoom/detail/IndexUtils.cu @@ -0,0 +1,75 @@ +#include +#include + +namespace at { +namespace zoom { +namespace detail { + +struct SizeAndStride { + int64_t size; + int64_t stride; +}; + +/* + A comparator that will sort SizeAndStride structs by stride, + in ascending order. + */ + int compareSizeAndStride(const void* a, const void* b) { + const SizeAndStride* aS = (const SizeAndStride*) a; + const SizeAndStride* bS = (const SizeAndStride*) b; + + if (aS->stride < bS->stride) return -1; + if (aS->stride == bS->stride) return 0; + return 1; +} + +/* +Returns false if there is no possibility that the tensor +has "overlapping" indices and true otherwise. +"Overlapping" indices are two+ valid indices that specify +the same offset within the tensor. +The function does this by checking for a sufficient but not +necessary condition of no overlap. In particular, that +that there exists an ordering of the tensor's dimensions +that is nicely "nested," with each dimension contained +within the next one. +*/ +bool maybeOverlappingIndices(const TensorBase& t) { + /* Extract size/stride arrays; only consider size >1 dims. */ + std::vector info(t.dim()); + int dims = t.dim(); + int nonSize1Dims = 0; + for (int i = 0; i < dims; ++i) { + int64_t size = t.size(i); + if (size > 1) { + info[nonSize1Dims].size = size; + info[nonSize1Dims].stride = t.stride(i); + + if (info[nonSize1Dims].stride < 1) { + return true; + } + + ++nonSize1Dims; + } + } + + // Short-circuits if tensor is a single element. 
+ if (nonSize1Dims == 0) { + return false; + } + + /* Ascending order (innermost dimension in sorted view is at [0]) */ + qsort(info.data(), nonSize1Dims, sizeof(SizeAndStride), compareSizeAndStride); + + for (int i = 0; i < (nonSize1Dims - 1); ++i) { + if (((info[i].size - 1) * info[i].stride) >= info[i + 1].stride) { + return true; + } + } + + return false; +} + +} // detail +} // zoom +} // at diff --git a/aten/src/ATen/zoom/detail/IndexUtils.cuh b/aten/src/ATen/zoom/detail/IndexUtils.cuh new file mode 100644 index 00000000000000..a3739645b6b427 --- /dev/null +++ b/aten/src/ATen/zoom/detail/IndexUtils.cuh @@ -0,0 +1,36 @@ +#pragma once + +#include +#include +#include + +namespace at::zoom::detail { + +bool maybeOverlappingIndices(const at::TensorBase &t); +using at::native::canUse32BitIndexMath; + +template +TensorInfo +getTensorInfo(const at::TensorBase &t) { + IndexType sz[MAX_TENSORINFO_DIMS]; + IndexType st[MAX_TENSORINFO_DIMS]; + + int dims = t.dim(); + for (int i = 0; i < dims; ++i) { + sz[i] = t.size(i); + st[i] = t.stride(i); + } + + scalar* data_ptr = nullptr; + + if constexpr (std::is_const::value) { + data_ptr = t.const_data_ptr(); + } else { + data_ptr = t.mutable_data_ptr(); + } + + return TensorInfo( + data_ptr, dims, sz, st); +} + +} // namespace at::zoom::detail diff --git a/aten/src/ATen/zoom/detail/KernelUtils.h b/aten/src/ATen/zoom/detail/KernelUtils.h new file mode 100644 index 00000000000000..ad0e5cbe9cc2f4 --- /dev/null +++ b/aten/src/ATen/zoom/detail/KernelUtils.h @@ -0,0 +1,37 @@ +#pragma once + +#include +#include + +namespace at::zoom::detail { + +// CUDA: grid stride looping +// +// int64_t _i_n_d_e_x specifically prevents overflow in the loop increment. +// If input.numel() < INT_MAX, _i_n_d_e_x < INT_MAX, except after the final +// iteration of the loop where _i_n_d_e_x += blockDim.x * gridDim.x can be +// greater than INT_MAX. But in that case _i_n_d_e_x >= n, so there are no +// further iterations and the overflowed value in i=_i_n_d_e_x is not used. +#define HIP_KERNEL_LOOP_TYPE(i, n, index_type) \ + int64_t _i_n_d_e_x = blockIdx.x * blockDim.x + threadIdx.x; \ + for (index_type i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x) + +#define HIP_KERNEL_LOOP(i, n) HIP_KERNEL_LOOP_TYPE(i, n, int) + + +// Use 1024 threads per block, which requires cuda sm_2x or above +constexpr int HIP_NUM_THREADS = 1024; + +// CUDA: number of blocks for threads. +inline int GET_BLOCKS(const int64_t N, const int64_t max_threads_per_block=HIP_NUM_THREADS) { + TORCH_INTERNAL_ASSERT(N > 0, "CUDA kernel launch blocks must be positive, but got N=", N); + constexpr int64_t max_int = std::numeric_limits::max(); + + // Round up division for positive number that cannot cause integer overflow + auto block_num = (N - 1) / max_threads_per_block + 1; + TORCH_INTERNAL_ASSERT(block_num <= max_int, "Can't schedule too many blocks on HIP device"); + + return static_cast(block_num); +} + +} // namespace at::zoom::detail \ No newline at end of file diff --git a/aten/src/ATen/zoom/detail/PhiloxHIPStateRaw.hpp b/aten/src/ATen/zoom/detail/PhiloxHIPStateRaw.hpp new file mode 100644 index 00000000000000..252cc3c9013537 --- /dev/null +++ b/aten/src/ATen/zoom/detail/PhiloxHIPStateRaw.hpp @@ -0,0 +1,43 @@ +// No "#pragma once" because this is a raw definition that can be copied by jit codegen. +// Eager mode clients should not include this file directly, instead, +// they should #include , which has a #pragma once. + +// Stores RNG state values. 
Passed as a kernel argument. +// See Note [CUDA Graph-safe RNG states]. +// +// The raw definition lives in its own file so jit codegen can easily copy it. +namespace at { + +struct PhiloxHIPState { + PhiloxHIPState() = default; + // Called if graph capture is not underway + PhiloxHIPState(uint64_t seed, + uint64_t offset) { + seed_.val = seed; + offset_.val = offset; + } + // Called if graph capture is underway + PhiloxHIPState(int64_t* seed, + int64_t* offset_extragraph, + uint32_t offset_intragraph) { + seed_.ptr = seed; + offset_.ptr = offset_extragraph; + offset_intragraph_ = offset_intragraph; + captured_ = true; + } + + // Public members, directly accessible by at::zoom::philox::unpack. + // If we made them private with getters/setters, the getters/setters + // would have to be __device__, and we can't declare __device__ in ATen. + union Payload { + uint64_t val; + int64_t* ptr; + }; + + Payload seed_; + Payload offset_; + uint32_t offset_intragraph_ = 0; + bool captured_ = false; +}; + +} // namespace at \ No newline at end of file diff --git a/aten/src/ATen/zoom/detail/TensorInfo.cuh b/aten/src/ATen/zoom/detail/TensorInfo.cuh new file mode 100644 index 00000000000000..54debad5979827 --- /dev/null +++ b/aten/src/ATen/zoom/detail/TensorInfo.cuh @@ -0,0 +1,116 @@ +#pragma once + +#include + +namespace at::zoom::detail { + +#define MAX_TENSORINFO_DIMS 25 + +// CUDA kernel argument that defines tensor layout +template +struct TensorInfo { + TensorInfo(); + TensorInfo(T* p, + int dim, + IndexType sz[MAX_TENSORINFO_DIMS], + IndexType st[MAX_TENSORINFO_DIMS]); + + // Set the size of the given dimension to 1, as if it were a + // reduction dim (allows you to calculate offsets of the reduction + // slice) + void reduceDim(int dim); + + // See note on [collapse dims]. 
+ int collapseDims(const int excludeDim = -1); + + // Contiguous tensors of more than one dimension are collapsed down + // to one tensor + __host__ __device__ inline bool isContiguous() const { + return (dims == 1 && strides[0] == 1); + } + + T* data; + IndexType sizes[MAX_TENSORINFO_DIMS]; + IndexType strides[MAX_TENSORINFO_DIMS]; + int dims; +}; + +template +TensorInfo::TensorInfo() { + data = nullptr; + dims = 0; +} + +template +TensorInfo::TensorInfo(T* p, + int dim, + IndexType sz[MAX_TENSORINFO_DIMS], + IndexType st[MAX_TENSORINFO_DIMS]) { + data = p; + dims = dim; + TORCH_CHECK(dims < MAX_TENSORINFO_DIMS, "Zoom tensors cannot have more than 25 dimensions"); + + for (int i = 0; i < dim; ++i) { + sizes[i] = sz[i]; + strides[i] = st[i]; + } +} + +template +void +TensorInfo::reduceDim(int dim) { + TORCH_CHECK(dim < dims && dim >= 0, "expected dim between 0 and dims - 1"); + sizes[dim] = 1; +} + +template +int +TensorInfo::collapseDims(const int excludeDim) { + auto result = at::collapse_dims(sizes, strides, dims, excludeDim); + dims = std::get<1>(result); + return std::get<0>(result); +} + +// Translate a linear index for the apply to a T* offset; +// specialized on `Dims` to reduce nvcc compilation time +template +struct IndexToOffset { + static __host__ __device__ IndexType get( + IndexType linearId, + const TensorInfo& info) { + + IndexType offset = 0; + + // Uses static dims + for (int i = Dims - 1; i > 0; --i) { + IndexType curDimIndex = linearId % info.sizes[i]; + IndexType curDimOffset = curDimIndex * info.strides[i]; + offset += curDimOffset; + linearId /= info.sizes[i]; + } + + return offset + linearId * info.strides[0]; + } +}; + +// Uses dynamic (runtime) instead of static (compiletime) dims +template +struct IndexToOffset { + static inline __host__ __device__ IndexType get( + IndexType linearId, + const TensorInfo& info) { + + IndexType offset = 0; + + for (int i = info.dims - 1; i > 0; --i) { + IndexType curDimIndex = linearId % info.sizes[i]; + IndexType curDimOffset = curDimIndex * info.strides[i]; + offset += curDimOffset; + linearId /= info.sizes[i]; + } + + return offset + linearId * info.strides[0]; + } +}; + +} // namespace at::zoom::detail \ No newline at end of file diff --git a/aten/src/ATen/zoom/detail/UnpackRaw.hpp b/aten/src/ATen/zoom/detail/UnpackRaw.hpp new file mode 100644 index 00000000000000..5a5172e73f3cb2 --- /dev/null +++ b/aten/src/ATen/zoom/detail/UnpackRaw.hpp @@ -0,0 +1,28 @@ +// No "#pragma once" because this is a raw definition that can be copied by jit codegen. +// Eager mode clients should not include this file directly, instead, +// they should #include , which has a #pragma once. + +namespace at::zoom::philox { + +// In-kernel call to retrieve philox seed and offset from a PhiloxCudaState instance whether +// that instance was created with graph capture underway or not. +// See Note [CUDA Graph-safe RNG states]. +// +// We can't write a __device__ function in CUDAGeneratorImpl.h, because it's in ATen. +// Also, whatever call unpacks PhiloxCudaState in consumer kernels must be inlineable. +// Easiest thing that comes to mind is, define a __device__ unpack helper here, in ATen/cuda. +// +// The raw definition lives in its own file so jit codegen can easily copy it. +__host__ __device__ __forceinline__ std::tuple +unpack(at::PhiloxHIPState arg) { + if (arg.captured_) { + // static_cast avoids "warning: invalid narrowing conversion from "long" to "unsigned long". 
+ // *(arg.offset_.ptr) is a broadcast load of a single int64_t to the entire kernel. + // For most threads' reads it will hit in cache, so it shouldn't hurt performance. + return std::make_tuple(static_cast(*arg.seed_.ptr), static_cast(*(arg.offset_.ptr) + arg.offset_intragraph_)); + } else { + return std::make_tuple(arg.seed_.val, arg.offset_.val); + } +} + +} // namespace at::zoom::philox \ No newline at end of file diff --git a/aten/src/ATen/zoom/detail/ZoomHooks.cpp b/aten/src/ATen/zoom/detail/ZoomHooks.cpp new file mode 100644 index 00000000000000..828ef6993c45b7 --- /dev/null +++ b/aten/src/ATen/zoom/detail/ZoomHooks.cpp @@ -0,0 +1,273 @@ +#include +#include +#include +#include +#include +// #include +#include +#include +#include +#include +#include +#include +// #include +#include +#include +#include +#include + +// #if AT_CUDNN_ENABLED() +// #include +// #endif + +// #if AT_MAGMA_ENABLED() +// #include +// #endif + +// #if defined(USE_ROCM) +// #include +// #endif + +#include +#include +#include +#include +#include +#include + +namespace c10::zoom::_internal { +void setHasPrimaryContext(bool (*func)(DeviceIndex)); +} + +// defined in Aten/zoom/HIPblasHandlePool.cpp +namespace at::zoom { + bool getHIPBlasAtomicsEnabled(); +} + +namespace at::zoom::detail { + +const at::zoom::HIPRTC& hiprtc(); +DeviceIndex current_device(); + +// static void (*magma_init_fn)() = nullptr; + +// void set_magma_init_fn(void (*fn)()) { +// magma_init_fn = fn; +// } + +namespace { +bool _hasPrimaryContext(DeviceIndex device_index) { + TORCH_CHECK(device_index >= 0 && device_index < c10::zoom::device_count(), + "hasPrimaryContext expects a valid device index, but got device_index=", device_index); + unsigned int ctx_flags; + // In standalone tests of cuDevicePrimaryCtxGetState, I've seen the "active" argument end up with weird + // (garbage-looking nonzero) values when the context is not active, unless I initialize it to zero. + int ctx_is_active = 0; +// AT_CUDA_DRIVER_CHECK(nvrtc().cuDevicePrimaryCtxGetState(device_index, &ctx_flags, &ctx_is_active)); + hipDevicePrimaryCtxGetState(device_index, &ctx_flags, &ctx_is_active); + return ctx_is_active == 1; +} + +// Register hasPrimaryContext back to c10::zoom +struct _Initializer { + _Initializer() { + c10::zoom::_internal::setHasPrimaryContext(_hasPrimaryContext); + } + ~_Initializer() { + c10::zoom::_internal::setHasPrimaryContext(nullptr); + } +} initializer; +} // anonymous namespace + +// Sets the CUDA_MODULE_LOADING environment variable +// if it's not set by the user. +void maybe_set_zoom_module_loading(const std::string &def_value) { + auto value = std::getenv("ZOOM_MODULE_LOADING"); + if (!value) { +#ifdef _WIN32 + auto env_var = "ZOOM_MODULE_LOADING=" + def_value; + _putenv(env_var.c_str()); +#else + setenv("ZOOM_MODULE_LOADING", def_value.c_str(), 1); +#endif + } +} + +// NB: deleter is dynamic, because we need it to live in a separate +// compilation unit (alt is to have another method in hooks, but +// let's not if we don't need to!) +void ZoomHooks::initZoom() const { + C10_LOG_API_USAGE_ONCE("aten.init.zoom"); + // Force the update to enable unit testing. This code get executed before unit tests + // have a chance to enable vitals. 
+ at::vitals::VitalsAPI.setVital("ZOOM", "used", "true", /* force = */ true); + + maybe_set_zoom_module_loading("LAZY"); + const auto num_devices = c10::zoom::device_count_ensure_non_zero(); + c10::zoom::ZoomCachingAllocator::init(num_devices); + at::zoom::detail::init_p2p_access_cache(num_devices); +} + +void ZoomHooks::initPrivateUse1() const { + initZoom(); +} + +const Generator& ZoomHooks::getDefaultZoomGenerator(DeviceIndex device_index) const { + return at::zoom::detail::getDefaultZoomGenerator(device_index); +} + +Device ZoomHooks::getDeviceFromPtr(void* data) const { + return at::zoom::getDeviceFromPtr(data); +} + +bool ZoomHooks::isPinnedPtr(const void* data) const { + // First check if driver is broken/missing, in which case PyTorch CPU + // functionalities should still work, we should report `false` here. + if (!at::zoom::is_available()) { + return false; + } + // cudaPointerGetAttributes grabs context on the current device, so we set + // device to one that already has context, if exists. + at::OptionalDeviceGuard device_guard; + auto primary_ctx_device_index = c10::zoom::getDeviceIndexWithPrimaryContext(); + if (primary_ctx_device_index.has_value()) { + device_guard.reset_device(at::Device(at::DeviceType::PrivateUse1, *primary_ctx_device_index)); + } + hipPointerAttribute_t attr; + // We do not believe that CUDA needs mutable access to the data + // here. + hipError_t err = hipPointerGetAttributes(&attr, data); + // HIP throws hipErrorUnknown here + if (err != hipSuccess) { + (void)hipGetLastError(); // clear HIP error + return false; + } + return attr.type == hipMemoryTypeHost; +} + +bool ZoomHooks::hasROCM() const { + return at::zoom::is_available(); +} + +// rocBLAS is deterministic if atomic operations are disabled +// for details on when rocBLAS is guaranteed to be bitwise deterministic see below: +// https://github.com/ROCm/rocBLAS/issues/1459#issuecomment-2272082035 +bool ZoomHooks::checkHIPBlasDeterministic() const { + return !at::zoom::getHIPBlasAtomicsEnabled(); +} + +// #if defined(USE_DIRECT_NVRTC) || defined(USE_DIRECT_HIPRTC) + static std::pair, at::zoom::HIPRTC*> load_hiprtc() { + return std::make_pair(nullptr, at::zoom::load_hiprtc()); + } +// #else +// static std::pair, at::zoom::HIPRTC*> load_hiprtc() { +// #if defined(_WIN32) +// std::string libcaffe2_hiprtc = "caffe2_hiprtc.dll"; +// #elif defined(__APPLE__) +// std::string libcaffe2_hiprtc = "libcaffe2_hiprtc.dylib"; +// #else +// std::string libcaffe2_hiprtc = "libcaffe2_hiprtc.so"; +// #endif +// std::unique_ptr libhiprtc_stub( +// new at::DynamicLibrary(libcaffe2_hiprtc.c_str())); +// auto fn = (at::zoom::HIPRTC * (*)()) libhiprtc_stub->sym("load_hiprtc"); +// return std::make_pair(std::move(libhiprtc_stub), fn()); +// } +// #endif + +const at::zoom::HIPRTC& hiprtc() { + // must hold onto DynamicLibrary otherwise it will unload + static auto handle = load_hiprtc(); + return *handle.second; +} + +const at::zoom::HIPRTC& ZoomHooks::hiprtc() const { + return at::zoom::detail::hiprtc(); +} + +DeviceIndex current_device() { + c10::DeviceIndex device = 0; + hipError_t err = c10::zoom::GetDevice(&device); + if (err == hipSuccess) { + return device; + } + return -1; +} + +DeviceIndex ZoomHooks::current_device() const { + return at::zoom::detail::current_device(); +} + +bool ZoomHooks::hasPrimaryContext(DeviceIndex device_index) const { + return _hasPrimaryContext(device_index); +} + +Allocator* ZoomHooks::getPinnedMemoryAllocator() const { + return at::zoom::getPinnedMemoryAllocator(); +} + +Allocator* 
ZoomHooks::getZoomDeviceAllocator() const { + return at::zoom::getZoomDeviceAllocator(); +} + +std::string ZoomHooks::showConfig() const { + std::ostringstream oss; + + int runtimeVersion; + hipRuntimeGetVersion(&runtimeVersion); + + auto printHIPStyleVersion = [&](int v) { + + // HIP_VERSION value format was changed after ROCm v4.2 to include the patch number + if(v < 500) { + // If major=xx, minor=yy then format -> xxyy + oss << (v / 100) << "." << (v % 10); + } + else { + // If major=xx, minor=yy & patch=zzzzz then format -> xxyyzzzzz + oss << (v / 10000000) << "." << (v / 100000 % 100) << "." << (v % 100000); + } + + }; + + + oss << " - HIP Runtime "; + + printHIPStyleVersion(runtimeVersion); + oss << "\n"; + + return oss.str(); +} + +int ZoomHooks::getNumGPUs() const { + auto cnt = c10::zoom::device_count(); + std::cout << "numgpu: " << cnt << std::endl; + return cnt; +} + +void ZoomHooks::deviceSynchronize(DeviceIndex device_index) const { + at::DeviceGuard device_guard(at::Device(at::DeviceType::PrivateUse1, device_index)); + c10::zoom::device_synchronize(); +} + +// // Sigh, the registry doesn't support namespaces :( +// using at::zoomHooksRegistry; +// using at::RegistererCUDAHooksRegistry; + +// REGISTER_CUDA_HOOKS(ZoomHooks); + +using at::PrivateUse1HooksRegistry; +using at::RegistererPrivateUse1HooksRegistry; +REGISTER_PRIVATEUSE1_HOOKS(ZoomHooks); + +static ZoomHooks* zoom_hooks_impl = nullptr; +void register_zoom_hooks() { + if(zoom_hooks_impl == nullptr){ + zoom_hooks_impl = new ZoomHooks({}); + RegisterPrivateUse1HooksInterface(zoom_hooks_impl); + } +} + + +} // namespace at::zoom::detail \ No newline at end of file diff --git a/aten/src/ATen/zoom/detail/ZoomHooks.h b/aten/src/ATen/zoom/detail/ZoomHooks.h new file mode 100644 index 00000000000000..51cabb8bde377f --- /dev/null +++ b/aten/src/ATen/zoom/detail/ZoomHooks.h @@ -0,0 +1,36 @@ +#pragma once + +#include + +#include +#include + +// TODO: No need to have this whole header, we can just put it all in +// the cpp file + +namespace at::zoom::detail { + + +// The real implementation of ZoomHooksInterface +struct ZoomHooks : public ZoomHooksInterface { + ZoomHooks(ZoomHooksArgs) {} + void initZoom() const override; + void initPrivateUse1() const override; + Device getDeviceFromPtr(void* data) const override; + bool isPinnedPtr(const void* data) const override; + const Generator& getDefaultZoomGenerator(DeviceIndex device_index = -1) const override; + bool hasROCM() const override; + bool checkHIPBlasDeterministic() const override; + const at::zoom::HIPRTC& hiprtc() const override; + DeviceIndex current_device() const override; + bool hasPrimaryContext(DeviceIndex device_index) const override; + Allocator* getZoomDeviceAllocator() const override; + Allocator* getPinnedMemoryAllocator() const override; + std::string showConfig() const override; + int getNumGPUs() const override; + void deviceSynchronize(DeviceIndex device_index) const override; +}; + +void register_zoom_hooks(); + +} // at::zoom::detail \ No newline at end of file diff --git a/aten/src/ATen/zoom/hiprtc_stub/ATenHIPRTC.cpp b/aten/src/ATen/zoom/hiprtc_stub/ATenHIPRTC.cpp new file mode 100644 index 00000000000000..d8f0c36d000a7c --- /dev/null +++ b/aten/src/ATen/zoom/hiprtc_stub/ATenHIPRTC.cpp @@ -0,0 +1,13 @@ +#include +#include + +namespace at { namespace zoom { + +HIPRTC* load_hiprtc() { + auto self = new HIPRTC(); +#define CREATE_ASSIGN(name) self->name = name; + AT_FORALL_HIPRTC(CREATE_ASSIGN) + return self; +} + +}} // at::zoom diff --git 
a/aten/src/ATen/zoom/hiprtc_stub/ATenHIPRTC.h b/aten/src/ATen/zoom/hiprtc_stub/ATenHIPRTC.h new file mode 100644 index 00000000000000..bc3de47142f1e7 --- /dev/null +++ b/aten/src/ATen/zoom/hiprtc_stub/ATenHIPRTC.h @@ -0,0 +1,85 @@ +#pragma once + +#include +#include +#include + +namespace at { namespace zoom { + + +// NOTE [ USE OF NVRTC AND DRIVER API ] +// +// ATen does not directly link to either libnvrtc or libcuda because they +// require libcuda to be installed, yet we want our GPU build to work on CPU +// machines as long as CUDA is not initialized. +// +// Normal CUDA code in torch uses the cuda runtime libraries which can be +// installed even if the driver is not installed, but sometimes we specifically +// need to use the driver API (e.g., to load JIT compiled code). +// To accomplish this, we lazily link libcaffe2_nvrtc which provides a struct +// at::zoom::HIPRTC that contains function pointers to all of the apis we need. +// +// IT IS AN ERROR TO TRY TO CALL ANY nvrtc* or cu* FUNCTION DIRECTLY. +// INSTEAD USE, e.g. +// detail::getZoomHooks().nvrtc().cuLoadModule(...) +// or +// globalContext().getNVRTC().cuLoadModule(...) +// +// If a function is missing add it to the list in ATen/cuda/nvrtc_stub/ATenNVRTC.h +// and edit ATen/cuda/detail/LazyNVRTC.cpp accordingly (e.g., via one of the stub +// macros). + + +// NOTE [ ATen NVRTC Stub and HIP ] +// +// ATen's NVRTC stub library, caffe2_nvrtc, provides dynamic loading of both +// NVRTC and driver APIs. While the former is not yet supported for HIP, the +// later is supported and needed (e.g., in CUDAHooks::getDeviceWithPrimaryContext() +// used by tensor.pin_memory()). +// +// The macro below strips out certain unsupported operations on HIP from the full +// list above. +// +// HIP doesn't have +// cuGetErrorString (maps to non-functional hipGetErrorString___) +// +// HIP from ROCm 3.5 on renamed hipOccupancyMaxActiveBlocksPerMultiprocessor +// to hipModuleOccupancyMaxActiveBlocksPerMultiprocessor. 
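To make the stub's X-macro plumbing concrete, here is a minimal sketch of the same function-pointer-table pattern that load_hiprtc() in ATenHIPRTC.cpp (above) and the AT_FORALL_HIPRTC list (below) implement. The two-entry API list and the names FORALL_EXAMPLE_APIS, ExampleRTC and load_example_rtc are illustrative stand-ins, not part of this patch:

    #include <hip/hip_runtime.h>

    // A reduced stand-in for AT_FORALL_HIPRTC: each API reachable through the
    // stub is listed once and expanded twice.
    #define FORALL_EXAMPLE_APIS(_) \
      _(hipModuleLoadData)         \
      _(hipModuleUnload)

    // Expansion 1 (header side): a struct holding one function pointer per API.
    extern "C" typedef struct ExampleRTC {
    #define CREATE_MEMBER(name) decltype(&name) name;
      FORALL_EXAMPLE_APIS(CREATE_MEMBER)
    #undef CREATE_MEMBER
    } ExampleRTC;

    // Expansion 2 (.cpp side): populate each pointer with the real symbol, so
    // callers reach the driver-style APIs through the table instead of linking
    // them directly.
    extern "C" ExampleRTC* load_example_rtc() {
      auto* self = new ExampleRTC();
    #define CREATE_ASSIGN(name) self->name = name;
      FORALL_EXAMPLE_APIS(CREATE_ASSIGN)
    #undef CREATE_ASSIGN
      return self;
    }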
+// #if TORCH_HIP_VERSION < 305 +// #define HIPOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR hipOccupancyMaxActiveBlocksPerMultiprocessor +// #else +// #define HIPOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR hipModuleOccupancyMaxActiveBlocksPerMultiprocessor +// #endif + +#define HIPOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR hipModuleOccupancyMaxActiveBlocksPerMultiprocessor + +#define AT_FORALL_HIPRTC(_) \ + _(hiprtcVersion) \ + _(hiprtcCreateProgram) \ + _(hiprtcAddNameExpression) \ + _(hiprtcDestroyProgram) \ + _(hiprtcGetCodeSize) \ + _(hiprtcGetCode) \ + _(hipModuleLoadData) \ + _(hipModuleGetFunction) \ + _(HIPOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR) \ + _(hiprtcGetErrorString) \ + _(hiprtcGetProgramLogSize) \ + _(hiprtcGetProgramLog) \ + _(hipModuleLaunchKernel) \ + _(hiprtcCompileProgram) \ + _(hipCtxGetCurrent) \ + _(hiprtcGetLoweredName) \ + _(hipModuleUnload) \ + _(hipDevicePrimaryCtxGetState) + + + +extern "C" typedef struct HIPRTC { +#define CREATE_MEMBER(name) decltype(&name) name; + AT_FORALL_HIPRTC(CREATE_MEMBER) +#undef CREATE_MEMBER +} HIPRTC; + +extern "C" TORCH_ZOOM_API HIPRTC* load_hiprtc(); +}} // at::zoom diff --git a/aten/src/ATen/zoom/jit/HIPJitLoops.cuh b/aten/src/ATen/zoom/jit/HIPJitLoops.cuh new file mode 100644 index 00000000000000..01154c0b568173 --- /dev/null +++ b/aten/src/ATen/zoom/jit/HIPJitLoops.cuh @@ -0,0 +1,292 @@ +#pragma once +#include + + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "Loops.cuh" + +#include +#include +#include + +#include +#include +#include +#include + +namespace at { +namespace native { + +template +constexpr auto tuple_to_array_helper(Tuple& t, std::index_sequence seq) { + constexpr auto size = seq.size(); + (void)t; // warning : unused parameter when tuple is empty. + return std::array{static_cast(&std::get(t))...}; +} + +// Helper function convert tuple to std::array +// for passing the arguments to CUDA Kernel +// NOTE: We capture tuple by reference, +// so the pointers in returned array are only valid +// till tuple is alive. 
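+// For example, for a std::tuple<int, float> t, the helper returns a
+// std::array<void*, 2> whose entries point at the int and float members of t,
+// which is the form the HIP module-launch API expects for kernel arguments.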
+template +constexpr auto tuple_to_array(std::tuple& extra_args) { + constexpr auto tuple_size = sizeof...(Args); + return tuple_to_array_helper(extra_args, std::make_index_sequence{}); +} + +struct JittedVecKernelCache { + // Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements) + at::zoom::jit::hiprtcFunction vec1; + at::zoom::jit::hiprtcFunction vec2; + at::zoom::jit::hiprtcFunction vec4; +}; + +struct JittedKernelVariantCache { + JittedVecKernelCache vec; + at::zoom::jit::hiprtcFunction noncontiguous; + at::zoom::jit::hiprtcFunction dynamic_contiguous; + at::zoom::jit::hiprtcFunction dynamic_noncontiguous; +}; + +inline c10::SmallBuffer pack_kernel_args( + std::initializer_list args, + c10::ArrayRef extra_args) { + c10::SmallBuffer ret(args.size() + extra_args.size()); + std::copy(args.begin(), args.end(), ret.data()); + std::copy(extra_args.begin(), extra_args.end(), ret.data() + args.size()); + return ret; +} + +template +void launch_jitted_unrolled_kernel( + std::mutex &jiterator_mutex, + at::zoom::jit::hiprtcFunction &fn_cache, + const at::zoom::jit::KernelDescriptor &desc, + int64_t N, + array_t data, + inp_calc_t ic, + out_calc_t oc, + loader_t l, + storer_t s, + bool contiguous, + at::zoom::jit::BinaryFuncVariant scalar_pos, + void* scalar_val, + c10::ArrayRef extra_args) { + + TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); + //casting result to int is always safe, intermediate is int64 and won't overflow + const uint32_t grid = (N + block_work_size() - 1) / block_work_size(); + + if (!fn_cache.function) { + const std::lock_guard lock{jiterator_mutex}; + if (!fn_cache.function) { + constexpr bool dynamic_casting = !std::is_same() || + !std::is_same(); + auto code = at::zoom::jit::generate_code( + desc, contiguous, dynamic_casting, scalar_pos); + fn_cache = at::zoom::jit::jit_pwise_function(code, desc.name); + } + } + + auto args = pack_kernel_args({&N, &data, &ic, &oc, &l, &s, scalar_val}, extra_args); + at::zoom::jit::launch_jitted_pwise_function(fn_cache, args.data(), {grid, 1u, 1u}, + {num_threads(), 1u, 1u}); +} + +template +void launch_jitted_vectorized_kernel( + std::mutex &jiterator_mutex, JittedVecKernelCache &fn_cache, + const at::zoom::jit::KernelDescriptor &desc, int64_t N, array_t data, + at::zoom::jit::BinaryFuncVariant scalar_pos, + void *scalar_val, c10::ArrayRef extra_args) { + TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); + // N is still int64_t for the computation, but it's always safe to cast result to int + const uint32_t grid = (N + block_work_size() - 1) / block_work_size(); + const int vec_size = at::zoom::jit::can_vectorize_up_to( + desc, c10::ArrayRef(data.data, data.size())); + + // Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements) + // fn_ptr is set to the appropriate function based on the vec size and GPU used + at::zoom::jit::hiprtcFunction* fn_ptr; + if (vec_size == 4) { + fn_ptr = &fn_cache.vec4; + } else if (vec_size == 2) { + fn_ptr = &fn_cache.vec2; + } else if (vec_size ==1) { + fn_ptr = &fn_cache.vec1; + } else { + TORCH_INTERNAL_ASSERT(false, "unexpected vec_size for jitter vectorized kernel"); + } + + bool vectorized = vec_size > 1; + + if (!fn_ptr->function) { + const std::lock_guard lock{jiterator_mutex}; + if (!fn_ptr->function) { // cache miss! 
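+      // Double-checked locking: the unlocked check above keeps the hot path
+      // cheap, while this second check under jiterator_mutex ensures that only
+      // one of any threads that raced on a cold cache compiles the kernel.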
+ + // Generates program + auto code = at::zoom::jit::generate_code( + desc, /*contiguous=*/true, /*dynamic_casting=*/false, + scalar_pos, vectorized, vec_size); + std::string kernel_name = vectorized ? desc.name + "_vectorized" + std::to_string(vec_size) : desc.name; + + // Acquires the program + *fn_ptr = at::zoom::jit::jit_pwise_function(code, kernel_name); + } + } + + if (vectorized) { + auto args = pack_kernel_args({&N, &data, scalar_val}, extra_args); + at::zoom::jit::launch_jitted_pwise_function( + *fn_ptr, args.data(), {grid, 1u, 1u}, {num_threads(), 1u, 1u}); + } else { +// NVCC complains about unused variables l and s. +// It should be false positive in most cases, so we suppress the warnings. +#pragma nv_diagnostic push +#pragma nv_diag_suppress 177 + auto ic = TrivialOffsetCalculator(); + auto oc = TrivialOffsetCalculator<1>(); + auto l = memory::LoadWithoutCast(); + auto s = memory::StoreWithoutCast(); + + auto args = pack_kernel_args( + {&N, &data, &ic, &oc, &l, &s, scalar_val}, extra_args); + at::zoom::jit::launch_jitted_pwise_function( + *fn_ptr, args.data(), {grid, 1u, 1u}, {num_threads(), 1u, 1u}); +#pragma nv_diagnostic pop + } +} + +template +void jitted_gpu_kernel_generic( + std::mutex &jiterator_mutex, + JittedKernelVariantCache &cache, + const at::zoom::jit::KernelDescriptor &desc, + at::zoom::jit::BinaryFuncVariant scalar_pos, + c10::ArrayRef extra_args, + TensorIteratorBase& iter, + const bool dynamic_casting, + void *scalar_val) { + TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing()); + TORCH_INTERNAL_ASSERT(iter.ninputs() == arity); + TORCH_INTERNAL_ASSERT(iter.noutputs() == 1); + + constexpr int ntensors = arity + 1; + at::detail::Array data; + for (auto i : c10::irange(ntensors)) { + data[i] = (char*)iter.data_ptr(i); + } + + int64_t numel = iter.numel(); + bool contiguous = iter.is_contiguous(); + + // Decides which of 4 kernel types to launch + // Variations are: + // - Case 1: no dynamic casting and contiguous + // - Case 2: no dynamic casting and noncontiguous + // - Case 3: dynamic casting and contiguous + // - Case 4: dynamic casting and noncontiguous + // These cases align with the non-jitted CUDALoops.cuh cases in gpu_kernel_impl + + if (!dynamic_casting) { + if (contiguous) { + // Case 1: no dynamic casting and contiguous + launch_jitted_vectorized_kernel( + jiterator_mutex, cache.vec, desc, + numel, data, scalar_pos, scalar_val, extra_args); + return; + } + + // Case 2: no dynamic casting and noncontiguous + auto input_offset_calculator = make_input_offset_calculator(iter); + auto output_offset_calculator = make_output_offset_calculator(iter); + auto loader = memory::LoadWithoutCast(); + auto storer = memory::StoreWithoutCast(); + launch_jitted_unrolled_kernel( + jiterator_mutex, cache.noncontiguous, desc, numel, data, + input_offset_calculator, output_offset_calculator, loader, + storer, contiguous, scalar_pos, scalar_val, extra_args); + return; + } + + // Cases 3 and 4 are handled below + // Both require construction of a storer (this asserts 1 output) and one or more loaders + + // Creates store cast to output (the zeroth tensor in TensorIterator) + auto storer = memory::StoreWithCast<1>(iter); + + // Creates load casts from inputs (note offset indexing into the iterators 1...n tensors) + auto loader = memory::LoadWithCast(iter); + + if (contiguous) { + // Case 3: dynamic casting and contiguous + auto input_offset_calculator = TrivialOffsetCalculator(); + auto output_offset_calculator = TrivialOffsetCalculator<1>(); + 
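+    // Contiguous data only needs trivial offset calculators; the cast-aware
+    // loader and storer constructed above handle the dtype conversions.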
launch_jitted_unrolled_kernel( + jiterator_mutex, cache.dynamic_contiguous, desc, numel, data, input_offset_calculator, + output_offset_calculator, loader, storer, contiguous, scalar_pos, scalar_val, extra_args); + return; + } + + // Case 4: dynamic casting and noncontiguous + auto input_offset_calculator = make_input_offset_calculator(iter); + auto output_offset_calculator = make_output_offset_calculator(iter); + launch_jitted_unrolled_kernel( + jiterator_mutex, cache.dynamic_noncontiguous, desc, numel, data, input_offset_calculator, + output_offset_calculator, loader, storer, contiguous, scalar_pos, scalar_val, extra_args); +} + +// NOTE: static to reduce chances of name collision. +template < + char const* name, + typename result_type, + typename f_inputs_type, + int arity, + at::zoom::jit::BinaryFuncVariant scalar_pos = + at::zoom::jit::BinaryFuncVariant::NoScalar, + typename... ExtraArgs> +static void jitted_gpu_kernel_impl( + TensorIteratorBase& iter, + const std::string &f, + const bool dynamic_casting, + at::opmath_type scalar_val, + std::tuple extra_args) { + + // TODO: Memory use can probably be optimized by re-using kernels across GPUs with + // the same compute capability + static std::mutex jiterator_mutex; + static std::vector device_caches(c10::zoom::device_count()); + + constexpr int nInputs = arity; + constexpr int nOutputs = 1; // TODO: Support more than 1 output + static const auto desc = at::zoom::jit::make_kernel_descriptor< + result_type, f_inputs_type, ExtraArgs...>(name, f, nInputs, nOutputs); + + auto &cache = device_caches[iter.device().index()]; + auto extra_args_array = tuple_to_array(extra_args); + return jitted_gpu_kernel_generic( + jiterator_mutex, + cache, + desc, + scalar_pos, + extra_args_array, + iter, + dynamic_casting, + &scalar_val + ); +} + +}} // at::native diff --git a/aten/src/ATen/zoom/jit/HIPLoops.cuh b/aten/src/ATen/zoom/jit/HIPLoops.cuh new file mode 100644 index 00000000000000..85cdd5211e7006 --- /dev/null +++ b/aten/src/ATen/zoom/jit/HIPLoops.cuh @@ -0,0 +1,333 @@ +#pragma once + +// This file provides two functions to help write GPU elementwise kernels: +// +// gpu_kernel(TensorIterator iter, ) +// gpu_kernel_with_scalars(TensorIterator iter, ) +// +// The gpu_kernel_with_scalars generates specializations that support a +// single scalar CPU argument, such as from `cuda_tensor + 5`. The CPU scalar +// is lifted to a kernel parameter instead of copying to device memory. +// This should be used in conjunction with TensorIterator::allow_cpu_scalars_, +// which is the default for TensorIterator::binary_op. Otherwise, all inputs +// and the output must be on the GPU. 
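+// (In this Zoom port "on the GPU" concretely means the PrivateUse1 device: the
+// gpu_kernel entry points in Loops.cuh assert iter.device(arg).is_privateuseone()
+// for every operand.)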
+// +// For example, to write a reciprocal kernel for GPU float Tensors: +// +// gpu_kernel(iter, []GPU_LAMBDA(float a) { +// return 1.0f / a; +// }); +// +// To write a multiplication kernel for GPU float Tensors where one argument +// may be a CPU scalar: +// +// gpu_kernel_with_scalars(iter, []GPU_LAMBDA(float a, float b) { +// return a * b; +// }); +// +// See BinaryOpsKernel.cu for the complete implementation +// + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +// #ifdef __NVCC__ +// #define ASSERT_HOST_DEVICE_LAMBDA(type) \ +// static_assert( \ +// __nv_is_extended_host_device_lambda_closure_type(type), \ +// #type " must be a __host__ __device__ lambda") +// #else +#define ASSERT_HOST_DEVICE_LAMBDA(type) +// #endif + +namespace at { +namespace native { + +template +C10_LAUNCH_BOUNDS_1(num_threads()) +__global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) { + using traits = function_traits; + int remaining = N - block_work_size() * blockIdx.x; + + if (remaining < block_work_size()) { // if this block handles the reminder, + // just do a naive unrolled loop + auto input_calc = TrivialOffsetCalculator(); + auto output_calc = TrivialOffsetCalculator<1>(); + auto loader = memory::LoadWithoutCast(); + auto storer = memory::StoreWithoutCast(); + auto policy = memory::policies::unroll< + array_t, + decltype(input_calc), + decltype(output_calc), + memory::LoadWithoutCast, + memory::StoreWithoutCast>( + data, remaining, input_calc, output_calc, loader, storer); + elementwise_kernel_helper(f, policy); + } else { // if this block has a full `block_work_size` data to handle, use + // vectorized memory access + elementwise_kernel_helper( + f, memory::policies::vectorized(data)); + } +} + +template < + typename func_t, + typename array_t, + typename inp_calc_t, + typename out_calc_t, + typename loader_t, + typename storer_t> +C10_LAUNCH_BOUNDS_1(num_threads()) +__global__ void unrolled_elementwise_kernel( + int N, + func_t f, + array_t data, + inp_calc_t ic, + out_calc_t oc, + loader_t l, + storer_t s) { + int remaining = N - block_work_size() * blockIdx.x; + auto policy = memory::policies:: + unroll( + data, remaining, ic, oc, l, s); + elementwise_kernel_helper(f, policy); +} + +// this function assume trivial 1d and no dynamic casting +template +static inline void launch_vectorized_kernel( + int64_t N, + const func_t& f, + array_t data) { + TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); + using traits = function_traits; + int64_t grid = (N + block_work_size() - 1) / block_work_size(); + auto stream = c10::zoom::getCurrentZoomStream(); + int vec_size = memory::can_vectorize_up_to(data); + + switch (vec_size) { + case 4: + vectorized_elementwise_kernel<4, func_t, array_t> + <<>>(N, f, data); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + break; + case 2: + vectorized_elementwise_kernel<2, func_t, array_t> + <<>>(N, f, data); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + break; + case 1: { + auto input_calc = TrivialOffsetCalculator(); + auto output_calc = TrivialOffsetCalculator<1>(); + auto loader = memory::LoadWithoutCast(); + auto storer = memory::StoreWithoutCast(); + unrolled_elementwise_kernel + <<>>( + N, f, data, input_calc, output_calc, loader, storer); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + break; + } + default: + TORCH_INTERNAL_ASSERT(false, "Unexpected vectorization size"); + } +} + +template < + typename func_t, + typename array_t, + typename inp_calc_t, + typename out_calc_t, + typename loader_t, + 
typename storer_t> +static inline void launch_unrolled_kernel( + int64_t N, + const func_t& f, + array_t data, + inp_calc_t ic, + out_calc_t oc, + loader_t l, + storer_t s) { + TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); + int64_t grid = (N + block_work_size() - 1) / block_work_size(); + auto stream = c10::zoom::getCurrentZoomStream(); + unrolled_elementwise_kernel + <<>>(N, f, data, ic, oc, l, s); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); +} + +template +C10_LAUNCH_BOUNDS_2(nt, 4) +__global__ void elementwise_kernel(int N, func_t f) { + int tid = threadIdx.x; + int nv = nt * vt; + int idx = nv * blockIdx.x + tid; +#pragma unroll + for (int i = 0; i < vt; i++) { + if (idx < N) { + f(idx); + idx += nt; + } + } +} + +template +static void launch_legacy_kernel(int64_t N, const func_t& f) { + TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits::max()); + if (N == 0) { + return; + } + dim3 block(nt); + dim3 grid((N + block.x * vt - 1) / (block.x * vt)); + auto stream = c10::zoom::getCurrentZoomStream(); + elementwise_kernel<<>>(N, f); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); +} + +template +C10_HOST_DEVICE typename traits::result_type invoke_impl( + const func_t& f, + char* const C10_RESTRICT data[], + const index_t strides[], + int i, + std::index_sequence) { + (void)strides; + (void)i; + return f(c10::load::type>( + data[INDEX] + i * strides[INDEX])...); +} + +template < + typename func_t, + typename index_t, + typename traits = function_traits> +C10_HOST_DEVICE typename traits::result_type invoke( + const func_t& f, + char* const C10_RESTRICT data[], + const index_t strides[], + int i) { + using Indices = std::make_index_sequence; + return invoke_impl(f, data, strides, i, Indices{}); +} + +template +C10_HOST_DEVICE typename traits::result_type invoke_impl( + const func_t& f, + char* const C10_RESTRICT data[], + const index_t strides[], + const ScalarType dtypes[], + int i, + std::index_sequence) { + (void)strides; + (void)i; + return f(c10::fetch_and_cast::type>( + dtypes[I], data[I] + i * strides[I])...); +} + +template < + typename func_t, + typename index_t, + typename traits = function_traits> +C10_HOST_DEVICE typename traits::result_type invoke( + const func_t& f, + char* const C10_RESTRICT data[], + const index_t strides[], + const ScalarType dtypes[], + int i) { + using Indices = std::make_index_sequence; + return invoke_impl(f, data, strides, dtypes, i, Indices{}); +} + +template +void gpu_kernel_impl_nocast(TensorIteratorBase& iter, const func_t& f) { + using traits = function_traits; + using arg0_t = typename traits::result_type; + constexpr int ntensors = traits::arity + 1; + + TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing()); + TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity); + TORCH_INTERNAL_ASSERT(iter.noutputs() == 1); + TORCH_INTERNAL_ASSERT(!needs_dynamic_casting::check(iter)); + + at::detail::Array data; + for (int i = 0; i < ntensors; i++) { + data[i] = (char*)iter.data_ptr(i); + } + + int64_t numel = iter.numel(); + + bool contiguous = iter.is_contiguous(); + + if (contiguous) { + return launch_vectorized_kernel(numel, f, data); + } + auto offset_calc = ::make_offset_calculator(iter); + constexpr int unroll_factor = sizeof(arg0_t) >= 4 ? 
2 : 4; + launch_legacy_kernel<128, unroll_factor>(numel, [=] GPU_LAMBDA(int idx) { + auto offsets = offset_calc.get(idx); + arg0_t* out = (arg0_t*)(data[0] + offsets[0]); + *out = invoke(f, &data.data[1], &offsets.data[1], 1); + }); +} + +template +void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { + if (!needs_dynamic_casting::check(iter)) { + return gpu_kernel_impl_nocast(iter, f); + } + using traits = function_traits; + using arg0_t = typename traits::result_type; + constexpr int ntensors = traits::arity + 1; + + TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing()); + TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity); + TORCH_INTERNAL_ASSERT(iter.noutputs() == 1); + + at::detail::Array data; + for (int i = 0; i < ntensors; i++) { + data[i] = (char*)iter.data_ptr(i); + } + + int64_t numel = iter.numel(); + + bool contiguous = iter.is_contiguous(); + + if (contiguous) { + at::detail::Array dtypes; + auto inner_strides = iter.get_inner_strides(); + at::detail::Array strides; + for (int i = 0; i < ntensors; i++) { + dtypes[i] = iter.dtype(i); + strides[i] = inner_strides[i]; + } + launch_legacy_kernel<512, 1>(numel, [=]GPU_LAMBDA(int idx) { + void* out = data[0] + strides[0] * idx; + arg0_t result = invoke(f, &data.data[1], &strides.data[1], &dtypes.data[1], idx); + c10::cast_and_store(dtypes[0], out, result); + }); + } else { + at::detail::Array dtypes; + for (int i = 0; i < ntensors; i++) { + dtypes[i] = iter.dtype(i); + } + auto offset_calc = ::make_offset_calculator(iter); + launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) { + auto offsets = offset_calc.get(idx); + void* out = data[0] + offsets[0]; + arg0_t result = invoke(f, &data.data[1], &offsets.data[1], &dtypes.data[1], 1); + c10::cast_and_store(dtypes[0], out, result); + }); + } +} + +} // namespace native +} // namespace at \ No newline at end of file diff --git a/aten/src/ATen/zoom/jit/IntegerDivider.cuh b/aten/src/ATen/zoom/jit/IntegerDivider.cuh new file mode 100644 index 00000000000000..2e0d34df31e02e --- /dev/null +++ b/aten/src/ATen/zoom/jit/IntegerDivider.cuh @@ -0,0 +1,126 @@ +#pragma once + +#include +#include + +// insurance for now, torch only defines this macro if you're compiling with cuda or +// following traditional ROCm build +#define C10_HOST_DEVICE __host__ __device__ + +namespace at::zoom::detail { + +// A utility class to implement integer division by multiplication, given a fixed +// divisor. +// +// WARNING: The fast divider algorithm is only implemented for unsigned int; +// otherwise we default to plain integer division. For unsigned int, +// we further assume that the dividend is at most INT32_MAX. Thus, +// IntDivider must NOT be used for general integer division. +// +// This reduced range is enough for our purpose, and it allows us to +// slightly simplify the computation. +// +// (NOTE: Below, "2^k" denotes exponentiation, i.e., 1< 0), we can find a "magic number" m (2^N +// <= m < 2^(N+1)) and shift s such that: +// +// \floor(n / d) = \floor((m * n) / 2^(N+s)). +// +// Given such m and s, the integer division can be then implemented as: +// +// let m' = m - 2^N // 0 <= m' < 2^N +// +// fast_integer_division(n): +// // Multiply two N-bit unsigned integers: the result is a 2N-bit unsigned +// // integer. Then take the higher N bits. +// t = (m' * n) >> N +// +// // Here we use the fact that n is less than 2^(N-1): otherwise the value +// // of (t + n) may not fit in an N-bit integer. 
+// return (t + n) >> s +// +// Finding such a magic number is surprisingly easy: +// +// s = \ceil(\log_2 d) +// m' = \floor(2^N * (2^s - d) / d) + 1 // Need 2N-bit integer arithmetic. +// +// See also: +// - Division by Invariant Integers Using Multiplication, +// Torbjörn Granlund and Peter L. Montgomery, 1994. +// +// - http://www.hackersdelight.org/magic.htm +// +// - http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html + +// Result of div/mod operation stored together. +template +struct DivMod { + Value div, mod; + + C10_HOST_DEVICE DivMod(Value div, Value mod) : div(div), mod(mod) { } +}; + +// Base case: we only have an implementation for uint32_t for now. For +// everything else, we use plain division. +template +struct IntDivider { + IntDivider() = default; + IntDivider(Value d) : divisor(d) { } + + C10_HOST_DEVICE inline Value div(Value n) const { return n / divisor; } + C10_HOST_DEVICE inline Value mod(Value n) const { return n % divisor; } + C10_HOST_DEVICE inline DivMod divmod(Value n) const { + return DivMod(n / divisor, n % divisor); + } + + Value divisor; +}; + +// Implement fast integer division. +template <> +struct IntDivider { + static_assert(sizeof(unsigned int) == 4, "Assumes 32-bit unsigned int."); + + IntDivider() = default; + + IntDivider(unsigned int d) : divisor(d) { + assert(divisor >= 1 && divisor <= INT32_MAX); + + // TODO: gcc/clang has __builtin_clz() but it's not portable. + for (shift = 0; shift < 32; shift++) if ((1U << shift) >= divisor) break; + + uint64_t one = 1; + uint64_t magic = ((one << 32) * ((one << shift) - divisor)) / divisor + 1; + m1 = magic; + assert(m1 > 0 && m1 == magic); // m1 must fit in 32 bits. + } + + C10_HOST_DEVICE inline unsigned int div(unsigned int n) const { +#if defined(__HIP_DEVICE_COMPILE__) + // 't' is the higher 32-bits of unsigned 32-bit multiplication of 'n' and + // 'm1'. + unsigned int t = __umulhi(n, m1); + return (t + n) >> shift; +#else + // Using uint64_t so that the addition does not overflow. + uint64_t t = ((uint64_t) n * m1) >> 32; + return (t + n) >> shift; +#endif + } + + C10_HOST_DEVICE inline unsigned int mod(unsigned int n) const { + return n - div(n) * divisor; + } + + C10_HOST_DEVICE inline DivMod divmod(unsigned int n) const { + unsigned int q = div(n); + return DivMod(q, n - q * divisor); + } + + unsigned int divisor; // d above. + unsigned int m1; // Magic number: m' above. + unsigned int shift; // Shift amounts. +}; + +} // namespace at::zoom::detail \ No newline at end of file diff --git a/aten/src/ATen/zoom/jit/JitLoops.cuh b/aten/src/ATen/zoom/jit/JitLoops.cuh new file mode 100644 index 00000000000000..8cd5ac713856cb --- /dev/null +++ b/aten/src/ATen/zoom/jit/JitLoops.cuh @@ -0,0 +1,182 @@ +#pragma once + +#include + + +#include +#include +#include + +#include + +#include + +namespace at { +namespace native { + +/* Note [Jiterator] +The "jiterator" simply just-in-time compiles the same kernels that +Loops.cuh (and CUDALoops.cuh) usually build. This reduces build time, +build size, and initial CUDA context size. + +By default on non-Windows systems, it also caches compiled kernels in ~/.cache/torch/kernels. +This behavior is controlled with two environment variables: + - USE_PYTORCH_KERNEL_CACHE, if set to zero then this will disable all cache use + - PYTORCH_KERNEL_CACHE_PATH, if set specifies the folder to use for cached kernels + +The jiterator currently has some limitations, however. 
It cannot: + - handle math on complex datatypes + - handle kernels with scalar parameters + +These improvements will likely come soon. + +For examples of how to use the jiterator see the i1 and gcd kernel +implementations, which pass jittable strings implementing their +operations instead of the typical CUDA functors. + +To pass a runtime argument (similar to lambda captures in non-JIT kernels), +we need to pass to additional arguments to `jitted_gpu_kernel` by value. +Currently only primitive C++ types used for computation are valid. +The order of these extra arguments should be same as the order they appear +in kernel's function signature. (look at polygamma for example) + +NOTE: One big restriction being that these arguments should be after the +arguments provided by TensorIterator. Eg. While capturing `n`, where +`scalar_t x` and `scalar_t y` are provided by TensorIterator, +* foo(scalar_t x, scalar_t y, int n) works! +* foo(int n, scalar_t x, scalar_y) doesn't work +* foo(scalar_t x, int n, scalar_y) doesn't work + +*/ + +// Entrypoint for jitted GPU kernels. +// Only handles elementwise unary and binary kernels with a +// common dtype and a single output. +// NOTE: this assumes the op's iterator has a common_dtype. +// NOTE: We use std::tuple instead of parameter pack +// for `extra_args` due to following +// bug on older versions of clang +// https://bugs.llvm.org/show_bug.cgi?id=23029 +template < + char const* name, + typename return_type, + typename f_inputs_type, + int arity, + typename... Args> +void jitted_gpu_kernel( + TensorIteratorBase& iter, + const std::string& f, + at::zoom::jit::BinaryFuncVariant scalar_pos = + at::zoom::jit::BinaryFuncVariant::NoScalar, + at::opmath_type scalar_val = 0, + std::tuple extra_args = std::make_tuple()) { + // TODO: much of preamble is common to both jitted_gpu_kernel and gpu_kernel + // Maybe it could be refactored? + for (int arg = 0; arg < iter.ntensors(); arg++) { + TORCH_INTERNAL_ASSERT( + iter.device(arg).is_privateuseone(), + "argument ", arg, ": expected a Zoom device but found ", iter.device(arg)); + } + + if (iter.numel() == 0) { + return; + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + jitted_gpu_kernel( + sub_iter, f, scalar_pos, scalar_val, extra_args); + } + + return; + } + + // Computes if dynamic casting is needed + // Dynamic casting is needed if an input's dtype differs from the common dtype + // or if the result dtype differs from the output's dtype + // Note: this is intentionally divergent from calling needs_dynamic_casting, + // which is more general and inspects a lambda to determine if dynamic + // casting is needed. + bool needs_dynamic_casting = false; + + // Checks output + const ScalarType return_scalar_type = c10::CppTypeToScalarType::value; + const auto dtype0 = iter.dtype(0); + if (dtype0 != return_scalar_type) { + needs_dynamic_casting = true; + } + + // Checks input(s) + const ScalarType inputs_scalar_type = c10::CppTypeToScalarType::value; + for (auto i = decltype(arity){1}; i < (arity + 1); ++i) { + const auto dtypei = iter.dtype(i); + if (dtypei != inputs_scalar_type) { + needs_dynamic_casting = true; + break; + } + } + if (scalar_pos == at::zoom::jit::BinaryFuncVariant::NoScalar) { + // NOTE: With `scalar_pos=NoScalar`,`scalar_val` is not used + // for computation in the generated code and hence we pass a dummy + // value of `0`. 
+ jitted_gpu_kernel_impl< + /*name*/ name, + /*return_type=*/return_type, + /*f_inputs_type=*/f_inputs_type, + arity, + at::zoom::jit::BinaryFuncVariant::NoScalar>( + iter, f, needs_dynamic_casting, /*scalar_val=*/scalar_val, extra_args); + } else if (scalar_pos == at::zoom::jit::BinaryFuncVariant::RhsScalar) { + jitted_gpu_kernel_impl< + /*name*/ name, + /*return_type=*/return_type, + /*f_inputs_type=*/f_inputs_type, + arity, + at::zoom::jit::BinaryFuncVariant::RhsScalar>( + iter, + f, + needs_dynamic_casting, + scalar_val, + extra_args); + + } else { + jitted_gpu_kernel_impl< + /*name*/ name, + /*return_type=*/return_type, + /*f_inputs_type=*/f_inputs_type, + arity, + at::zoom::jit::BinaryFuncVariant::LhsScalar>( + iter, + f, + needs_dynamic_casting, + scalar_val, + extra_args); + } +} + +// TODO: support runtime state capture similar to `jitted_gpu_kernel`. +template +void opmath_jitted_gpu_kernel_with_scalars(TensorIteratorBase& iter, const std::string& f) { + TORCH_INTERNAL_ASSERT(iter.ntensors() == 3); + //currently jiterator only handles binary functions where both inputs are of the same type (f_inputs_type) + using opmath_t = at::opmath_type; + if (iter.is_cpu_scalar(1)) { + auto scalar_val = iter.scalar_value(1); + iter.remove_operand(1); + // TODO: When all kernels that use gpu_kernel_with_scalars are + // ported to structured, this device guard can be deleted. This + // works around incorrect device guard generation for pre-structured + // kernels device guards, but structured kernels do it right and + // we can assume the device is already set correctly + const OptionalDeviceGuard device_guard(iter.device(1)); + jitted_gpu_kernel(iter, f, at::zoom::jit::BinaryFuncVariant::LhsScalar, scalar_val); + } else if (iter.is_cpu_scalar(2)) { + auto scalar_val = iter.scalar_value(2); + iter.remove_operand(2); + jitted_gpu_kernel(iter, f, at::zoom::jit::BinaryFuncVariant::RhsScalar, scalar_val); + } else { + jitted_gpu_kernel(iter, f); + } +} + +}} // at::native diff --git a/aten/src/ATen/zoom/jit/Loops.cuh b/aten/src/ATen/zoom/jit/Loops.cuh new file mode 100644 index 00000000000000..cc6d2845506939 --- /dev/null +++ b/aten/src/ATen/zoom/jit/Loops.cuh @@ -0,0 +1,325 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + + +namespace at { namespace native { + +template +static OffsetCalculator make_input_offset_calculator(const TensorIteratorBase& iter) { + // array size can not be 0, this happens when N == 0 + constexpr int array_size = std::max(N, 1); + TORCH_INTERNAL_ASSERT(N == iter.ntensors() - iter.noutputs()); + std::array strides; + int64_t element_sizes[array_size]; + for (int i = 0; i < N; i++) { + strides[i] = iter.strides(i + iter.noutputs()).data(); + element_sizes[i] = iter.element_size(i + iter.noutputs()); + } + return OffsetCalculator(iter.ndim(), iter.shape().data(), strides.data(), element_sizes); +} + +template +static OffsetCalculator make_output_offset_calculator(const TensorIteratorBase& iter) { + TORCH_INTERNAL_ASSERT(num_outputs == iter.noutputs()); + std::array strides; + int64_t element_sizes[num_outputs]; + for (int i = 0; i < num_outputs; i++) { + strides[i] = iter.strides(i).data(); + element_sizes[i] = iter.element_size(i); + } + return OffsetCalculator(iter.ndim(), iter.shape().data(), strides.data(), element_sizes); +} + +template +__device__ inline void elementwise_kernel_helper(func_t f, policy_t policy) { + using traits = function_traits; + using return_t = typename traits::result_type; + using args_t = 
typename traits::ArgsTuple; + + int idx = blockIdx.x; + + return_t results[thread_work_size()]; + args_t args[thread_work_size()]; + + // load + policy.load(args, idx); + + // compute + #pragma unroll + for (int i = 0; i < thread_work_size(); i++) { + if (policy.check_inbounds(i)) { + results[i] = c10::guts::apply(f, args[i]); + } + } + + // store + policy.store(results, idx); +} + +}} // namespace at::native + +#include "HIPLoops.cuh" + +namespace at:: native { + +template +void gpu_kernel_nocast(TensorIteratorBase& iter, const func_t& f) { + + for (int arg = 0; arg < iter.ntensors(); arg++) { + TORCH_INTERNAL_ASSERT( + iter.device(arg).is_privateuseone(), + "argument ", arg, ": expected a Zoom device but found ", iter.device(arg)); + } + + if (iter.numel() == 0) { + return; + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + gpu_kernel_nocast(sub_iter, f); + } + return; + } + + gpu_kernel_impl_nocast(iter, f); +} + +template +void gpu_kernel(TensorIteratorBase& iter, const func_t& f) { + + for (int arg = 0; arg < iter.ntensors(); arg++) { + TORCH_INTERNAL_ASSERT( + iter.device(arg).is_privateuseone(), + "argument ", arg, ": expected a Zoom device but found ", iter.device(arg)); + } + + if (iter.numel() == 0) { + return; + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + gpu_kernel(sub_iter, f); + } + return; + } + + gpu_kernel_impl(iter, f); +} + +template +struct AUnaryFunctor { + using traits = function_traits; + using opmath_arg1_t = typename traits::template arg<0>::type; + __device__ return_t operator()(arg2_t b) const { + return f(a, b); + } + // NB: scalar is stored in higher precision! + AUnaryFunctor(func_t f_, opmath_arg1_t a_): f(f_), a(a_) {} + private: + func_t f; + opmath_arg1_t a; +}; + +template +struct BUnaryFunctor { + using traits = function_traits; + using opmath_arg2_t = typename traits::template arg<1>::type; + __device__ return_t operator()(arg1_t a) const { + return f(a, b); + } + // NB: scalar is stored in higher precision! + BUnaryFunctor(func_t f_, opmath_arg2_t b_): f(f_), b(b_) {} + private: + func_t f; + opmath_arg2_t b; +}; + +// Though seemingly noop, this inserts casts from arg1_t to func_t's type +// (which may be higher precision), as well as casts to return_t +template +struct BinaryFunctor { + __device__ return_t operator()(arg1_t a, arg2_t b) const { + return f(a, b); + } + BinaryFunctor(func_t f_): f(f_) {} + private: + func_t f; +}; + +// Unlike gpu_kernel_with_scalars, this allows you to pass a func_t which +// accepts inputs at higher precision (typically opmath_t), but then +// ensure that we load from memory at the correct precision (scalar_t) +// to avoid expensive loads. For the whole sordid story see +// https://dev-discuss.pytorch.org/t/cuda-loops-case-study-code-generation-vs-templates/302 +template +void opmath_gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) { + TORCH_INTERNAL_ASSERT(iter.ntensors() == 3); + + using traits = function_traits; + using opmath_arg1_t = typename traits::template arg<0>::type; + using opmath_arg2_t = typename traits::template arg<1>::type; + static_assert( + traits::arity == 2, + "gpu_kernel_with_scalars only supports two input arguments"); + + if (iter.is_cpu_scalar(1)) { + AUnaryFunctor af(f, iter.scalar_value(1)); + iter.remove_operand(1); + // TODO: When all kernels that use gpu_kernel_with_scalars are + // ported to structured, this device guard can be deleted. 
This + // works around incorrect device guard generation for pre-structured + // kernels device guards, but structured kernels do it right and + // we can assume the device is already set correctly + const OptionalDeviceGuard device_guard(iter.device(1)); + gpu_kernel(iter, af); + } else if (iter.is_cpu_scalar(2)) { + BUnaryFunctor bf(f, iter.scalar_value(2)); + iter.remove_operand(2); + gpu_kernel(iter, bf); + } else { + gpu_kernel(iter, BinaryFunctor(f)); + } +} + +template +void opmath_symmetric_gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) { + // Use symmetric property of the functor to reduce number of kernels, + // requires f(a, b) == f(b, a) + TORCH_INTERNAL_ASSERT(iter.ntensors() == 3); + + using traits = function_traits; + using opmath_arg_t = typename traits::template arg<0>::type; + static_assert( + traits::arity == 2, + "gpu_kernel_with_scalars only supports two input arguments"); + static_assert(std::is_same::type>::value, + "f is not symmetric"); + + OptionalDeviceGuard device_guard; + opmath_arg_t scalar_val{}; + + if (iter.is_cpu_scalar(1)) { + scalar_val = iter.scalar_value(1); + iter.remove_operand(1); + + // TODO: When all kernels that use gpu_kernel_with_scalars are + // ported to structured, this device guard can be deleted. This + // works around incorrect device guard generation for pre-structured + // kernels device guards, but structured kernels do it right and + // we can assume the device is already set correctly + device_guard.reset_device(iter.device(1)); + } else if (iter.is_cpu_scalar(2)) { + scalar_val = iter.scalar_value(2); + iter.remove_operand(2); + } + + if (iter.ninputs() == 2) { + gpu_kernel(iter, BinaryFunctor(f)); + } else { + AUnaryFunctor unary_f(f, scalar_val); + gpu_kernel(iter, unary_f); + } +} + +// Legacy variant that assumes that func_t has the correct types +// that we expect to load from memory +template +void gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) { + using traits = function_traits; + static_assert( + traits::arity == 2, + "gpu_kernel_with_scalars only supports two input arguments"); + using arg1_t = typename traits::template arg<0>::type; + using arg2_t = typename traits::template arg<1>::type; + using return_t = typename traits::result_type; + opmath_gpu_kernel_with_scalars(iter, f); +} + +namespace { // functions for `gpu_kernel_multiple_outputs`. + +// check the return type is `thrust::tuple`, not `std::tuple`. 
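For orientation, a caller of the scalar-aware entry points defined above (before the anonymous namespace for multi-output support) would look much like the existing CUDA elementwise kernels. The following is a sketch only; the function name mul_kernel_zoom, the dispatch macro choice, and the include paths are assumptions rather than part of this patch:

    #include <ATen/Dispatch.h>
    #include <ATen/TensorIterator.h>
    #include <ATen/zoom/jit/Loops.cuh>  // path as added by this patch

    namespace at::native {

    // Hypothetical elementwise multiply built on gpu_kernel_with_scalars.
    void mul_kernel_zoom(TensorIteratorBase& iter) {
      AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "mul_zoom", [&]() {
        // One of the two inputs may be a CPU scalar; the helper lifts it into a
        // kernel parameter instead of copying it to device memory.
        // GPU_LAMBDA is assumed to expand to __host__ __device__ as in the CUDA
        // Loops.cuh headers.
        gpu_kernel_with_scalars(
            iter, [] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { return a * b; });
      });
    }

    } // namespace at::native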
+template struct is_tuple: std::false_type {}; + +template struct is_tuple>: std::true_type {}; + +template +C10_LAUNCH_BOUNDS_1(num_threads()) +__global__ void unrolled_elementwise_kernel_for_multi_outputs(int N, func_t f, array_t data, inp_calc_t ic, out_calc_t oc) { + int remaining = N - block_work_size() * blockIdx.x; + elementwise_kernel_helper(f, memory::policies::multi_outputs_unroll(data, remaining, ic, oc)); +} + +template +static inline void launch_unrolled_kernel_for_multi_outputs(int64_t N, const func_t& f, array_t data, inp_calc_t ic, out_calc_t oc) { + TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); + int64_t grid = (N + block_work_size() - 1) / block_work_size(); + auto stream = c10::zoom::getCurrentZoomStream(); + unrolled_elementwise_kernel_for_multi_outputs<<>>(N, f, data, ic, oc); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); +} + +template +void gpu_kernel_multiple_outputs_impl(TensorIteratorBase& iter, const func_t& f) { + using traits = function_traits; + using output_t = typename traits::result_type; + static_assert(is_tuple::value, "f's return type must be `thrust::tuple`"); + constexpr int num_outputs = thrust::tuple_size::value; + constexpr int num_inputs = traits::arity; + constexpr int ntensors = num_outputs + num_inputs; + + TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing()); + TORCH_INTERNAL_ASSERT(iter.ntensors() == ntensors); + + at::detail::Array data; + for (int i = 0; i < ntensors; i++) { + data[i] = (char*)iter.data_ptr(i); + } + + int64_t numel = iter.numel(); + + if (iter.is_contiguous()) { + auto input_calc = TrivialOffsetCalculator(); + auto output_calc = TrivialOffsetCalculator(); + launch_unrolled_kernel_for_multi_outputs(numel, f, data, input_calc, output_calc); + } else { + auto input_calc = make_input_offset_calculator(iter); + auto output_calc = make_output_offset_calculator(iter); + launch_unrolled_kernel_for_multi_outputs(numel, f, data, input_calc, output_calc); + } +} +} // namespace + +template +void gpu_kernel_multiple_outputs(TensorIteratorBase& iter, const func_t& f) { + ASSERT_HOST_DEVICE_LAMBDA(func_t); + + for (int arg = 0; arg < iter.ntensors(); arg++) { + TORCH_INTERNAL_ASSERT(iter.device(arg).is_privateuseone()); + } + + if (iter.numel() == 0) { + return; + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + gpu_kernel_multiple_outputs(sub_iter, f); + } + return; + } + + gpu_kernel_multiple_outputs_impl(iter, f); +} + +} //namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/zoom/jit/MemoryAccess.cuh b/aten/src/ATen/zoom/jit/MemoryAccess.cuh new file mode 100644 index 00000000000000..4b182724166cbb --- /dev/null +++ b/aten/src/ATen/zoom/jit/MemoryAccess.cuh @@ -0,0 +1,395 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +// make sure this is defined +#define C10_HOST_DEVICE __host__ __device__ + +// References: +// https://devblogs.nvidia.com/cuda-pro-tip-increase-performance-with-vectorized-memory-access/ + +namespace at { namespace native { namespace memory { + +namespace detail { + +// What does the `static_unroll` do? +// +// We want to do something like: +// +// using args_t = typename traits::ArgsTuple; +// args_t args; +// #pragma unroll +// for (int i = 0; i < traits::arity; i++) { +// std::get(args) = .... 
+// } +// +// but unfortunately the above code does not work because +// the template argument has to be a compile time constant +// so `static_unroll` is created to simulate `#pragma unroll` +// using template metaprogramming. + +template typename func, int end, int current=0> +struct static_unroll { + template + static inline C10_HOST_DEVICE void with_args(Args&&... args) { + func::apply(std::forward(args)...); + static_unroll::with_args(args...); + } +}; + +template typename func, int end> +struct static_unroll { + template + static inline C10_HOST_DEVICE void with_args(Args... args) {} +}; + +// helper structs to be used with static_unroll to load arguments +// one by one + +template +struct vectorized_load_helper { + template + static __device__ void apply(policy_t &self, args_t *args, int idx) { + using arg_t = std::tuple_element_t; + // `data` hold the data_ptr for tensors [output, input0, input1, ...], so we + // need a +1 offset to get the input + auto ptr = reinterpret_cast(self.data[arg_index + 1]) + block_work_size() * idx; + auto args_accessor = [&args] __device__ (int thread_unroll_idx) -> arg_t & { return std::get(args[thread_unroll_idx]); }; + self.load_single_arg(args_accessor, ptr); + } +}; + +template +struct unroll_load_helper { + template + static __device__ void apply(policy_t &self, args_t *args, offset_t offset, loader_t loader, int j, int num_outputs) { + using arg_t = std::tuple_element_t; + // `data` hold the data_ptr for tensors [output, input0, input1, ...], so we + // need a +1 offset to get the input + std::get(args[j]) = loader.template load(self.data[arg_index + num_outputs], offset[arg_index], arg_index); + } +}; + +template +struct multi_outputs_store_helper { + template + C10_HOST_DEVICE static void apply( + at::detail::Array data, + at::detail::Array offsets, + thrust::tuple ret) { + using T = typename thrust::tuple_element>::type; + T *to = reinterpret_cast(data[current]) + offsets[current]; + *to = thrust::get(ret); + } +}; + +} // namespace detail + +struct LoadWithoutCast { + template + __device__ scalar_t load(char *base_ptr, uint32_t offset, int arg) { + return c10::load(reinterpret_cast(base_ptr) + offset); + } +}; + +template +struct LoadWithCast { + using array_t = at::detail::Array(N, 1)>; + using size_array_t = at::detail::Array(N, 1)>; + + array_t dtypes; + size_array_t element_sizes; + + LoadWithCast(const TensorIteratorBase& iter) { + ZOOM_KERNEL_ASSERT(iter.ninputs() == N); + #pragma unroll + for (auto i = 0; i < N; ++i) { + this->dtypes[i] = iter.dtype(i + iter.noutputs()); + element_sizes[i] = c10::elementSize(iter.dtype(i + iter.noutputs())); + } + } + + template + __device__ scalar_t load(char *base_ptr, uint32_t offset, int arg) { + void *ptr = base_ptr + element_sizes[arg] * offset; + return c10::fetch_and_cast(dtypes[arg], ptr); + } +}; + +struct StoreWithoutCast { + template + __device__ void store(scalar_t value, char *base_ptr, uint32_t offset, int arg = 0) { + *(reinterpret_cast(base_ptr) + offset) = value; + } +}; + +template +struct StoreWithCast { + using array_t = at::detail::Array(N, 1)>; + using size_array_t = at::detail::Array(N, 1)>; + + array_t dtypes; + size_array_t element_sizes; + + StoreWithCast(const TensorIteratorBase& iter) { + ZOOM_KERNEL_ASSERT(iter.noutputs() == N); + #pragma unroll + for (auto i = 0; i < N; ++i) { + this->dtypes[i] = iter.dtype(i); + element_sizes[i] = c10::elementSize(iter.dtype(i)); + } + } + + template + __device__ void store(scalar_t value, char *base_ptr, uint32_t offset, int arg = 0) { 
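+    // Scale the offset by this output's element size to get a byte address,
+    // then cast `value` from scalar_t to the tensor's dtype as it is stored.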
+ void *ptr = base_ptr + element_sizes[arg] * offset; + c10::cast_and_store(dtypes[arg], ptr, value); + } +}; + +// aligned vector generates vectorized load/store on CUDA +template +struct alignas(sizeof(scalar_t) * vec_size) aligned_vector { + scalar_t val[vec_size]; +}; + +template +__device__ aligned_vector load_vector(const scalar_t *base_ptr, uint32_t offset) { + using vec_t = aligned_vector; + auto *from = reinterpret_cast(base_ptr); + return from[offset]; +} + +template +__device__ aligned_vector load_vector(const bool *base_ptr, uint32_t offset) { + // See NOTE [Loading boolean values] + auto tmp = load_vector(reinterpret_cast(base_ptr), offset); + aligned_vector ret; + for (int i = 0; i < vec_size; ++i) { + ret.val[i] = bool(tmp.val[i]); + } + return ret; +} + +namespace policies { + +// Assumption: +// all tensors are contiguous, that is: stride == sizeof(type) for all tensors +template +struct unroll { + + data_t data; + int remaining; + inp_calc_t input_offset_calculator; + out_calc_t output_offset_calculator; + loader_t loader; + storer_t storer; + + __device__ unroll(data_t data, int remaining, inp_calc_t ic, out_calc_t oc, loader_t l, storer_t s): + data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc), loader(l), storer(s) {} + + __device__ inline bool check_inbounds(int thread_work_elem) { + return ((int)(threadIdx.x + thread_work_elem*num_threads()) < remaining); + } + + template + __device__ inline void load(args_t *args, int idx) { + constexpr int arity = std::tuple_size::value; + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < thread_work_size(); i++) { + if (thread_idx >= remaining) { + return; + } + int linear_idx = thread_idx + block_work_size() * idx; + auto offset = input_offset_calculator.get(linear_idx); + detail::static_unroll::with_args(*this, args, offset, loader, i, num_outputs); + thread_idx += num_threads(); + } + } + + template + __device__ inline void store(scalar_t *from, int idx) { + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < thread_work_size(); i++) { + if (thread_idx >= remaining) { + return; + } + int linear_idx = thread_idx + block_work_size() * idx; + int offset = output_offset_calculator.get(linear_idx)[0]; + storer.store(from[i], data[0], offset); + thread_idx += num_threads(); + } + } +}; + +// Assumption: +// all tensors are contiguous, that is: stride == sizeof(type) for all tensors +// Note: +// Functions in vectorized policy does not do boundary check. It assumes the whole block +// has its job to do. So the reminders should be handled by the caller manually. +template // vec_size: number of scalars, can be 1, 2, or 4. 
+struct vectorized { + + static_assert(thread_work_size() % vec_size == 0, "The workload per thread must be a multiple of vec_size"); + static constexpr int loop_size = thread_work_size() / vec_size; + + data_t data; + + __device__ vectorized(data_t data) : data(data) {} + + __device__ inline constexpr bool check_inbounds(int thread_work_elem) { + return true; + } + + template + __device__ inline void load_single_arg(accessor_t to, scalar_t *from) { + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < loop_size; i++) { + int index = thread_idx + i * num_threads(); + auto v = load_vector(from, index); + #pragma unroll + for (int j = 0; j < vec_size; j++) { + to(vec_size * i + j) = v.val[j]; + } + } + } + + template + __device__ inline void load(args_t *args, int idx) { + constexpr int arity = std::tuple_size::value; + detail::static_unroll::with_args(*this, args, idx); + } + + template + __device__ inline void store(scalar_t *from, int idx) { + using vec_t = aligned_vector; + scalar_t *to = reinterpret_cast(data[0]) + block_work_size() * idx; + vec_t *to_ = reinterpret_cast(to); + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < loop_size; i++) { + int index = thread_idx + i * num_threads(); + vec_t v; + for (int j = 0; j < vec_size; j++) { + v.val[j] = from[vec_size * i + j]; + } + to_[index] = v; + } + } +}; + +template +struct multi_outputs_unroll { + //multi_outputs_unroll struct members and check_inbounds and load methods are copypasted from unroll struct + //we don't use inheritance because of compiler bug in cuda 10.2+ + data_t data; + int remaining; + inp_calc_t input_offset_calculator; + out_calc_t output_offset_calculator; + LoadWithoutCast loader; + StoreWithoutCast storer; + + __device__ multi_outputs_unroll(data_t data, int remaining, inp_calc_t ic, out_calc_t oc): + data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc) {} + + __device__ inline bool check_inbounds(int thread_work_elem) { + return ((int)(threadIdx.x + thread_work_elem*num_threads()) < remaining); + } + + template + __device__ inline void load(args_t *args, int idx) { + constexpr int arity = std::tuple_size::value; + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < thread_work_size(); i++) { + if (thread_idx >= remaining) { + return; + } + int linear_idx = thread_idx + block_work_size() * idx; + auto offset = input_offset_calculator.get(linear_idx); + detail::static_unroll::with_args(*this, args, offset, loader, i, num_outputs); + thread_idx += num_threads(); + } + } + + + template + __device__ inline void store(return_t *from, int idx) { + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < thread_work_size(); i++) { + if (thread_idx >= this->remaining) { + return; + } + int linear_idx = thread_idx + block_work_size() * idx; + auto offsets = this->output_offset_calculator.get(linear_idx); + memory::detail::static_unroll::with_args(this->data, offsets, from[i]); + thread_idx += num_threads(); + } + } +}; + +} // namespace policies + +// This is only used in host, but we will wrap this into some templates +// which is C10_HOST_DEVICE, so we have to make this C10_HOST_DEVICE +// in order to compile +template +inline C10_HOST_DEVICE int can_vectorize_up_to(const char *pointer) { + uint64_t address = reinterpret_cast(pointer); + constexpr int vec2_alignment = std::alignment_of>::value; + constexpr int vec4_alignment = std::alignment_of>::value; + if (address % vec4_alignment == 0) { + return 4; + } else if 
(address % vec2_alignment == 0) { + return 2; + } + return 1; +} + +template +inline C10_HOST_DEVICE int can_vectorize_up_to(char *pointer) { + return can_vectorize_up_to(static_cast(pointer)); +} + +template +struct can_vectorize_up_to_helper { + template + static C10_HOST_DEVICE void apply(int &result, array_t pointers, traits _) { + using arg_t = typename traits::template arg::type; + // `pointers` hold the data_ptr for tensors [output, input0, input1, ...], so we + // need a +1 offset to get the input + result = std::min(result, can_vectorize_up_to(pointers[i + 1])); + } +}; + +template +inline int can_vectorize_up_to(array_t pointers) { + using traits = function_traits; + using return_t = typename traits::result_type; + constexpr int arity = traits::arity; + int result = can_vectorize_up_to(pointers[0]); + // We need to get the type for each argument of `func_t`, this can only + // be done at compile time. + detail::static_unroll::with_args(result, pointers, traits()); + return result; +} + +}}} // namespace at::native::memory \ No newline at end of file diff --git a/aten/src/ATen/zoom/jit/OffsetCalculator.cuh b/aten/src/ATen/zoom/jit/OffsetCalculator.cuh new file mode 100644 index 00000000000000..618d30a23f5dd0 --- /dev/null +++ b/aten/src/ATen/zoom/jit/OffsetCalculator.cuh @@ -0,0 +1,115 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +// If element_sizes is nullptr, then the strides will be in bytes, otherwise +// the strides will be in # of elements. +// Operands that share the same shape, but may have different strides. +// OffsetCalculator iterates the tensor in a column-major order + +constexpr int MAX_DIMS = 16; + +template +struct OffsetCalculator { + // We allow having negative strides to implement some operations like torch.flip + using stride_t = std::conditional_t, + index_t>; + // The offset for each argument. Wrapper around fixed-size array. + // On CUDA, zero sized array is not allowed, so when we are handling nullary + // operators, we need to create a size 1 offset to avoid compiler failure. + // This size 1 offset is just a placeholder, and we will not use it. + using offset_type = at::detail::Array(NARGS, 1)>; + + // if element_sizes is nullptr, then the strides will be in bytes, otherwise + // the strides will be in # of elements. + OffsetCalculator(int dims, const int64_t* sizes, const int64_t* const* strides, const int64_t* element_sizes=nullptr) : dims(dims) { + TORCH_CHECK(dims <= MAX_DIMS, "tensor has too many (>", MAX_DIMS, ") dims"); + for (int i=0; i < dims; i++){ + sizes_[i] = at::zoom::detail::IntDivider(sizes[i]); + for (int arg = 0; arg < NARGS; arg++) { + int64_t element_size = (element_sizes == nullptr ? 1LL : element_sizes[arg]); + strides_[i][arg] = strides[arg][i] / element_size; + } + } + } + + C10_HOST_DEVICE offset_type get(index_t linear_idx) const { + offset_type offsets; + #pragma unroll + for (int arg = 0; arg < NARGS; arg++) { + offsets[arg] = 0; + } + + #pragma unroll + for (int dim = 0; dim < MAX_DIMS; ++dim) { + if (dim == dims) { + break; + } + auto divmod = sizes_[dim].divmod(linear_idx); + linear_idx = divmod.div; + + #pragma unroll + for (int arg = 0; arg < NARGS; arg++) { + offsets[arg] += divmod.mod * strides_[dim][arg]; + } + + } + return offsets; + } + + int dims; + at::zoom::detail::IntDivider sizes_[MAX_DIMS]; + stride_t strides_[MAX_DIMS][std::max(NARGS, 1)]; +}; + +template +struct TrivialOffsetCalculator { + // The offset for each argument. Wrapper around fixed-size array. 
+ // The offsets are in # of elements, not in bytes. + // On CUDA, zero sized array is not allowed, so when we are handling nullary + // operators, we need to create a size 1 offset to avoid compiler failure. + // This size 1 offset is just a placeholder, and we will not use it. + using offset_type = at::detail::Array(NARGS, 1)>; + + C10_HOST_DEVICE offset_type get(index_t linear_idx) const { + offset_type offsets; + #pragma unroll + for (int arg = 0; arg < NARGS; arg++) { + offsets[arg] = linear_idx; + } + return offsets; + } +}; + +// Make an OffsetCalculator with byte offsets +template +static OffsetCalculator make_offset_calculator(const at::TensorIteratorBase& iter) { + TORCH_INTERNAL_ASSERT(N <= iter.ntensors()); + std::array strides; + for (int i = 0; i < N; i++) { + strides[i] = iter.strides(i).data(); + } + return OffsetCalculator(iter.ndim(), iter.shape().data(), strides.data()); +} + +// Make an OffsetCalculator with element offsets +template +static OffsetCalculator make_element_offset_calculator( + const at::TensorIteratorBase& iter) { + TORCH_INTERNAL_ASSERT(N <= iter.ntensors()); + std::array strides; + std::array element_sizes; + for (int i = 0; i < N; i++) { + strides[i] = iter.strides(i).data(); + element_sizes[i] = iter.element_size(i); + } + return OffsetCalculator( + iter.ndim(), iter.shape().data(), strides.data(), element_sizes.data()); +} \ No newline at end of file diff --git a/aten/src/ATen/zoom/jit/jit_utils.cpp b/aten/src/ATen/zoom/jit/jit_utils.cpp new file mode 100644 index 00000000000000..16f6b3807260d9 --- /dev/null +++ b/aten/src/ATen/zoom/jit/jit_utils.cpp @@ -0,0 +1,1752 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include // istreambuf_iterator +#include +#include + +// TODO: C++17 has the filesystem header, which may replace these +#ifdef _WIN32 + // On Windows, the POSIX implementations are considered deprecated. We simply map to the newer variant. + #include + #include + #include + #define access _access + #define getpid _getpid + #define R_OK 4 + #define W_OK 2 + #define F_OK 0 +#else + #include + #include // mkdir + #include +#endif + + +namespace at::zoom::jit { + +const std::string jit_preamble = R"ESCAPE( +#pragma clang force_cuda_host_device begin +)ESCAPE"; +const std::string jit_epilogue = R"ESCAPE( +#pragma clang force_cuda_host_device end +)ESCAPE"; + + +const std::string jit_common_types = R"ESCAPE( + #ifdef __HIPCC__ + #define ERROR_UNSUPPORTED_CAST ; + // corresponds to aten/src/ATen/native/cuda/thread_constants.h + #define CUDA_OR_ROCM_NUM_THREADS 256 + // corresponds to aten/src/ATen/cuda/detail/OffsetCalculator.cuh + #define MAX_DIMS 16 + #ifndef __forceinline__ + #define __forceinline__ inline __attribute__((always_inline)) + #endif + #else + //TODO use _assert_fail, because assert is disabled in non-debug builds + #define ERROR_UNSUPPORTED_CAST assert(false); + #define CUDA_OR_ROCM_NUM_THREADS 128 + #define MAX_DIMS 25 + #endif + #define POS_INFINITY __int_as_float(0x7f800000) + #define INFINITY POS_INFINITY + #define NEG_INFINITY __int_as_float(0xff800000) + #define NAN __int_as_float(0x7fffffff) + + typedef long long int int64_t; + typedef unsigned int uint32_t; + typedef signed char int8_t; + typedef unsigned char uint8_t; // NOTE: this MUST be "unsigned char"! 
"char" is equivalent to "signed char" + typedef short int16_t; + static_assert(sizeof(int64_t) == 8, "expected size does not match"); + static_assert(sizeof(uint32_t) == 4, "expected size does not match"); + static_assert(sizeof(int8_t) == 1, "expected size does not match"); + constexpr int num_threads = CUDA_OR_ROCM_NUM_THREADS; + constexpr int thread_work_size = 4; // TODO: make template substitution once we decide where those vars live + constexpr int block_work_size = thread_work_size * num_threads; + + ${traits_string} + ${cmath_string} + + // NB: Order matters for this macro; it is relied upon in + // _promoteTypesLookup and the serialization format. + // Note, some types have ctype as void because we don't support them in codegen + #define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(_) \ + _(uint8_t, Byte) /* 0 */ \ + _(int8_t, Char) /* 1 */ \ + _(int16_t, Short) /* 2 */ \ + _(int, Int) /* 3 */ \ + _(int64_t, Long) /* 4 */ \ + _(at::Half, Half) /* 5 */ \ + _(float, Float) /* 6 */ \ + _(double, Double) /* 7 */ \ + _(std::complex, ComplexHalf) /* 8 */ \ + _(std::complex, ComplexFloat) /* 9 */ \ + _(std::complex, ComplexDouble) /* 10 */ \ + _(bool, Bool) /* 11 */ \ + _(void, QInt8) /* 12 */ \ + _(void, QUInt8) /* 13 */ \ + _(void, QInt32) /* 14 */ \ + _(at::BFloat16, BFloat16) /* 15 */ \ + + #define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_QINT(_) \ + _(uint8_t, Byte) \ + _(int8_t, Char) \ + _(int16_t, Short) \ + _(int, Int) \ + _(int64_t, Long) \ + _(at::Half, Half) \ + _(float, Float) \ + _(double, Double) \ + _(std::complex, ComplexHalf) \ + _(std::complex, ComplexFloat) \ + _(std::complex, ComplexDouble) \ + _(bool, Bool) \ + _(at::BFloat16, BFloat16) + + + enum class ScalarType : int8_t { + #define DEFINE_ENUM(_1, n) n, + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_ENUM) + #undef DEFINE_ENUM + Undefined, + NumOptions + }; + + template + struct Array { + T data[size]; + + __device__ T operator[](int i) const { + return data[i]; + } + __device__ T& operator[](int i) { + return data[i]; + } + Array() = default; + Array(const Array&) = default; + Array& operator=(const Array&) = default; + __device__ Array(T x) { + for (int i = 0; i < size; i++) { + data[i] = x; + } + } + }; + + ${half_string} + ${bfloat16_string} + ${complex_body_string} + ${complex_half_body_string} + ${complex_math_string} + + +)ESCAPE"; + +//we need to include half, bfloat16 and complex strings to all kernels with half arguments and to all kernels with type casting +//regardless of whether they have half arguments (because fetch_and_cast and cast_and_store loop over all types) +const std::string jiterator_half_support_literal = R"ESCAPE( +namespace at { +struct alignas(2) Half { + unsigned short x; + + Half() = default; + inline __host__ __device__ Half(float value){ +#ifdef __HIPCC__ + x = __half_as_short(__float2half(value)); +#else + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(x) : "f"(value)); +#endif + } + inline __host__ __device__ Half(const __half& value) { + x = *reinterpret_cast(&value); + } + inline __host__ __device__ operator __half() const { + return *reinterpret_cast(&x); + } + inline __host__ __device__ operator float() const{ +#ifdef __HIPCC__ + return __half2float(*reinterpret_cast(&x)); +#else + float val; + asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(x)); // do we need const cast here? 
+ //asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(x))); + return val; +#endif + } +}; + + /// Arithmetic + + inline __host__ __device__ Half operator+(const Half& a, const Half& b) { + return static_cast(a) + static_cast(b); + } + + inline __host__ __device__ Half operator-(const Half& a, const Half& b) { + return static_cast(a) - static_cast(b); + } + + inline __host__ __device__ Half operator*(const Half& a, const Half& b) { + return static_cast(a) * static_cast(b); + } + + inline __host__ __device__ Half operator/(const Half& a, const Half& b) + { + return static_cast(a) / static_cast(b); + } + + inline __host__ __device__ Half operator-(const Half& a) { + #if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + defined(__HIP_DEVICE_COMPILE__) + return __hneg(a); + #elif defined(__SYCL_DEVICE_ONLY__) + return -c10::bit_cast(a); + #else + return -static_cast(a); + #endif + } + + inline __host__ __device__ Half& operator+=(Half& a, const Half& b) { + a = a + b; + return a; + } + + inline __host__ __device__ Half& operator-=(Half& a, const Half& b) { + a = a - b; + return a; + } + + inline __host__ __device__ Half& operator*=(Half& a, const Half& b) { + a = a * b; + return a; + } + + inline __host__ __device__ Half& operator/=(Half& a, const Half& b) { + a = a / b; + return a; + } + +} + + +)ESCAPE"; + +const std::string jiterator_bfloat16_support_literal = R"ESCAPE( +namespace at { +struct alignas(2) BFloat16 { + unsigned short x; + + __device__ unsigned short __internal_float2bfloat16( + const float f, + unsigned int& sign, + unsigned int& remainder) { + unsigned int x; + + x = __float_as_uint(f); + + if ((x & 0x7fffffffU) > 0x7f800000U) { + sign = 0U; + remainder = 0U; + return static_cast(0x7fffU); + } + sign = x >> 31; + remainder = x << 16; + return static_cast(x >> 16); + } + + + BFloat16() = default; + inline __host__ __device__ BFloat16(float value){ + #if __CUDA_ARCH__ >= 800 + asm("{ cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(x) : "f"(value)); + )ESCAPE" + R"ESCAPE( + #else + unsigned int sign; + unsigned int remainder; + x = __internal_float2bfloat16(value, sign, remainder); + if ((remainder > 0x80000000U) || + ((remainder == 0x80000000U) && ((x & 0x1U) != 0U))) { + x++; + } + #endif + } + + inline __host__ __device__ operator float() const{ +#ifdef __HIPCC__ + union + { + uint32_t int32; + float fp32; + } u = {uint32_t(x) << 16}; + return u.fp32; +#else + float val; + asm("{ mov.b32 %0, {0,%1};}\n" : "=f"(val) : "h"(x)); //do we need const cast here? 
+ return val; +#endif + } + +}; + + /// Arithmetic + + inline __host__ __device__ BFloat16 + operator+(const BFloat16& a, const BFloat16& b) { + return static_cast(a) + static_cast(b); + } + + inline __host__ __device__ BFloat16 + operator-(const BFloat16& a, const BFloat16& b) { + return static_cast(a) - static_cast(b); + } + + inline __host__ __device__ BFloat16 + operator*(const BFloat16& a, const BFloat16& b) { + return static_cast(a) * static_cast(b); + } + + inline __host__ __device__ BFloat16 operator/(const BFloat16& a, const BFloat16& b) { + return static_cast(a) / static_cast(b); + } + + inline __host__ __device__ BFloat16 operator-(const BFloat16& a) { + return -static_cast(a); + } + + inline __host__ __device__ BFloat16& operator+=(BFloat16& a, const BFloat16& b) { + a = a + b; + return a; + } + + inline __host__ __device__ BFloat16& operator-=(BFloat16& a, const BFloat16& b) { + a = a - b; + return a; + } + + inline __host__ __device__ BFloat16& operator*=(BFloat16& a, const BFloat16& b) { + a = a * b; + return a; + } + + inline __host__ __device__ BFloat16& operator/=(BFloat16& a, const BFloat16& b) { + a = a / b; + return a; + } + + inline __host__ __device__ BFloat16& operator|(BFloat16& a, const BFloat16& b) { + a.x = a.x | b.x; + return a; + } + + inline __host__ __device__ BFloat16& operator^(BFloat16& a, const BFloat16& b) { + a.x = a.x ^ b.x; + return a; + } + + inline __host__ __device__ BFloat16& operator&(BFloat16& a, const BFloat16& b) { + a.x = a.x & b.x; + return a; + } + +} +)ESCAPE"; + +// From c10/util/Load.h +const std::string load_support_literal = R"ESCAPE( + + namespace c10 { + template + struct LoadImpl { + __device__ static T apply(const void *src) { + return *reinterpret_cast(src); + } + }; + + template <> + struct LoadImpl { + __device__ static bool apply(const void *src) { + static_assert(sizeof(bool) == sizeof(char), ""); + return LoadImpl::apply(src); + } + }; + + template + __device__ T load(const void *src) { + return LoadImpl::apply(src); + } + + template + __device__ scalar_t load(const scalar_t *src) { + return LoadImpl::apply(src); + } + } // namespace c10 + +)ESCAPE"; + +// copy-pasted from c10/util/TypeCast.h and c10/core/DynamicCast.h +const std::string dynamic_cast_support_literal = R"ESCAPE( + + template + struct is_complex : public std::false_type {}; + + template + struct is_complex> : public std::true_type {}; + + template + struct needs_real { + constexpr static bool value = + (is_complex::value && !is_complex::value); + }; + + template + struct maybe_real { + static inline src_t apply(src_t src) { + return src; + } + }; + + template + struct maybe_real { + static inline decltype(auto) apply(src_t src) { + return src.real(); + } + }; + + template + struct static_cast_with_inter_type { + static inline dest_t apply( + src_t src) { + constexpr bool real = needs_real::value; + return static_cast(maybe_real::apply(src)); + } + }; + + template + struct static_cast_with_inter_type { + static inline uint8_t apply( + src_t src) { + constexpr bool real = needs_real::value; + return static_cast( + static_cast(maybe_real::apply(src))); + } + }; + + template <> + struct static_cast_with_inter_type, at::BFloat16> { + static inline std::complex apply(at::BFloat16 src) { + return static_cast>(float{src}); + } + }; + + template <> + struct static_cast_with_inter_type, at::Half> { + static inline std::complex apply(at::Half src) { + return static_cast>(float{src}); + } + }; + + template <> + struct static_cast_with_inter_type< + std::complex, + 
std::complex> { + static inline std::complex apply(std::complex src) { + return static_cast>(static_cast>(src)); + } + }; + + // Fetch a value with dynamic type src_type from ptr, and cast it to static type dest_t. + #define FETCH_AND_CAST_CASE(type, scalartype) \ + case ScalarType::scalartype: \ + return static_cast_with_inter_type::apply(c10::load(ptr)); + template + __device__ inline dest_t fetch_and_cast(const ScalarType src_type, const void *ptr) { + switch (src_type) { + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_QINT(FETCH_AND_CAST_CASE) + default: + ERROR_UNSUPPORTED_CAST + } + return dest_t(0); // just to avoid compiler warning + } + + // Cast a value with static type src_t into dynamic dest_type, and store it to ptr. + #define CAST_AND_STORE_CASE(type, scalartype) \ + case ScalarType::scalartype: \ + *(type*)ptr = static_cast_with_inter_type::apply(value); \ + return; + template + __device__ inline void cast_and_store(const ScalarType dest_type, void *ptr, src_t value) { + switch (dest_type) { + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_QINT(CAST_AND_STORE_CASE) + default:; + } + ERROR_UNSUPPORTED_CAST + } + + template + struct LoadWithCast { + using array_t = Array; + using size_array_t = Array; + + array_t dtypes; + size_array_t element_sizes; + template + __device__ scalar_t load(char* base_ptr, uint32_t offset, int arg) { + void* ptr = base_ptr + element_sizes[arg] * offset; + return fetch_and_cast(dtypes[arg], ptr); + } + }; + + template + struct StoreWithCast { + using array_t = Array; + using size_array_t = Array; + + array_t dtypes; + size_array_t element_sizes; + + template + __device__ void store(scalar_t value, char *base_ptr, uint32_t offset, int arg = 0) { + void *ptr = base_ptr + element_sizes[arg] * offset; + cast_and_store(dtypes[arg], ptr, value); + } + }; + +)ESCAPE"; + +const std::string no_dynamic_cast_support_literal = R"ESCAPE( + + struct LoadWithoutCast { + template + __device__ scalar_t load(char* base_ptr, uint32_t offset, int arg=0) { + return c10::load(reinterpret_cast(base_ptr) + offset); + } + }; + + struct StoreWithoutCast { + template + __device__ void store(scalar_t value, char *base_ptr, uint32_t offset, int arg=0) { + *(reinterpret_cast(base_ptr) + offset) = value; + } + }; + +)ESCAPE"; + +const std::string offset_calc_template = R"ESCAPE( + template + struct DivMod { + T div; + T mod; + + __device__ DivMod(T _div, T _mod) { + div = _div; + mod = _mod; + } + }; + + // + struct IntDivider { + IntDivider() = default; + + __device__ inline unsigned int div(unsigned int n) const { + unsigned int t = __umulhi(n, m1); + return (t + n) >> shift; + } + + __device__ inline unsigned int mod(unsigned int n) const { + return n - div(n) * divisor; + } + + __device__ inline DivMod divmod(unsigned int n) const { + unsigned int q = div(n); + return DivMod(q, n - q * divisor); + } + + unsigned int divisor; // d above. + unsigned int m1; // Magic number: m' above. + unsigned int shift; // Shift amounts. + }; + + template + struct TrivialOffsetCalculator { + // The offset for each argument. Wrapper around fixed-size array. + // The offsets are in # of elements, not in bytes. 
+ Array<${index_type}, NARGS> get(${index_type} linear_idx) const { + Array<${index_type}, NARGS> offsets; + #pragma unroll + for (int arg = 0; arg < NARGS; arg++) { + offsets[arg] = linear_idx; + } + return offsets; + } + }; + + template + struct OffsetCalculator { + OffsetCalculator() = default; + __device__ __forceinline__ Array<${index_type}, NARGS> get(${index_type} linear_idx) const { + Array<${index_type}, NARGS> offsets; + #pragma unroll + for (int arg = 0; arg < NARGS; ++arg) { + offsets[arg] = 0; + } + + #pragma unroll + for (int dim = 0; dim < MAX_DIMS; ++dim) { + if (dim == dims) { + break; + } + + auto divmod = sizes_[dim].divmod(linear_idx); + linear_idx = divmod.div; + + #pragma unroll + for (int arg = 0; arg < NARGS; ++arg) { + offsets[arg] += divmod.mod * strides_[dim][arg]; + } + //printf("offset calc thread dim size stride offset %d %d %d %d %d %d %d %d\n", + //threadIdx.x, dim, sizes_[dim].divisor, strides_[dim][0], offsets[0], linear_idx, divmod.div, divmod.mod); + } + return offsets; + } + + int dims; + IntDivider sizes_[MAX_DIMS]; + // NOTE: this approach will not support nInputs == 0 + ${index_type} strides_[MAX_DIMS][NARGS]; + }; + + +)ESCAPE"; + +const std::string jit_code_template = R"ESCAPE( + + ${load_support} + ${dynamic_casting_string} + + + ${functor} + + // TODO: setup grid-stride loop + extern "C" __global__ + void ${name}_kernel( + const int numel, + Array data, //[${nInputs}+${nOutputs}], + ${offset_calculator}<${nInputs}> input_calculator, + ${offset_calculator}<${nOutputs}> output_calculator, + ${loader} l, + ${storer} s, + ${compute_type} scalar_val${extra_params}) { + ${declare_load_arrays} + ${declare_store_arrays} + + int idx = blockIdx.x; + + int remaining = numel - block_work_size * idx; + int thread_idx = threadIdx.x; + + #pragma unroll + for (int j = 0; j < thread_work_size; j++){ + if (thread_idx >= remaining) { + break; + } + + int linear_idx = thread_idx + block_work_size * idx; + auto input_offsets = input_calculator.get(linear_idx); + ${load_inputs} + // printf( + // "thread %d a %f offsets %d\n", threadIdx.x, arg0[j], input_offsets[0]); + thread_idx += num_threads; + } + + #pragma unroll + for (int j = 0; j < thread_work_size; j++) { + if ((threadIdx.x + j*num_threads) < remaining) { + ${call_functor} + } + } + + thread_idx = threadIdx.x; + #pragma unroll + for (int j = 0; j < thread_work_size; j++){ + if (thread_idx >= remaining) { + break; + } + //TODO maybe think about unifying offset calculators and reuse + //offsets computed in the load loop + int linear_idx = thread_idx + block_work_size * idx; + auto output_offsets = output_calculator.get(linear_idx); + //printf("output thread %d offset %d\n", threadIdx.x, output_offsets[0]); + ${store_outputs} + thread_idx += num_threads; + } + } +)ESCAPE"; + +const std::string jit_vectorized_code_template = R"ESCAPE( + + ${load_support} + + template + __device__ __inline__ scalar_t load(char* base_ptr, uint32_t offset) { + return c10::load(reinterpret_cast(base_ptr) + offset); + } + + template + __device__ __inline__ void store(scalar_t value, char *base_ptr, uint32_t offset) { + *(reinterpret_cast(base_ptr) + offset) = value; + } + + // aligned vector generates vectorized load/store on CUDA + template + struct alignas(sizeof(scalar_t) * vec_size) aligned_vector { + scalar_t val[vec_size]; + }; + + template + __device__ aligned_vector load_vector(const scalar_t *base_ptr, uint32_t offset) { + using vec_t = aligned_vector; + auto *from = reinterpret_cast(base_ptr); + return from[offset]; + } + + 
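The `aligned_vector` wrapper in this template is what turns per-element accesses into one wide memory transaction per thread, and the `bool` overload that follows goes through `char` because a raw byte in memory may not be a valid `bool` representation. A minimal standalone sketch of the same idea, assuming HIP; the names `aligned_vector_demo` and `copy_vec4` are illustrative and do not appear in the patch:

    #include <hip/hip_runtime.h>

    template <typename scalar_t, int vec_size>
    struct alignas(sizeof(scalar_t) * vec_size) aligned_vector_demo {
      scalar_t val[vec_size];
    };

    // Each thread moves four contiguous floats; with 16-byte alignment the
    // compiler can emit a single 128-bit load and store per thread.
    __global__ void copy_vec4(const float* in, float* out, int n_vec) {
      using vec4 = aligned_vector_demo<float, 4>;
      const int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n_vec) {
        const vec4* src = reinterpret_cast<const vec4*>(in);
        vec4* dst = reinterpret_cast<vec4*>(out);
        dst[i] = src[i];
      }
    }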
template + __device__ aligned_vector load_vector(const bool *base_ptr, uint32_t offset) { + // See NOTE [Loading boolean values] + auto tmp = load_vector(reinterpret_cast(base_ptr), offset); + aligned_vector ret; + for (int i = 0; i < vec_size; ++i) { + ret.val[i] = bool(tmp.val[i]); + } + return ret; + } + + ${functor} + + // TODO: setup grid-stride loop + + extern "C" __global__ + void ${name}_vectorized${vec_size}_kernel( + const int N, + Array data, + ${compute_type} scalar_val${extra_params}) //[${nInputs}+${nOutputs}], + { + constexpr int vec_size = ${vec_size}; + using scalar_t = ${scalar_type}; + int remaining = N - block_work_size * blockIdx.x; + int thread_idx = threadIdx.x; + int idx = blockIdx.x; + ${declare_load_arrays} + ${declare_store_arrays} + + if (remaining < block_work_size) { + #pragma unroll + for (int j = 0; j < thread_work_size; j++){ + if (thread_idx >= remaining) { + break; + } + int linear_idx = thread_idx + block_work_size * idx; + ${load_unrolled_inputs} + thread_idx += num_threads; + } + #pragma unroll + for (int j = 0; j < thread_work_size; j++) { + if ((threadIdx.x + j*num_threads) < remaining) { + ${call_functor} + } + } + thread_idx = threadIdx.x; + #pragma unroll + for (int j = 0; j < thread_work_size; j++) { + if (thread_idx >= remaining) { + break; + } + int linear_idx = thread_idx + block_work_size * idx; + ${store_unrolled_outputs} + thread_idx += num_threads; + } + } else { + static constexpr int loop_size = thread_work_size / vec_size; + //actual loading + ${vector_inputs} + #pragma unroll + for (int i = 0; i; + ${vector_outputs} + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i + __device__ T zero_init() { + return T(0); + } + + template <> + __device__ hipFloatComplex zero_init() { + return make_hipFloatComplex(0.0f, 0.0f); + } + + template <> + __device__ hipDoubleComplex zero_init() { + return make_hipDoubleComplex(0.0, 0.0); + } + + // kernels can use scalar_t as a template type in their implementation + using scalar_t = ${scalar_t}; + ${kernel} + +)ESCAPE"; + +static void replace_all(std::string& s, const std::string& to_replace, const std::string& replace_with) { + std::ostringstream oss; + std::size_t pos = 0; + std::size_t prev_pos = pos; + + while (true) { + prev_pos = pos; + pos = s.find(to_replace, pos); + if (pos == std::string::npos) + break; + oss << s.substr(prev_pos, pos - prev_pos); + oss << replace_with; + pos += to_replace.size(); + } + + oss << s.substr(prev_pos); + s = oss.str(); +} + +// hipify replaces certain device math functions, e.g., std::max -> ::max +// See torch/utils/hipify/cuda_to_hip_mappings.py. +// Replace them back. Search for " ::" to avoid duplicate replacements. 
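The helper defined next undoes those hipify rewrites inside user-supplied functor strings before they are JIT-compiled. A tiny self-contained illustration of the leading-space search-and-replace convention; `replace_all_demo` and the `main` driver are illustrative only, not taken from the patch:

    #include <iostream>
    #include <string>

    // Replace every occurrence of `from` with `to` in `s` (simple variant).
    static void replace_all_demo(std::string& s, const std::string& from, const std::string& to) {
      std::size_t pos = 0;
      while ((pos = s.find(from, pos)) != std::string::npos) {
        s.replace(pos, from.size(), to);
        pos += to.size();
      }
    }

    int main() {
      std::string functor = "T r = ::max(a, b) + ::min(a, b);";
      // Searching for " ::max" (with the leading space) cannot match a token
      // that is already "std::max", which prevents double rewrites.
      replace_all_demo(functor, " ::max", " std::max");
      replace_all_demo(functor, " ::min", " std::min");
      std::cout << functor << "\n";  // prints: T r = std::max(a, b) + std::min(a, b);
    }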
+static std::string unhipify_math_functions(const std::string &original) { + static std::vector> mappings = { + {" std::max", " ::max"}, + {" std::min", " ::min"}, + {" std::ceil", " ::ceil"}, + {" std::floor", " ::floor"}, + {" std::exp", " ::exp"}, + {" std::log", " ::log"}, + {" std::pow", " ::pow"}, + {" std::fabs", " ::fabs"}, + {" std::fmod", " ::fmod"}, + {" std::remainder", " ::remainder"}, + {" std::frexp", " ::frexp"} + }; + std::string ret = original; + for (const auto& mapping : mappings) { + replace_all(ret, mapping.second, mapping.first); + } + return ret; +} + +// The following is copied from fused_kernel.cpp +// TODO: refactor codegenOutputQuery into its own file +// that can be included by both files +// See NOTE [ USE OF NVRTC AND DRIVER API ] +const at::zoom::HIPRTC& hiprtc() { + return at::globalContext().getHIPRTC(); +} + +// query codegen output arch and target +// TODO refactor so this function is usable both from jit and from aten +void codegenOutputQuery( + const hipDeviceProp_t* const prop, + int& hip_major, + int& hip_minor, + int& hiprtc_major, + int& hiprtc_minor, + bool& compile_to_sass) { + ZOOM_HIPRTC_CHECK(hiprtc().hiprtcVersion(&hiprtc_major, &hiprtc_minor)); + hip_major = prop->major; + hip_minor = prop->minor; + compile_to_sass = false; +} + +// TODO: another copy paste from jit, refactor so it's usable from both +// TODO: try making the CUcontext thread local to see if that improves performance - why is this slow? +void initializeZoomContext() { + // lazily construct context if non-existing yet; + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + hipCtx_t pctx = nullptr; + HIP_DRIVER_CHECK(at::globalContext().getHIPRTC().hipCtxGetCurrent(&pctx)); + if (!pctx) { + std::unique_lock hipFreeMutexLock( + *(c10::zoom::getFreeMutex())); + hipFree(nullptr); + } +} + +std::string generate_code( + const KernelDescriptor &desc, + bool contiguous, + bool dynamic_casting, + BinaryFuncVariant scalar_pos, + bool vectorized, + int vec_size, + bool return_by_ref) { + c10::SmallVector extra_args_typenames(desc.extra_args_types.size()); + for (auto i : c10::irange(extra_args_typenames.size())) { + extra_args_typenames[i] = typeName(desc.extra_args_types[i]); + } + + return generate_code( + desc.nInputs, + desc.nOutputs, + desc.f, + desc.name, + typeName(desc.f_inputs_type), + typeName(toOpMathType(desc.f_inputs_type)), + typeName(desc.result_type), + contiguous, + dynamic_casting, + scalar_pos, + extra_args_typenames, + vectorized, + vec_size, + return_by_ref); +} + +//FIXME - this are defined in Loops.cuh, but including Loops.cuh here would lead to circular includes Loops.cuh -> CUDALoops.cuh -> jit_utils.h -> Loops.cuh +#define THREAD_WORK_SIZE 4 +constexpr int thread_work_size = THREAD_WORK_SIZE; + +std::string generate_code( + int nInputs, + int nOutputs, + const std::string& func_, + const std::string& name, + const std::string& f_inputs_type, + const std::string& compute_type, + const std::string& result_type, + bool contiguous, + bool dynamic_casting, + BinaryFuncVariant scalar_pos, + c10::SmallVector& extra_args_typenames, + bool vectorized, + int vec_size, + bool return_by_ref) { + std::string func = func_; + at::jit::TemplateEnv env; + + env.s("index_type", "unsigned int"); + env.s("nInputs", std::to_string(nInputs)); + env.s("nOutputs", std::to_string(nOutputs)); + env.s("scalar_type", f_inputs_type); + env.s("compute_type", compute_type); + env.s("functor", func); + env.s("name", name); + env.s("cmath_string", get_cmath_string()); + + // Generate 
`extra_params` for function signature + // and `extra_args` for computation call if + // extra arguments to capture runtime state are passed. + // (look at polygamma for example). + std::string extra_params = ""; + std::string extra_args = ""; + for (size_t i = 0; i < extra_args_typenames.size(); i++) { + auto type = std::string(extra_args_typenames[i]); + auto name = "extra_arg_" + std::string(to_string(i)); + extra_params += "," + type + " " + name; + extra_args += ", " + name; + } + env.s("extra_params", extra_params); + env.s("extra_args", extra_args); + + std::stringstream declare_load_arrays; + for (int i = 0; i < nInputs; i++) { + // TODO these arrays are potentially of the different types, use function + // traits to determine the types + declare_load_arrays << f_inputs_type << " arg" << std::to_string(i) + << "[" << std::to_string(thread_work_size) << "];\n"; + } + env.s("declare_load_arrays", declare_load_arrays.str()); + + std::stringstream declare_store_arrays; + for (int i = 0; i < nOutputs; i++) { + declare_store_arrays << result_type << " out" << std::to_string(i) + << "[" << std::to_string(thread_work_size) << "];\n"; + } + env.s("declare_store_arrays", declare_store_arrays.str()); + + std::stringstream functor_args; + if (scalar_pos == BinaryFuncVariant::NoScalar) { + for (int i = 0; i < nInputs - 1; i++) { + functor_args << "arg" << std::to_string(i) << "[j], "; + } + functor_args << "arg" << std::to_string(nInputs - 1) << "[j]"; + } else if (scalar_pos == BinaryFuncVariant::LhsScalar) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(nInputs == 1); + functor_args << "scalar_val, arg0[j]"; + } else { //RhsScalar + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(nInputs == 1); + functor_args << "arg0[j], scalar_val"; + } + env.s("args", functor_args.str()); + + std::string call_functor_template; + if (return_by_ref) { // return one or more outputs by reference + bool need_temp_out = (compute_type != result_type); + std::stringstream functor_outs; + if (need_temp_out) { + for (int i = 0; i < nOutputs - 1; i++) { + functor_outs << "temp_out" << std::to_string(i) << ", "; + } + functor_outs << "temp_out" << std::to_string(nOutputs - 1); + } else { + for (int i = 0; i < nOutputs - 1; i++) { + functor_outs << "out" << std::to_string(i) << "[j], "; + } + functor_outs << "out" << std::to_string(nOutputs - 1) << "[j]"; + } + env.s("functor_outs", functor_outs.str()); + + if (need_temp_out) { + call_functor_template += "${compute_type} ${functor_outs};\n"; + } + + call_functor_template += "${name}<${compute_type}>(${args} ${extra_args}, ${functor_outs});\n"; + + if (need_temp_out) { + for (int i = 0; i < nOutputs; i++) { + auto i_string = std::to_string(i); + call_functor_template += "out" +i_string + "[j] = temp_out" + i_string + ";\n"; + } + } + + } else { // return by value for single output functor + call_functor_template = "out0[j] = ${name}<${compute_type}>(${args} ${extra_args});"; + } + env.s("call_functor", at::jit::CodeTemplate(call_functor_template).format(env)); + + if (f_inputs_type == "at::Half" || result_type == "at::Half" || + f_inputs_type == "std::complex" || + result_type == "std::complex" || dynamic_casting) { + // complex depends on complex and Half dtypes. 
+ env.s("half_string", jiterator_half_support_literal); + } else { + env.s("half_string", ""); + } + if (f_inputs_type == "at::BFloat16" || result_type == "at::BFloat16" || dynamic_casting) { + env.s("bfloat16_string", jiterator_bfloat16_support_literal); + } else { + env.s("bfloat16_string", ""); + } + // the definition of complex math functions is only needed when the compute type is complex + // but the definition of std::complex is needed for dynamic casting even if the compute type is not complex + if (f_inputs_type == "std::complex" || result_type == "std::complex" || + f_inputs_type == "std::complex" || result_type == "std::complex" || + f_inputs_type == "std::complex" || result_type == "std::complex") { + // complex depends on complex and Half dtypes. + env.s("traits_string", get_traits_string()); + env.s("complex_body_string", get_complex_body_string()); + env.s("complex_math_string", get_complex_math_string()); + + // unhipify math functions, but only if std::complex is used. + func = unhipify_math_functions(func); + env.s("functor", func); + + } else if (dynamic_casting) { + env.s("traits_string", get_traits_string()); + env.s("complex_body_string", get_complex_body_string()); + env.s("complex_math_string", ""); + } else { + env.s("traits_string", ""); + env.s("complex_body_string", ""); + env.s("complex_math_string", ""); + } + if (f_inputs_type == "std::complex" || + result_type == "std::complex" || dynamic_casting) { + // dynamic_casting requires the definition of all types + // include complex + // Look at the definition of `StoreWithCast` and `LoadWithCast`. + env.s("complex_half_body_string", get_complex_half_body_string()); + } else { + env.s("complex_half_body_string", ""); + } + + env.s("load_support", load_support_literal); + + if (!vectorized) { + if (!dynamic_casting) { + env.s("loader", "LoadWithoutCast"); + env.s("storer", "StoreWithoutCast"); + env.s("dynamic_casting_string", no_dynamic_cast_support_literal); + } else { + env.s("loader", std::string("LoadWithCast<" + std::to_string(nInputs) + ">")); + env.s("storer", std::string("StoreWithCast<" + std::to_string(nOutputs) + ">")); + env.s("dynamic_casting_string", dynamic_cast_support_literal); + } + + if (contiguous) { + env.s("offset_calculator", "TrivialOffsetCalculator"); + } else { + env.s("offset_calculator", "OffsetCalculator"); + } + + std::stringstream load_inputs; + for (int i = 0; i < nInputs; i++) { + auto i_string = std::to_string(i); + load_inputs << "arg" << i_string << "[j] = l.load<" << f_inputs_type + << ">(data[" << std::to_string(i + nOutputs) + << "], input_offsets[" << i_string << "], " << i_string + << ");\n"; + } + env.s("load_inputs", load_inputs.str()); + + std::stringstream store_outputs; + for (int i = 0; i < nOutputs; i++) { + auto i_string = std::to_string(i); + store_outputs << "s.store<" << result_type + << ">(out" << i_string << "[j], data[" << i_string + << "], output_offsets[" << i_string << "], " << i_string + << ");\n"; + } + env.s("store_outputs", store_outputs.str()); + + static auto hip_template = at::jit::CodeTemplate( + jit_preamble + jit_common_types + offset_calc_template + jit_code_template + jit_epilogue); + const auto code = hip_template.format(env); + return code; + } + + // vectorized case + env.s("vec_size", std::to_string(vec_size)); + env.s("result_type", result_type); + + std::stringstream vector_inputs; + for (const auto i : c10::irange(nInputs)){ + auto i_string = std::to_string(i); + vector_inputs << "auto * input" << i_string << + " = reinterpret_cast(data[" 
<< i_string << "+" << nOutputs << "])" << + " + block_work_size * idx;\n"; + } + env.s("vector_inputs", vector_inputs.str()); + + std::stringstream vector_outputs; + for (const auto i : c10::irange(nOutputs)){ + auto i_string = std::to_string(i); + vector_outputs << "vec_t_output* to_" << i_string << + " = reinterpret_cast(data[" << i_string << "])" << + " + block_work_size / vec_size * idx;\n"; + } + env.s("vector_outputs", vector_outputs.str()); + + std::stringstream load_vectorized_inputs; + for (const auto i : c10::irange(nInputs)) { + auto i_string = std::to_string(i); + load_vectorized_inputs << "const auto vec" << i_string << " = load_vector(" + << "input" << i_string << ", thread_idx);\n"; + load_vectorized_inputs << "#pragma unroll\n"; + load_vectorized_inputs << "for (int j=0; j < vec_size; j++){\n"; + load_vectorized_inputs << " arg" << i_string << "[vec_size * i + j] = vec" << i_string << ".val[j];\n"; + load_vectorized_inputs << "}\n"; + } + env.s("load_vectorized_inputs", load_vectorized_inputs.str()); + + std::stringstream store_vectorized_outputs; + for (const auto i : c10::irange(nOutputs)) { + auto i_string = std::to_string(i); + store_vectorized_outputs << "#pragma unroll\n"; + store_vectorized_outputs << "for (int j=0; j(data[" << std::to_string(i + nOutputs) << "], linear_idx);\n"; + } + env.s("load_unrolled_inputs", load_unrolled_inputs.str()); + + std::stringstream store_unrolled_outputs; + for (const auto i : c10::irange(nOutputs)) { + auto i_string = std::to_string(i); + store_unrolled_outputs << "store<" << result_type << ">(out" << i_string + << "[j], data[" << i_string << "], linear_idx);\n"; + } + env.s("store_unrolled_outputs", store_unrolled_outputs.str()); + + static auto hip_template = at::jit::CodeTemplate( + jit_preamble + jit_common_types + jit_vectorized_code_template + jit_epilogue); + const auto code = hip_template.format(env); + return code; +} + +std::string zoom_generate_code( + const KernelDescriptor &desc, + bool dynamic_casting + ) { + c10::SmallVector extra_args_typenames(desc.extra_args_types.size()); + for (auto i : c10::irange(extra_args_typenames.size())) { + extra_args_typenames[i] = typeName(desc.extra_args_types[i]); + } + + return zoom_generate_code( + desc.nInputs, + desc.nOutputs, + desc.f, + desc.name, + typeName(desc.f_inputs_type), + typeName(toOpMathType(desc.f_inputs_type)), + typeName(desc.result_type), + dynamic_casting, + extra_args_typenames + ); +} + +std::string zoom_generate_code( + int nInputs, + int nOutputs, + const std::string& func_, + const std::string& name, + const std::string& f_inputs_type, + const std::string& compute_type, + const std::string& result_type, + bool dynamic_casting, + c10::SmallVector& extra_args_typenames +) { + std::string func = func_; + at::jit::TemplateEnv env; + + env.s("index_type", "unsigned int"); + env.s("nInputs", std::to_string(nInputs)); + env.s("nOutputs", std::to_string(nOutputs)); + env.s("scalar_t", f_inputs_type); + // std::complex and hipComplex have the same memory layout so we can readily + // replace these with one another, and this makes writing kernels much easier. 
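The substitution below leans on `std::complex<float>` and `hipFloatComplex` being layout-compatible (two contiguous floats, likewise for the double-precision pair). A small host-side sketch of that claim, assuming the ROCm `hip/hip_complex.h` header is available; the `static_assert`s are illustrative and not part of the patch:

    #include <complex>
    #include <hip/hip_complex.h>  // hipFloatComplex, hipDoubleComplex

    // Both pairs are two contiguous floats / doubles, so a buffer written
    // through one type can be read through the other.
    static_assert(sizeof(std::complex<float>) == sizeof(hipFloatComplex),
                  "layout mismatch for single-precision complex");
    static_assert(sizeof(std::complex<double>) == sizeof(hipDoubleComplex),
                  "layout mismatch for double-precision complex");

    // e.g. a tensor's std::complex<float> data pointer can be handed to a
    // kernel whose scalar_t was rewritten to hipFloatComplex:
    //   auto* z = reinterpret_cast<hipFloatComplex*>(complex_float_ptr);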
+ if(f_inputs_type == "std::complex") { + env.s("scalar_t", "hipFloatComplex"); + } + else if(f_inputs_type == "std::complex") { + env.s("scalar_t", "hipDoubleComplex"); + } + env.s("compute_type", compute_type); + env.s("kernel", func); + env.s("name", name); + env.s("cmath_string", get_cmath_string()); + + // Generate `extra_params` for function signature + // and `extra_args` for computation call if + // extra arguments to capture runtime state are passed. + // (look at polygamma for example). + std::string extra_params = ""; + std::string extra_args = ""; + for (size_t i = 0; i < extra_args_typenames.size(); i++) { + auto type = std::string(extra_args_typenames[i]); + auto name = "extra_arg_" + std::string(to_string(i)); + extra_params += "," + type + " " + name; + extra_args += ", " + name; + } + env.s("extra_params", extra_params); + env.s("extra_args", extra_args); + + if (f_inputs_type == "at::Half" || result_type == "at::Half" || + f_inputs_type == "std::complex" || + result_type == "std::complex" || dynamic_casting) { + // complex depends on complex and Half dtypes. + env.s("half_string", jiterator_half_support_literal); + } else { + env.s("half_string", ""); + } + if (f_inputs_type == "at::BFloat16" || result_type == "at::BFloat16" || dynamic_casting) { + env.s("bfloat16_string", jiterator_bfloat16_support_literal); + } else { + env.s("bfloat16_string", ""); + } + // the definition of complex math functions is only needed when the compute type is complex + // but the definition of std::complex is needed for dynamic casting even if the compute type is not complex + if (f_inputs_type == "std::complex" || result_type == "std::complex" || + f_inputs_type == "std::complex" || result_type == "std::complex" || + f_inputs_type == "std::complex" || result_type == "std::complex") { + // complex depends on complex and Half dtypes. + env.s("traits_string", get_traits_string()); + env.s("complex_body_string", get_complex_body_string()); + env.s("complex_math_string", get_complex_math_string()); + + // unhipify math functions, but only if std::complex is used. + func = unhipify_math_functions(func); + env.s("functor", func); + + } else if (dynamic_casting) { + env.s("traits_string", get_traits_string()); + env.s("complex_body_string", get_complex_body_string()); + env.s("complex_math_string", ""); + } else { + env.s("traits_string", ""); + env.s("complex_body_string", ""); + env.s("complex_math_string", ""); + } + if (f_inputs_type == "std::complex" || + result_type == "std::complex" || dynamic_casting) { + // dynamic_casting requires the definition of all types + // include complex + // Look at the definition of `StoreWithCast` and `LoadWithCast`. 
+ env.s("complex_half_body_string", get_complex_half_body_string()); + } else { + env.s("complex_half_body_string", ""); + } + + env.s("load_support", load_support_literal); + if (!dynamic_casting) { + env.s("loader", "LoadWithoutCast"); + env.s("storer", "StoreWithoutCast"); + env.s("dynamic_casting_string", no_dynamic_cast_support_literal); + } else { + env.s("loader", std::string("LoadWithCast<" + std::to_string(nInputs) + ">")); + env.s("storer", std::string("StoreWithCast<" + std::to_string(nOutputs) + ">")); + env.s("dynamic_casting_string", dynamic_cast_support_literal); + } + + static auto hip_template = at::jit::CodeTemplate( + jit_preamble + jit_common_types + offset_calc_template + zoom_jit_code_template + jit_epilogue); + const auto code = hip_template.format(env); + return code; + +} + +// Creates directories recursively +bool _r_mkdir(const std::string& dir) { + // Check if current dir exists + const char* p_dir = dir.c_str(); + const bool dir_exists = (access(p_dir, F_OK) == 0); + if (dir_exists) { + return true; + } + + // Try to create current directory +#ifdef _WIN32 + int ret = _mkdir(dir.c_str()); +#else + int ret = mkdir(dir.c_str(), S_IRWXU | S_IRWXG | S_IRWXO); +#endif + // Success + if (ret == 0) { + return true; + } + + // Find folder separator and check if we are at the top + auto pos = dir.find_last_of("/\\"); + if (pos == std::string::npos) { + return false; + } + + // Try to create parent directory + if (!(_r_mkdir(dir.substr(0, pos)))) { + return false; + } + + // Try to create complete path again +#ifdef _WIN32 + ret = _mkdir(dir.c_str()); +#else + ret = mkdir(dir.c_str(), S_IRWXU | S_IRWXG | S_IRWXO); +#endif + return ret == 0; +} + +// Creates directories recursively assuming that base exists +bool r_mkdir_with_base(std::string& base, std::string& dir){ + const char* p_base = base.c_str(); + const bool base_exists = (access(p_base, F_OK) == 0); + if (!base_exists) { + return false; + } + + // remove trailing '/' or '\\' + if ((base[base.size()-1]=='/') || base[base.size()-1]=='\\') { + base.pop_back(); + } + if ((dir[dir.size()-1]=='/') || dir[dir.size()-1]=='\\') { + dir.pop_back(); + } + + return _r_mkdir(base+dir); + +} + +std::string load_code_template(const std::string& path) { + std::ifstream ifs{path}; + std::string s{ + std::istreambuf_iterator(ifs), + std::istreambuf_iterator()}; + return s; +} + +std::string generate_reduction_code( + const KernelDescriptor &desc, + int vt0, + bool contiguous, + bool vectorized, + int vec_size, + int max_threads_codegen) { + TORCH_INTERNAL_ASSERT(desc.nInputs == 1); + TORCH_INTERNAL_ASSERT(desc.extra_args_types.size() == 0); + + return generate_reduction_code( + desc.nOutputs, + desc.f, + desc.name, + vt0, + typeName(desc.f_inputs_type), + typeName(toOpMathType(desc.f_inputs_type)), + typeName(desc.result_type), + contiguous, + vectorized, + vec_size, + max_threads_codegen + ); +} + +std::string generate_reduction_code( + int nOutputs, + const std::string& func_, + const std::string& name, + const int vt0, + const std::string& f_inputs_type, + const std::string& reduction_accum_type, + const std::string& result_type, + bool contiguous, + bool vectorized, + int vec_size, + int max_threads_codegen) { + std::string func = func_; + at::jit::TemplateEnv env; + env.s("index_type", "unsigned int"); + env.s("scalar_type", f_inputs_type); + env.s("result_type", result_type); + env.s("reduction_accum_type", reduction_accum_type); + env.s("vt0", std::to_string(vt0)); + env.s("name", name); + env.s("max_threads_lb", 
std::to_string(max_threads_codegen)); + // reductions don't support dynamic casting, so the only way to get nonstandard types + // is through input + if (f_inputs_type == "at::Half" || f_inputs_type == "std::complex") { + // complex depends on complex and Half dtypes. + env.s("half_string", jiterator_half_support_literal); + } else { + env.s("half_string", ""); + } + if (f_inputs_type == "at::BFloat16") { + env.s("bfloat16_string", jiterator_bfloat16_support_literal); + } else { + env.s("bfloat16_string", ""); + } + if (f_inputs_type == "std::complex" || + f_inputs_type == "std::complex" || + f_inputs_type == "std::complex" ) { + // complex depends on complex and Half dtypes. + env.s("traits_string", get_traits_string()); + env.s("complex_body_string", get_complex_body_string()); + env.s("complex_math_string", get_complex_math_string()); + env.s("complex", std::to_string(1)); + // unhipify math functions, but only if std::complex is used. + func = unhipify_math_functions(func); + } else { + env.s("traits_string", ""); + env.s("complex_body_string", ""); + env.s("complex_math_string", ""); + env.s("complex", std::to_string(0)); + } + if (f_inputs_type == "std::complex") { + env.s("complex_half_body_string", get_complex_half_body_string()); + } else { + env.s("complex_half_body_string", ""); + } + env.s("cmath_string", get_cmath_string()); + env.s("functor", func); + env.s("output_vec_size", std::to_string(vec_size)); + static auto hip_template = at::jit::CodeTemplate( + jit_preamble + jit_common_types + offset_calc_template + get_reduction_template() + jit_epilogue); + const auto code = hip_template.format(env); + return code; +} + +// Acquires (possibly creating) the kernel cache directory +std::optional get_cache_dir() { + // If the environment variable USE_TORCH_KERNEL_CACHE is set to "0" then no persistent cache is used + const char* uptkc = std::getenv("USE_PYTORCH_KERNEL_CACHE"); + const bool use_kernel_cache = (uptkc == nullptr) ? true : std::strcmp(uptkc, "0"); + + if (!use_kernel_cache) { + return {}; + } + + // Cache path comes from PYTORCH_KERNEL_CACHE_PATH, then TEMP (Windows) or XDG_CACHE_HOME (Linux), then HOME environment variables + std::string cache_dir; + char* ptkcp = std::getenv("PYTORCH_KERNEL_CACHE_PATH"); + // Create kernel_cache_dir if needed as we do not want to create the base directory passed by the user + std::string kernels_cache_dir = ""; + if (ptkcp != nullptr) { + cache_dir = std::string(ptkcp); + } else { +#ifdef _WIN32 + ptkcp = std::getenv("TEMP"); +#else + // USES XDG_CACHE_HOME if it's set + ptkcp = std::getenv("XDG_CACHE_HOME"); +#endif + if (ptkcp != nullptr) { + kernels_cache_dir = "/torch/kernels"; + cache_dir = std::string(ptkcp) + kernels_cache_dir; + } else { + // Falls back to HOME/.cache + ptkcp = std::getenv("HOME"); + if (ptkcp == nullptr) { + TORCH_WARN_ONCE("No PYTORCH_KERNEL_CACHE_PATH or HOME environment variable set!", + " This disables kernel caching."); + return {}; + } else { + kernels_cache_dir = "/.cache/torch/kernels"; + cache_dir = std::string(ptkcp) + kernels_cache_dir; + } + } + } + + // Creates the cache directory if it does not exist + const char* p_cache_dir = cache_dir.c_str(); + const bool cache_dir_exists = (access(p_cache_dir, F_OK) == 0); + if (!cache_dir_exists) { + std::string s_ptkcp = std::string(ptkcp); + if (!r_mkdir_with_base(s_ptkcp, kernels_cache_dir)) { + TORCH_WARN_ONCE("Specified kernel cache directory could not be created! 
This disables kernel caching.", + " Specified directory is ", cache_dir, ".", + " This warning will appear only once per process."); + return {}; + } + } + + // Checks that the cache directory is readable and writable + const bool cache_dir_readable = (access(p_cache_dir, R_OK) == 0); + if (!cache_dir_readable) { + TORCH_WARN_ONCE("Specified kernel cache directory is not readable! This disables kernel caching.", + " Specified directory is ", cache_dir, ".", + " This warning will appear only once per process."); + return {}; + } + + const bool cache_dir_writable = (access(p_cache_dir, W_OK) == 0); + if (!cache_dir_writable) { + TORCH_WARN_ONCE("Specified kernel cache directory is not writable! This disables kernel caching.", + " Specified directory is ", cache_dir, ".", + " This warning will appear only once per process."); + return {}; + } + + return cache_dir; +} + +// Compiles the kernel, or acquires if from the cache if caching +hiprtcFunction jit_pwise_function( + const std::string& code, + const std::string& kernel_name) { + initializeZoomContext(); + // Acquires CUDA and nvrtc versions and whether we're compiling to ptx or SASS + const hipDeviceProp_t* prop = at::zoom::getCurrentDeviceProperties(); + int hip_major = 0, hip_minor = 0, hiprtc_major = 0, hiprtc_minor = 0; + bool compile_to_sass = false; + at::zoom::jit::codegenOutputQuery( + prop, hip_major, hip_minor, hiprtc_major, hiprtc_minor, compile_to_sass); + + // Objects used whether loading from the cache or jit compiling + const auto& hiprtc = at::globalContext().getHIPRTC(); + hiprtcFunction compiled_kernel_; + std::string name = kernel_name + "_kernel"; + + static const std::optional cache_dir = get_cache_dir(); + + std::string file_path; + if (cache_dir.has_value()) { + printf("Attempting to read from kernel cache...\n"); + // Attemps to read from the cache. + // Cubin name is _arch._nvrtc.___ + // Note that the SHA1 hash used in the file name is NOT the SHA1 hash of the file's contents, + // because we hash on the CUDA code, but we save the compiled ptx or sass + + // Acquires SHA1 hash + c10::sha1 sha1_hash{code}; + const auto hash_code = sha1_hash.str(); + + // Constructs file path by appending constructed cubin name to cache path + std::stringstream ss; + ss << *cache_dir << "/"; + ss << kernel_name; + ss << "_arch" << prop->gcnArchName; + ss << "_hiprtc" << hiprtc_major << "." << hiprtc_minor; + ss << (compile_to_sass ? 
"_sass" : "_ptx"); + ss << "_" << code.length(); + ss << "_" << hash_code; + file_path = ss.str(); + + std::ifstream readin{file_path, std::ios::in | std::ifstream::binary}; + if (readin.fail()) { + // NOTE: this does not warn because the file might not exist + // TODO: consider if this should explicitly check for the file's existence or not to throw + // an informative warning + readin.close(); + } else { + printf("loading module from cache\n"); + // TODO: try passing the "mapped" file directly to cuModuleLoadCall instead of using an intermediate buffer + std::vector buffer(std::istreambuf_iterator(readin), {}); + HIP_DRIVER_CHECK(hiprtc.hipModuleLoadData(&(compiled_kernel_.module), buffer.data())); + printf("funcload\n"); + HIP_DRIVER_CHECK( + hiprtc.hipModuleGetFunction(&(compiled_kernel_.function), compiled_kernel_.module, name.c_str())); + readin.close(); + printf("finmodload\n"); + return compiled_kernel_; + } + } + + // Just-in-time compiles the program + + // Creates the NVRTC program + hiprtcProgram program; + ZOOM_HIPRTC_CHECK(hiprtc.hiprtcCreateProgram( + &program, code.c_str(), nullptr, 0, nullptr, nullptr)); + + std::vector args = {"--std=c++17", "-ggdb", "-O0"}; + + #undef NDEBUG + #ifndef NDEBUG + // Add line info to generated kernels + args.push_back("-lineinfo"); + #else + // Avoid excessive register usage from assertion + args.push_back("-DNDEBUG"); + #endif + + const auto compilation_result = + hiprtc.hiprtcCompileProgram(program, args.size(), args.data()); + + // Throws an error on compilation failure + if (compilation_result != HIPRTC_SUCCESS) { + size_t logsize; + ZOOM_HIPRTC_CHECK(hiprtc.hiprtcGetProgramLogSize(program, &logsize)); + std::string log(logsize, '\0'); + ZOOM_HIPRTC_CHECK(hiprtc.hiprtcGetProgramLog(program, &log[0])); + throw std::runtime_error(code + log); + } + + size_t ptx_size = 0; + std::vector ptx; + + const auto getSize = hiprtc.hiprtcGetCodeSize; + const auto getFunc = hiprtc.hiprtcGetCode; + + + ZOOM_HIPRTC_CHECK(getSize(program, &ptx_size)); + ptx.resize(ptx_size); + ZOOM_HIPRTC_CHECK(getFunc(program, ptx.data())); + + printf("modload2\n"); + HIP_DRIVER_CHECK(hiprtc.hipModuleLoadData(&(compiled_kernel_.module), ptx.data())); + printf("funcload2\n"); + HIP_DRIVER_CHECK( + hiprtc.hipModuleGetFunction(&(compiled_kernel_.function), compiled_kernel_.module, name.c_str())); + // TODO: use guards to avoid leaking + printf("flend\n"); + ZOOM_HIPRTC_CHECK(hiprtc.hiprtcDestroyProgram(&program)); + + if (cache_dir.has_value()) { + // Writes the program to the cache if caching + // NOTE: Actually writes to a per-process temporary file to avoid multi-process contention. + // The temporary file is then renamed to the actual file. + // If the actual file already exists then the rename may fail or replace the actual file, + // the behavior is implementation-specific. + // Files replaced through this process should remain extant if they are being read because + // of UNIX filesystem properties, but this behavior is unverified and may require + // additional review in the future. + // TODO: In C++17 we should be able to use the filesystem header. 
+ const auto pid = getpid(); + std::stringstream tmp_file_path_ss; + tmp_file_path_ss << file_path << "_tmp_" << pid; + const std::string tmp_file_path = tmp_file_path_ss.str(); + std::ofstream hipbin(tmp_file_path, std::ios::out | std::ofstream::binary); + if (hipbin.fail()) { + TORCH_WARN_ONCE("Failed to write temporarily kernel cache file!", + " File path was ", tmp_file_path, ".", + " This warning will only appear once per process."); + } else { + std::copy(ptx.begin(), ptx.end(), std::ostreambuf_iterator(hipbin)); + if (std::rename(tmp_file_path.c_str(), file_path.c_str()) != 0) { + // Removes tmp file if the rename failed + std::remove(tmp_file_path.c_str()); + } + } + hipbin.close(); + } + + return compiled_kernel_; +} + +// TODO: may need/want to initialize CUDA context here (refactor into nvrtc call) +void launch_jitted_pwise_function( + hiprtcFunction function, + void* args[], + const dim3 nBlocks, + const dim3 kBlockSize, + const int smem) { + initializeZoomContext(); + const auto& hiprtc = at::globalContext().getHIPRTC(); + // Launches kernel on current stream + auto stream = c10::zoom::getCurrentZoomStream(); + stream.synchronize(); + HIP_DRIVER_CHECK(hiprtc.hipModuleLaunchKernel( + function.function, + nBlocks.x, + nBlocks.y, + nBlocks.z, + kBlockSize.x, + kBlockSize.y, + kBlockSize.z, + smem, + stream, + args, + nullptr)); +} + +} // at::zoom::jit \ No newline at end of file diff --git a/aten/src/ATen/zoom/jit/jit_utils.h b/aten/src/ATen/zoom/jit/jit_utils.h new file mode 100644 index 00000000000000..1115906144c724 --- /dev/null +++ b/aten/src/ATen/zoom/jit/jit_utils.h @@ -0,0 +1,230 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +namespace at { namespace zoom { namespace jit { + +enum class BinaryFuncVariant {NoScalar, RhsScalar, LhsScalar}; + +struct hiprtcFunction { + hipModule_t module = hipModule_t(); + hipFunction_t function = nullptr; +}; + +struct KernelDescriptor { + std::string name; + std::string f; + c10::ScalarType f_inputs_type; + c10::ScalarType result_type; + c10::SmallVector extra_args_types; + int nInputs, nOutputs; +}; + +// Helper function to return a vector +// corresponding to the type of the arguments in parameter pack. +template +c10::SmallVector get_extra_args_types() { + return {c10::CppTypeToScalarType::value ...}; +} + +template < + typename result_type, + typename f_inputs_type, + typename... 
ExtraArgs> +KernelDescriptor make_kernel_descriptor( + std::string name, + std::string f, + int nInputs, + int nOutputs) { + KernelDescriptor ret; + ret.name = std::move(name); + ret.f = std::move(f); + ret.f_inputs_type = c10::CppTypeToScalarType::value; + ret.result_type = c10::CppTypeToScalarType::value; + ret.extra_args_types = get_extra_args_types(); + ret.nInputs = nInputs; + ret.nOutputs = nOutputs; + return ret; +} + +inline int can_vectorize_up_to(size_t default_alignment, void *pointer) { + auto ip = reinterpret_cast(pointer); + if (ip % (4 * default_alignment) == 0) { + return 4; + } + if (ip % (2 * default_alignment) == 0) { + return 2; + } + return 1; +} + +inline int can_vectorize_up_to(const KernelDescriptor &desc, c10::ArrayRef pointers) { + TORCH_INTERNAL_ASSERT(desc.nOutputs == 1); + TORCH_INTERNAL_ASSERT(static_cast(pointers.size()) == 1 + desc.nInputs); + + // Deals with output + auto result_size = c10::scalarTypeToTypeMeta(desc.result_type).itemsize(); + int result = can_vectorize_up_to(result_size, pointers[0]); + + // Incorporates input(s) + auto input_size = c10::scalarTypeToTypeMeta(desc.f_inputs_type).itemsize(); + for (auto i : c10::irange(1, pointers.size())) { + result = std::min(result, can_vectorize_up_to(input_size, pointers[i])); + } + + return result; +} + +std::string generate_code( + int nInputs, + int nOutputs, + const std::string& func, + const std::string& name, + const std::string& f_input_type, + const std::string& compute_type, + const std::string& result_type, + bool contiguous, + bool dynamic_casting, + BinaryFuncVariant scalar_pos, + c10::SmallVector& extra_args_typenames, + bool vectorized=false, + int vec_size=0, + bool return_by_ref=false); + +std::string generate_code( + const KernelDescriptor &desc, + bool contiguous, + bool dynamic_casting, + BinaryFuncVariant scalar_pos, + bool vectorized=false, + int vec_size=0, + bool return_by_ref=false); + +std::string zoom_generate_code( + const KernelDescriptor &desc, + bool dynamic_casting = false); + +std::string zoom_generate_code( + int nInputs, + int nOutputs, + const std::string& func_, + const std::string& name, + const std::string& f_inputs_type, + const std::string& compute_type, + const std::string& result_type, + bool dynamic_casting, + c10::SmallVector& extra_args_typenames); + +std::string generate_reduction_code( + int nOutputs, + const std::string& func, + const std::string& name, + const int vt0, + const std::string& f_inputs_type, + const std::string& reduction_accum_type, + const std::string& result_type, + bool contiguous, + bool vectorized, + int vec_size, + int max_threads_codegen); + +std::string generate_reduction_code( + const KernelDescriptor &desc, + const int vt0, + bool contiguous, + bool vectorized, + int vec_size, + int max_threads_codegen); + +hiprtcFunction jit_pwise_function( + const std::string& code, + const std::string& kernel_name); + +void launch_jitted_pwise_function( + hiprtcFunction function, + void* args[], + const dim3 nBlocks, + const dim3 kBlockSize, + const int smem=0); + +template +struct delayed_false : std::false_type { +}; + +// Defines type names +// NOTE: General case is instantiated only for invalid types. +// All the valid types have specialization using the TYPE_NAME_FN +// macro below. 
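+// For example, typeName<float>() returns "float" through the TYPE_NAME_FN specializations
+// below, the complex specializations return the "std::complex<T>" spelling that the JIT
+// expects, and instantiating typeName<T>() for an unsupported T trips the delayed_false
+// static_assert above instead of silently returning "void".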
+template +inline std::string typeName() { + // we can't use static_assert(false) directly as the + // program will be not compiled even if the template is not + // instantiated, so we use `delayed_false` + // to make sure compiler doesn't eagerly raise + // fail this assertion. + static_assert(delayed_false::value, "invalid type for jiterator"); + return "void"; +} + +#define TYPE_NAME_FN(ctype, name) \ +template <> inline std::string typeName(){ \ + return std::string(#ctype); \ +} + +AT_FORALL_SCALAR_TYPES(TYPE_NAME_FN) +#undef TYPE_NAME_FN +// JIT uses std::complex directly, because nvRTC compile programs +// with -default-device, so there is no such issue like: +// "std::sin(complex) is __host__ only" +template <> inline std::string typeName(){ + return "bool"; +} +template <> inline std::string typeName>(){ + return "std::complex"; +} +template <> inline std::string typeName>(){ + return "std::complex"; +} +template <> inline std::string typeName>(){ + return "std::complex"; +} +template <> inline std::string typeName(){ + return "at::Half"; +} +template <> inline std::string typeName(){ + return "at::BFloat16"; +} +template <> inline std::string typeName(){ + return "at::Float8_e5m2"; +} +template <> inline std::string typeName(){ + return "at::Float8_e4m3fn"; +} +template <> inline std::string typeName() { + return "at::Float8_e5m2fnuz"; +} +template <> inline std::string typeName() { + return "at::Float8_e4m3fnuz"; +} + +#define TYPE_NAME_CASE(ctype, scalartype) \ + case ScalarType::scalartype: return typeName(); +inline std::string typeName(ScalarType t) { + switch (t) { + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(TYPE_NAME_CASE) + default: + TORCH_CHECK(false, "invalid type for jiterator"); + } +} +#undef TYPE_NAME_CASE + +TORCH_ZOOM_API void initializeZoomContext(); + +}}} // namespace at::zoom::jit \ No newline at end of file diff --git a/aten/src/ATen/zoom/jit/llvm_jit_strings.cpp b/aten/src/ATen/zoom/jit/llvm_jit_strings.cpp new file mode 100644 index 00000000000000..4e8a4ddacce065 --- /dev/null +++ b/aten/src/ATen/zoom/jit/llvm_jit_strings.cpp @@ -0,0 +1,1444 @@ +// This is copy-pasted (with modification) from the following llvm file: +// - https://github.com/llvm/llvm-project/blob/main/libcxx/include/complex +// +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + + +namespace at::zoom { + +// copy-pasted from some llvm files: +// - https://github.com/llvm/llvm-project/blob/main/libcxx/include/type_traits +// - https://github.com/llvm/llvm-project/blob/main/clang/test/Headers/Inputs/include/type_traits + +// hiprtc already includes some traits, so this removes duplicate definitions of +// integral_constant, is_same, is_integral, enable_if, is_floating_point, is_arithmetic. +// Copied from aten/src/ATen/cuda/llvm_basic.cpp, then modified as above. 
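+// These strings are never compiled into libtorch itself; they are spliced verbatim into the
+// source handed to hiprtc by the jiterator (see jit_utils.cpp), e.g. roughly:
+//   std::string src = get_traits_string() + get_complex_body_string() + functor_src;
+//   auto fn = at::zoom::jit::jit_pwise_function(src, "my_kernel");  // illustrative only
+// so anything defined here must be self-contained and acceptable to the device compiler.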
+const std::string traits = R"ESCAPE( +namespace std { + +template +_Tp&& __declval(int); +template +_Tp __declval(long); +template +decltype(__declval<_Tp>(0)) declval() noexcept; + +template struct remove_const {typedef _Tp type;}; +template struct remove_const {typedef _Tp type;}; +template using remove_const_t = typename remove_const<_Tp>::type; + +template struct remove_volatile {typedef _Tp type;}; +template struct remove_volatile {typedef _Tp type;}; +template using remove_volatile_t = typename remove_volatile<_Tp>::type; + +template struct remove_cv +{typedef typename remove_volatile::type>::type type;}; +template using remove_cv_t = typename remove_cv<_Tp>::type; + +template struct __libcpp_is_floating_point : public false_type {}; +template <> struct __libcpp_is_floating_point : public true_type {}; +template <> struct __libcpp_is_floating_point : public true_type {}; +template <> struct __libcpp_is_floating_point : public true_type {}; + +template +inline constexpr bool is_arithmetic_v = is_arithmetic<_Tp>::value; + +template +struct __numeric_type +{ + static void __test(...); + static float __test(float); + static double __test(char); + static double __test(int); + static double __test(unsigned); + static double __test(long); + static double __test(unsigned long); + static double __test(long long); + static double __test(unsigned long long); + static double __test(double); + static long double __test(long double); + + typedef decltype(__test(declval<_Tp>())) type; + static const bool value = !is_same::value; +}; + +template <> +struct __numeric_type +{ + static const bool value = true; +}; + +// __promote + +template ::value && + __numeric_type<_A2>::value && + __numeric_type<_A3>::value> +class __promote_imp +{ +public: + static const bool value = false; +}; + +template +class __promote_imp<_A1, _A2, _A3, true> +{ +private: + typedef typename __promote_imp<_A1>::type __type1; + typedef typename __promote_imp<_A2>::type __type2; + typedef typename __promote_imp<_A3>::type __type3; +public: + typedef decltype(__type1() + __type2() + __type3()) type; + static const bool value = true; +}; + +template +class __promote_imp<_A1, _A2, void, true> +{ +private: + typedef typename __promote_imp<_A1>::type __type1; + typedef typename __promote_imp<_A2>::type __type2; +public: + typedef decltype(__type1() + __type2()) type; + static const bool value = true; +}; + +template +class __promote_imp<_A1, void, void, true> +{ +public: + typedef typename __numeric_type<_A1>::type type; + static const bool value = true; +}; + +template +class __promote : public __promote_imp<_A1, _A2, _A3> {}; + +} // namespace std +)ESCAPE"; + +const std::string &get_traits_string() { + return traits; +} + +// This is copy-pasted from the following llvm file: +// - https://github.com/llvm/llvm-project/blob/main/libcxx/include/cmath +const std::string cmath = R"ESCAPE( + +namespace std { + +using ::signbit; +using ::isfinite; +using ::isinf; +using ::isnan; + +using ::abs; + +using ::acos; +using ::acosf; +using ::asin; +using ::asinf; +using ::atan; +using ::atanf; +using ::atan2; +using ::atan2f; +using ::ceil; +using ::ceilf; +using ::cos; +using ::cosf; +using ::cosh; +using ::coshf; + +using ::exp; +using ::expf; + +using ::fabs; +using ::fabsf; +using ::floor; +using ::floorf; + +using ::fmod; +using ::fmodf; + +using ::frexp; +using ::frexpf; +using ::ldexp; +using ::ldexpf; + +using ::log; +using ::logf; + +using ::log10; +using ::log10f; +using ::modf; +using ::modff; + +using ::pow; +using ::powf; + 
+using ::sin; +using ::sinf; +using ::sinh; +using ::sinhf; + +using ::sqrt; +using ::sqrtf; +using ::tan; +using ::tanf; + +using ::tanh; +using ::tanhf; + +using ::acosh; +using ::acoshf; +using ::asinh; +using ::asinhf; +using ::atanh; +using ::atanhf; +using ::cbrt; +using ::cbrtf; + +using ::copysign; +using ::copysignf; + +using ::erf; +using ::erff; +using ::erfc; +using ::erfcf; +using ::exp2; +using ::exp2f; +using ::expm1; +using ::expm1f; +using ::fdim; +using ::fdimf; +using ::fmaf; +using ::fma; +using ::fmax; +using ::fmaxf; +using ::fmin; +using ::fminf; +using ::hypot; +using ::hypotf; +using ::ilogb; +using ::ilogbf; +using ::lgamma; +using ::lgammaf; +using ::llrint; +using ::llrintf; +using ::llround; +using ::llroundf; +using ::log1p; +using ::log1pf; +using ::log2; +using ::log2f; +using ::logb; +using ::logbf; +using ::lrint; +using ::lrintf; +using ::lround; +using ::lroundf; + +using ::nan; +using ::nanf; + +using ::nearbyint; +using ::nearbyintf; +using ::nextafter; +using ::nextafterf; +using ::remainder; +using ::remainderf; +using ::remquo; +using ::remquof; +using ::rint; +using ::rintf; +using ::round; +using ::roundf; +using ::scalbln; +using ::scalblnf; +using ::scalbn; +using ::scalbnf; +using ::tgamma; +using ::tgammaf; +using ::trunc; +using ::truncf; + +} // namespace std + +)ESCAPE"; + +const std::string &get_cmath_string() { + return cmath; +} + + +const std::string complex_body = R"ESCAPE( + +namespace std { + +template class complex; + +template complex<_Tp> operator*(const complex<_Tp>& __z, const complex<_Tp>& __w); +template complex<_Tp> operator/(const complex<_Tp>& __x, const complex<_Tp>& __y); + +template +class complex +{ +public: + typedef _Tp value_type; +private: + value_type __re_; + value_type __im_; +public: + constexpr + complex(const value_type& __re = value_type(), const value_type& __im = value_type()) + : __re_(__re), __im_(__im) {} + template constexpr + complex(const complex<_Xp>& __c) + : __re_(__c.real()), __im_(__c.imag()) {} + + constexpr value_type real() const {return __re_;} + constexpr value_type imag() const {return __im_;} + + void real(value_type __re) {__re_ = __re;} + void imag(value_type __im) {__im_ = __im;} + + constexpr operator bool() const { + return real() || imag(); + } + + complex& operator= (const value_type& __re) + {__re_ = __re; __im_ = value_type(); return *this;} + complex& operator+=(const value_type& __re) {__re_ += __re; return *this;} + complex& operator-=(const value_type& __re) {__re_ -= __re; return *this;} + complex& operator*=(const value_type& __re) {__re_ *= __re; __im_ *= __re; return *this;} + complex& operator/=(const value_type& __re) {__re_ /= __re; __im_ /= __re; return *this;} + + template complex& operator= (const complex<_Xp>& __c) + { + __re_ = __c.real(); + __im_ = __c.imag(); + return *this; + } + template complex& operator+=(const complex<_Xp>& __c) + { + __re_ += __c.real(); + __im_ += __c.imag(); + return *this; + } + template complex& operator-=(const complex<_Xp>& __c) + { + __re_ -= __c.real(); + __im_ -= __c.imag(); + return *this; + } + template complex& operator*=(const complex<_Xp>& __c) + { + *this = *this * complex(__c.real(), __c.imag()); + return *this; + } + template complex& operator/=(const complex<_Xp>& __c) + { + *this = *this / complex(__c.real(), __c.imag()); + return *this; + } +}; + +template<> class complex; + +template<> +class complex +{ + float __re_; + float __im_; +public: + typedef float value_type; + + constexpr complex(float __re = 0.0f, float __im 
= 0.0f) + : __re_(__re), __im_(__im) {} + + explicit constexpr complex(const complex& __c); + + constexpr float real() const {return __re_;} + constexpr float imag() const {return __im_;} + + void real(value_type __re) {__re_ = __re;} + void imag(value_type __im) {__im_ = __im;} + + constexpr operator bool() const { + return real() || imag(); + } + + complex& operator= (float __re) + {__re_ = __re; __im_ = value_type(); return *this;} + complex& operator+=(float __re) {__re_ += __re; return *this;} + complex& operator-=(float __re) {__re_ -= __re; return *this;} + complex& operator*=(float __re) {__re_ *= __re; __im_ *= __re; return *this;} + complex& operator/=(float __re) {__re_ /= __re; __im_ /= __re; return *this;} + + template complex& operator= (const complex<_Xp>& __c) + { + __re_ = __c.real(); + __im_ = __c.imag(); + return *this; + } + template complex& operator+=(const complex<_Xp>& __c) + { + __re_ += __c.real(); + __im_ += __c.imag(); + return *this; + } + template complex& operator-=(const complex<_Xp>& __c) + { + __re_ -= __c.real(); + __im_ -= __c.imag(); + return *this; + } + template complex& operator*=(const complex<_Xp>& __c) + { + *this = *this * complex(__c.real(), __c.imag()); + return *this; + } + template complex& operator/=(const complex<_Xp>& __c) + { + *this = *this / complex(__c.real(), __c.imag()); + return *this; + } +}; + +template<> +class complex +{ + double __re_; + double __im_; +public: + typedef double value_type; + + constexpr complex(double __re = 0.0, double __im = 0.0) + : __re_(__re), __im_(__im) {} + + constexpr complex(const complex& __c); + + constexpr double real() const {return __re_;} + constexpr double imag() const {return __im_;} + + void real(value_type __re) {__re_ = __re;} + void imag(value_type __im) {__im_ = __im;} + + constexpr operator bool() const { + return real() || imag(); + } + + complex& operator= (double __re) + {__re_ = __re; __im_ = value_type(); return *this;} + complex& operator+=(double __re) {__re_ += __re; return *this;} + complex& operator-=(double __re) {__re_ -= __re; return *this;} + complex& operator*=(double __re) {__re_ *= __re; __im_ *= __re; return *this;} + complex& operator/=(double __re) {__re_ /= __re; __im_ /= __re; return *this;} + + template complex& operator= (const complex<_Xp>& __c) + { + __re_ = __c.real(); + __im_ = __c.imag(); + return *this; + } + template complex& operator+=(const complex<_Xp>& __c) + { + __re_ += __c.real(); + __im_ += __c.imag(); + return *this; + } + template complex& operator-=(const complex<_Xp>& __c) + { + __re_ -= __c.real(); + __im_ -= __c.imag(); + return *this; + } + template complex& operator*=(const complex<_Xp>& __c) + { + *this = *this * complex(__c.real(), __c.imag()); + return *this; + } + template complex& operator/=(const complex<_Xp>& __c) + { + *this = *this / complex(__c.real(), __c.imag()); + return *this; + } +}; + +inline +constexpr +complex::complex(const complex& __c) + : __re_(__c.real()), __im_(__c.imag()) {} + +inline +constexpr +complex::complex(const complex& __c) + : __re_(__c.real()), __im_(__c.imag()) {} + + +// 26.3.6 operators: + +template +inline +complex<_Tp> +operator+(const complex<_Tp>& __x, const complex<_Tp>& __y) +{ + complex<_Tp> __t(__x); + __t += __y; + return __t; +} + +template +inline +complex<_Tp> +operator+(const complex<_Tp>& __x, const _Tp& __y) +{ + complex<_Tp> __t(__x); + __t += __y; + return __t; +} + +template +inline +complex<_Tp> +operator+(const _Tp& __x, const complex<_Tp>& __y) +{ + complex<_Tp> __t(__y); + __t += 
__x; + return __t; +} + +template +inline +complex<_Tp> +operator-(const complex<_Tp>& __x, const complex<_Tp>& __y) +{ + complex<_Tp> __t(__x); + __t -= __y; + return __t; +} + +template +inline +complex<_Tp> +operator-(const complex<_Tp>& __x, const _Tp& __y) +{ + complex<_Tp> __t(__x); + __t -= __y; + return __t; +} + +template +inline +complex<_Tp> +operator-(const _Tp& __x, const complex<_Tp>& __y) +{ + complex<_Tp> __t(-__y); + __t += __x; + return __t; +} + +template +complex<_Tp> +operator*(const complex<_Tp>& __z, const complex<_Tp>& __w) +{ + _Tp __a = __z.real(); + _Tp __b = __z.imag(); + _Tp __c = __w.real(); + _Tp __d = __w.imag(); + _Tp __ac = __a * __c; + _Tp __bd = __b * __d; + _Tp __ad = __a * __d; + _Tp __bc = __b * __c; + _Tp __x = __ac - __bd; + _Tp __y = __ad + __bc; + if (isnan(__x) && isnan(__y)) + { + bool __recalc = false; + if (isinf(__a) || isinf(__b)) + { + __a = copysign(isinf(__a) ? _Tp(1) : _Tp(0), __a); + __b = copysign(isinf(__b) ? _Tp(1) : _Tp(0), __b); + if (isnan(__c)) + __c = copysign(_Tp(0), __c); + if (isnan(__d)) + __d = copysign(_Tp(0), __d); + __recalc = true; + } + if (isinf(__c) || isinf(__d)) + { + __c = copysign(isinf(__c) ? _Tp(1) : _Tp(0), __c); + __d = copysign(isinf(__d) ? _Tp(1) : _Tp(0), __d); + if (isnan(__a)) + __a = copysign(_Tp(0), __a); + if (isnan(__b)) + __b = copysign(_Tp(0), __b); + __recalc = true; + } + if (!__recalc && (isinf(__ac) || isinf(__bd) || + isinf(__ad) || isinf(__bc))) + { + if (isnan(__a)) + __a = copysign(_Tp(0), __a); + if (isnan(__b)) + __b = copysign(_Tp(0), __b); + if (isnan(__c)) + __c = copysign(_Tp(0), __c); + if (isnan(__d)) + __d = copysign(_Tp(0), __d); + __recalc = true; + } + if (__recalc) + { + __x = _Tp(INFINITY) * (__a * __c - __b * __d); + __y = _Tp(INFINITY) * (__a * __d + __b * __c); + } + } + return complex<_Tp>(__x, __y); +} + +template +inline +complex<_Tp> +operator*(const complex<_Tp>& __x, const _Tp& __y) +{ + complex<_Tp> __t(__x); + __t *= __y; + return __t; +} + +template +inline +complex<_Tp> +operator*(const _Tp& __x, const complex<_Tp>& __y) +{ + complex<_Tp> __t(__y); + __t *= __x; + return __t; +} + +template +complex<_Tp> +operator/(const complex<_Tp>& __z, const complex<_Tp>& __w) +{ + int __ilogbw = 0; + _Tp __a = __z.real(); + _Tp __b = __z.imag(); + _Tp __c = __w.real(); + _Tp __d = __w.imag(); + _Tp __logbw = logb(fmax(fabs(__c), fabs(__d))); + if (isfinite(__logbw)) + { + __ilogbw = static_cast(__logbw); + __c = scalbn(__c, -__ilogbw); + __d = scalbn(__d, -__ilogbw); + } + _Tp __denom = __c * __c + __d * __d; + _Tp __x = scalbn((__a * __c + __b * __d) / __denom, -__ilogbw); + _Tp __y = scalbn((__b * __c - __a * __d) / __denom, -__ilogbw); + if (isnan(__x) && isnan(__y)) + { + if ((__denom == _Tp(0)) && (!isnan(__a) || !isnan(__b))) + { + __x = copysign(_Tp(INFINITY), __c) * __a; + __y = copysign(_Tp(INFINITY), __c) * __b; + } + else if ((isinf(__a) || isinf(__b)) && isfinite(__c) && isfinite(__d)) + { + __a = copysign(isinf(__a) ? _Tp(1) : _Tp(0), __a); + __b = copysign(isinf(__b) ? _Tp(1) : _Tp(0), __b); + __x = _Tp(INFINITY) * (__a * __c + __b * __d); + __y = _Tp(INFINITY) * (__b * __c - __a * __d); + } + else if (isinf(__logbw) && __logbw > _Tp(0) && isfinite(__a) && isfinite(__b)) + { + __c = copysign(isinf(__c) ? _Tp(1) : _Tp(0), __c); + __d = copysign(isinf(__d) ? 
_Tp(1) : _Tp(0), __d); + __x = _Tp(0) * (__a * __c + __b * __d); + __y = _Tp(0) * (__b * __c - __a * __d); + } + } + return complex<_Tp>(__x, __y); +} + +template +inline +complex<_Tp> +operator/(const complex<_Tp>& __x, const _Tp& __y) +{ + return complex<_Tp>(__x.real() / __y, __x.imag() / __y); +} + +template +inline +complex<_Tp> +operator/(const _Tp& __x, const complex<_Tp>& __y) +{ + complex<_Tp> __t(__x); + __t /= __y; + return __t; +} + +template +inline +complex<_Tp> +operator+(const complex<_Tp>& __x) +{ + return __x; +} + +template +inline +complex<_Tp> +operator-(const complex<_Tp>& __x) +{ + return complex<_Tp>(-__x.real(), -__x.imag()); +} + +template +inline constexpr +bool +operator==(const complex<_Tp>& __x, const complex<_Tp>& __y) +{ + return __x.real() == __y.real() && __x.imag() == __y.imag(); +} + +template +inline constexpr +bool +operator==(const complex<_Tp>& __x, const _Tp& __y) +{ + return __x.real() == __y && __x.imag() == 0; +} + +template +inline constexpr +bool +operator==(const _Tp& __x, const complex<_Tp>& __y) +{ + return __x == __y.real() && 0 == __y.imag(); +} + +template +inline constexpr +bool +operator!=(const complex<_Tp>& __x, const complex<_Tp>& __y) +{ + return !(__x == __y); +} + +template +inline constexpr +bool +operator!=(const complex<_Tp>& __x, const _Tp& __y) +{ + return !(__x == __y); +} + +template +inline constexpr +bool +operator!=(const _Tp& __x, const complex<_Tp>& __y) +{ + return !(__x == __y); +} + +template +inline constexpr +bool +operator&&(const complex<_Tp>& __x, const complex<_Tp>& __y) +{ + return bool(__x) && bool(__y); +} + +template +inline constexpr +bool +isnan(const complex<_Tp>& __x) +{ + return isnan(__x.real()) || isnan(__x.imag()); +} + +template +inline constexpr +bool +operator||(const complex<_Tp>& __x, const complex<_Tp>& __y) +{ + return bool(__x) || bool(__y); +} + +// 26.3.7 values: + +template ::value, + bool = is_floating_point<_Tp>::value + > +struct __libcpp_complex_overload_traits {}; + +// Integral Types +template +struct __libcpp_complex_overload_traits<_Tp, true, false> +{ + typedef double _ValueType; + typedef complex _ComplexType; +}; + +// Floating point types +template +struct __libcpp_complex_overload_traits<_Tp, false, true> +{ + typedef _Tp _ValueType; + typedef complex<_Tp> _ComplexType; +}; + +// real + +template +inline constexpr +_Tp +real(const complex<_Tp>& __c) +{ + return __c.real(); +} + +template +inline constexpr +typename __libcpp_complex_overload_traits<_Tp>::_ValueType +real(_Tp __re) +{ + return __re; +} + +// imag + +template +inline constexpr +_Tp +imag(const complex<_Tp>& __c) +{ + return __c.imag(); +} + +template +inline constexpr +typename __libcpp_complex_overload_traits<_Tp>::_ValueType +imag(_Tp) +{ + return 0; +} + +// abs + +template +inline +_Tp +abs(const complex<_Tp>& __c) +{ + return hypot(__c.real(), __c.imag()); +} + +// arg + +template +inline +_Tp +arg(const complex<_Tp>& __c) +{ + return atan2(__c.imag(), __c.real()); +} + +template +inline +typename enable_if +< + is_integral<_Tp>::value || is_same<_Tp, double>::value, + double +>::type +arg(_Tp __re) +{ + return atan2(0., __re); +} + +template +inline +typename enable_if< + is_same<_Tp, float>::value, + float +>::type +arg(_Tp __re) +{ + return atan2f(0.F, __re); +} + +} + +)ESCAPE"; + +const std::string complex_half_body = R"ESCAPE( +namespace std { +template <> +struct alignas(2) complex { + at::Half real_; + at::Half imag_; + + // Constructors + complex() = default; + + // implicit casting to and from 
`complex`. + // NOTE: computation of `complex` will occur in `complex` + __host__ __device__ inline complex(const std::complex& value) + : real_(value.real()), imag_(value.imag()) {} + + inline __host__ __device__ operator std::complex() const { + return {real_, imag_}; + } + + at::Half real() const {return real_;} + at::Half imag() const {return imag_;} + +}; +} +)ESCAPE"; + + +const std::string &get_complex_body_string() { + return complex_body; +} + +const std::string &get_complex_half_body_string() { + return complex_half_body; +} + +const std::string complex_math = R"ESCAPE( + +namespace std { + +// norm + +template +inline +_Tp +norm(const complex<_Tp>& __c) +{ + if (isinf(__c.real())) + return abs(__c.real()); + if (isinf(__c.imag())) + return abs(__c.imag()); + return __c.real() * __c.real() + __c.imag() * __c.imag(); +} + +template +inline +typename __libcpp_complex_overload_traits<_Tp>::_ValueType +norm(_Tp __re) +{ + typedef typename __libcpp_complex_overload_traits<_Tp>::_ValueType _ValueType; + return static_cast<_ValueType>(__re) * __re; +} + +// conj + +template +inline +complex<_Tp> +conj(const complex<_Tp>& __c) +{ + return complex<_Tp>(__c.real(), -__c.imag()); +} + +template +inline +typename __libcpp_complex_overload_traits<_Tp>::_ComplexType +conj(_Tp __re) +{ + typedef typename __libcpp_complex_overload_traits<_Tp>::_ComplexType _ComplexType; + return _ComplexType(__re); +} + + + +// proj + +template +inline +complex<_Tp> +proj(const complex<_Tp>& __c) +{ + complex<_Tp> __r = __c; + if (isinf(__c.real()) || isinf(__c.imag())) + __r = complex<_Tp>(INFINITY, copysign(_Tp(0), __c.imag())); + return __r; +} + +template +inline +typename enable_if +< + is_floating_point<_Tp>::value, + typename __libcpp_complex_overload_traits<_Tp>::_ComplexType +>::type +proj(_Tp __re) +{ + if (isinf(__re)) + __re = abs(__re); + return complex<_Tp>(__re); +} + +template +inline +typename enable_if +< + is_integral<_Tp>::value, + typename __libcpp_complex_overload_traits<_Tp>::_ComplexType +>::type +proj(_Tp __re) +{ + typedef typename __libcpp_complex_overload_traits<_Tp>::_ComplexType _ComplexType; + return _ComplexType(__re); +} + +// polar + +template +complex<_Tp> +polar(const _Tp& __rho, const _Tp& __theta = _Tp()) +{ + if (isnan(__rho) || signbit(__rho)) + return complex<_Tp>(_Tp(NAN), _Tp(NAN)); + if (isnan(__theta)) + { + if (isinf(__rho)) + return complex<_Tp>(__rho, __theta); + return complex<_Tp>(__theta, __theta); + } + if (isinf(__theta)) + { + if (isinf(__rho)) + return complex<_Tp>(__rho, _Tp(NAN)); + return complex<_Tp>(_Tp(NAN), _Tp(NAN)); + } + _Tp __x = __rho * cos(__theta); + if (isnan(__x)) + __x = 0; + _Tp __y = __rho * sin(__theta); + if (isnan(__y)) + __y = 0; + return complex<_Tp>(__x, __y); +} + +// log + +template +inline +complex<_Tp> +log(const complex<_Tp>& __x) +{ + return complex<_Tp>(log(abs(__x)), arg(__x)); +} + +// log10 + +template +inline +complex<_Tp> +log10(const complex<_Tp>& __x) +{ + return log(__x) / log(_Tp(10)); +} + +// log2 + +template +inline +complex<_Tp> +log2(const complex<_Tp>& __x) +{ + return log(__x) / log(_Tp(2)); +} + +// sqrt + +template +complex<_Tp> +sqrt(const complex<_Tp>& __x) +{ + if (isinf(__x.imag())) + return complex<_Tp>(_Tp(INFINITY), __x.imag()); + if (isinf(__x.real())) + { + if (__x.real() > _Tp(0)) + return complex<_Tp>(__x.real(), isnan(__x.imag()) ? __x.imag() : copysign(_Tp(0), __x.imag())); + return complex<_Tp>(isnan(__x.imag()) ? 
__x.imag() : _Tp(0), copysign(__x.real(), __x.imag())); + } + return polar(sqrt(abs(__x)), arg(__x) / _Tp(2)); +} + +// exp + +template +complex<_Tp> +exp(const complex<_Tp>& __x) +{ + _Tp __i = __x.imag(); + if (__i == 0) { + return complex<_Tp>(exp(__x.real()), copysign(_Tp(0), __x.imag())); + } + if (isinf(__x.real())) + { + if (__x.real() < _Tp(0)) + { + if (!isfinite(__i)) + __i = _Tp(1); + } + else if (__i == 0 || !isfinite(__i)) + { + if (isinf(__i)) + __i = _Tp(NAN); + return complex<_Tp>(__x.real(), __i); + } + } + _Tp __e = exp(__x.real()); + return complex<_Tp>(__e * cos(__i), __e * sin(__i)); +} + +// pow + +template +inline +complex<_Tp> +pow(const complex<_Tp>& __x, const complex<_Tp>& __y) +{ + return exp(__y * log(__x)); +} + +template +inline +complex::type> +pow(const complex<_Tp>& __x, const complex<_Up>& __y) +{ + typedef complex::type> result_type; + return std::pow(result_type(__x), result_type(__y)); +} + +template +inline +typename enable_if +< + is_arithmetic<_Up>::value, + complex::type> +>::type +pow(const complex<_Tp>& __x, const _Up& __y) +{ + typedef complex::type> result_type; + return std::pow(result_type(__x), result_type(__y)); +} + +template +inline +typename enable_if +< + is_arithmetic<_Tp>::value, + complex::type> +>::type +pow(const _Tp& __x, const complex<_Up>& __y) +{ + typedef complex::type> result_type; + return std::pow(result_type(__x), result_type(__y)); +} + +// __sqr, computes pow(x, 2) + +template +inline +complex<_Tp> +__sqr(const complex<_Tp>& __x) +{ + return complex<_Tp>((__x.real() - __x.imag()) * (__x.real() + __x.imag()), + _Tp(2) * __x.real() * __x.imag()); +} + +// asinh + +template +complex<_Tp> +asinh(const complex<_Tp>& __x) +{ + const _Tp __pi(atan2(+0., -0.)); + if (isinf(__x.real())) + { + if (isnan(__x.imag())) + return __x; + if (isinf(__x.imag())) + return complex<_Tp>(__x.real(), copysign(__pi * _Tp(0.25), __x.imag())); + return complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag())); + } + if (isnan(__x.real())) + { + if (isinf(__x.imag())) + return complex<_Tp>(__x.imag(), __x.real()); + if (__x.imag() == 0) + return __x; + return complex<_Tp>(__x.real(), __x.real()); + } + if (isinf(__x.imag())) + return complex<_Tp>(copysign(__x.imag(), __x.real()), copysign(__pi/_Tp(2), __x.imag())); + complex<_Tp> __z = log(__x + sqrt(__sqr(__x) + _Tp(1))); + return complex<_Tp>(copysign(__z.real(), __x.real()), copysign(__z.imag(), __x.imag())); +} + +// acosh + +template +complex<_Tp> +acosh(const complex<_Tp>& __x) +{ + const _Tp __pi(atan2(+0., -0.)); + if (isinf(__x.real())) + { + if (isnan(__x.imag())) + return complex<_Tp>(abs(__x.real()), __x.imag()); + if (isinf(__x.imag())) + { + if (__x.real() > 0) + return complex<_Tp>(__x.real(), copysign(__pi * _Tp(0.25), __x.imag())); + else + return complex<_Tp>(-__x.real(), copysign(__pi * _Tp(0.75), __x.imag())); + } + if (__x.real() < 0) + return complex<_Tp>(-__x.real(), copysign(__pi, __x.imag())); + return complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag())); + } + if (isnan(__x.real())) + { + if (isinf(__x.imag())) + return complex<_Tp>(abs(__x.imag()), __x.real()); + return complex<_Tp>(__x.real(), __x.real()); + } + if (isinf(__x.imag())) + return complex<_Tp>(abs(__x.imag()), copysign(__pi/_Tp(2), __x.imag())); + complex<_Tp> __z = log(__x + sqrt(__sqr(__x) - _Tp(1))); + return complex<_Tp>(copysign(__z.real(), _Tp(0)), copysign(__z.imag(), __x.imag())); +} + +// atanh + +template +complex<_Tp> +atanh(const complex<_Tp>& __x) +{ + const _Tp __pi(atan2(+0., -0.)); + if 
(isinf(__x.imag())) + { + return complex<_Tp>(copysign(_Tp(0), __x.real()), copysign(__pi/_Tp(2), __x.imag())); + } + if (isnan(__x.imag())) + { + if (isinf(__x.real()) || __x.real() == 0) + return complex<_Tp>(copysign(_Tp(0), __x.real()), __x.imag()); + return complex<_Tp>(__x.imag(), __x.imag()); + } + if (isnan(__x.real())) + { + return complex<_Tp>(__x.real(), __x.real()); + } + if (isinf(__x.real())) + { + return complex<_Tp>(copysign(_Tp(0), __x.real()), copysign(__pi/_Tp(2), __x.imag())); + } + if (abs(__x.real()) == _Tp(1) && __x.imag() == _Tp(0)) + { + return complex<_Tp>(copysign(_Tp(INFINITY), __x.real()), copysign(_Tp(0), __x.imag())); + } + complex<_Tp> __z = log((_Tp(1) + __x) / (_Tp(1) - __x)) / _Tp(2); + return complex<_Tp>(copysign(__z.real(), __x.real()), copysign(__z.imag(), __x.imag())); +} + +// sinh + +template +complex<_Tp> +sinh(const complex<_Tp>& __x) +{ + if (isinf(__x.real()) && !isfinite(__x.imag())) + return complex<_Tp>(__x.real(), _Tp(NAN)); + if (__x.real() == 0 && !isfinite(__x.imag())) + return complex<_Tp>(__x.real(), _Tp(NAN)); + if (__x.imag() == 0 && !isfinite(__x.real())) + return __x; + return complex<_Tp>(sinh(__x.real()) * cos(__x.imag()), cosh(__x.real()) * sin(__x.imag())); +} + +// cosh + +template +complex<_Tp> +cosh(const complex<_Tp>& __x) +{ + if (isinf(__x.real()) && !isfinite(__x.imag())) + return complex<_Tp>(abs(__x.real()), _Tp(NAN)); + if (__x.real() == 0 && !isfinite(__x.imag())) + return complex<_Tp>(_Tp(NAN), __x.real()); + if (__x.real() == 0 && __x.imag() == 0) + return complex<_Tp>(_Tp(1), __x.imag()); + if (__x.imag() == 0 && !isfinite(__x.real())) + return complex<_Tp>(abs(__x.real()), __x.imag()); + return complex<_Tp>(cosh(__x.real()) * cos(__x.imag()), sinh(__x.real()) * sin(__x.imag())); +} + +// tanh + +template +complex<_Tp> +tanh(const complex<_Tp>& __x) +{ + if (isinf(__x.real())) + { + if (!isfinite(__x.imag())) + return complex<_Tp>(copysign(_Tp(1), __x.real()), _Tp(0)); + return complex<_Tp>(copysign(_Tp(1), __x.real()), copysign(_Tp(0), sin(_Tp(2) * __x.imag()))); + } + if (isnan(__x.real()) && __x.imag() == 0) + return __x; + _Tp __2r(_Tp(2) * __x.real()); + _Tp __2i(_Tp(2) * __x.imag()); + _Tp __d(cosh(__2r) + cos(__2i)); + _Tp __2rsh(sinh(__2r)); + if (isinf(__2rsh) && isinf(__d)) + return complex<_Tp>(__2rsh > _Tp(0) ? _Tp(1) : _Tp(-1), + __2i > _Tp(0) ? _Tp(0) : _Tp(-0.)); + return complex<_Tp>(__2rsh/__d, sin(__2i)/__d); +} + +// asin + +template +complex<_Tp> +asin(const complex<_Tp>& __x) +{ + complex<_Tp> __z = asinh(complex<_Tp>(-__x.imag(), __x.real())); + return complex<_Tp>(__z.imag(), -__z.real()); +} + +// acos + +template +complex<_Tp> +acos(const complex<_Tp>& __x) +{ + const _Tp __pi(atan2(+0., -0.)); + if (isinf(__x.real())) + { + if (isnan(__x.imag())) + return complex<_Tp>(__x.imag(), __x.real()); + if (isinf(__x.imag())) + { + if (__x.real() < _Tp(0)) + return complex<_Tp>(_Tp(0.75) * __pi, -__x.imag()); + return complex<_Tp>(_Tp(0.25) * __pi, -__x.imag()); + } + if (__x.real() < _Tp(0)) + return complex<_Tp>(__pi, signbit(__x.imag()) ? -__x.real() : __x.real()); + return complex<_Tp>(_Tp(0), signbit(__x.imag()) ? 
__x.real() : -__x.real()); + } + if (isnan(__x.real())) + { + if (isinf(__x.imag())) + return complex<_Tp>(__x.real(), -__x.imag()); + return complex<_Tp>(__x.real(), __x.real()); + } + if (isinf(__x.imag())) + return complex<_Tp>(__pi/_Tp(2), -__x.imag()); + if (__x.real() == 0 && (__x.imag() == 0 || isnan(__x.imag()))) + return complex<_Tp>(__pi/_Tp(2), -__x.imag()); + complex<_Tp> __z = log(__x + sqrt(__sqr(__x) - _Tp(1))); + if (signbit(__x.imag())) + return complex<_Tp>(abs(__z.imag()), abs(__z.real())); + return complex<_Tp>(abs(__z.imag()), -abs(__z.real())); +} + +// atan + +template +complex<_Tp> +atan(const complex<_Tp>& __x) +{ + complex<_Tp> __z = atanh(complex<_Tp>(-__x.imag(), __x.real())); + return complex<_Tp>(__z.imag(), -__z.real()); +} + +// sin + +template +complex<_Tp> +sin(const complex<_Tp>& __x) +{ + complex<_Tp> __z = sinh(complex<_Tp>(-__x.imag(), __x.real())); + return complex<_Tp>(__z.imag(), -__z.real()); +} + +// cos + +template +inline +complex<_Tp> +cos(const complex<_Tp>& __x) +{ + return cosh(complex<_Tp>(-__x.imag(), __x.real())); +} + +// tan + +template +complex<_Tp> +tan(const complex<_Tp>& __x) +{ + complex<_Tp> __z = tanh(complex<_Tp>(-__x.imag(), __x.real())); + return complex<_Tp>(__z.imag(), -__z.real()); +} + +// Literal suffix for complex number literals [complex.literals] +inline namespace literals +{ + inline namespace complex_literals + { + constexpr complex operator""i(long double __im) + { + return { 0.0, static_cast(__im) }; + } + + constexpr complex operator""i(unsigned long long __im) + { + return { 0.0, static_cast(__im) }; + } + + + constexpr complex operator""if(long double __im) + { + return { 0.0f, static_cast(__im) }; + } + + constexpr complex operator""if(unsigned long long __im) + { + return { 0.0f, static_cast(__im) }; + } + } // namespace complex_literals +} // namespace literals + +} // namespace std + +)ESCAPE"; + +const std::string &get_complex_math_string() { + return complex_math; +} + +} // namespace at::zoom \ No newline at end of file diff --git a/aten/src/ATen/zoom/jit/llvm_jit_strings.h b/aten/src/ATen/zoom/jit/llvm_jit_strings.h new file mode 100644 index 00000000000000..3d71ff866c47fa --- /dev/null +++ b/aten/src/ATen/zoom/jit/llvm_jit_strings.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include + +namespace at::zoom { + +TORCH_ZOOM_API const std::string &get_traits_string(); +TORCH_ZOOM_API const std::string &get_cmath_string(); +TORCH_ZOOM_API const std::string &get_complex_body_string(); +TORCH_ZOOM_API const std::string &get_complex_half_body_string(); +TORCH_ZOOM_API const std::string &get_complex_math_string(); + +} // namespace at::zoom \ No newline at end of file diff --git a/aten/src/ATen/zoom/jit/macros.h b/aten/src/ATen/zoom/jit/macros.h new file mode 100644 index 00000000000000..2402b0a0d52ba5 --- /dev/null +++ b/aten/src/ATen/zoom/jit/macros.h @@ -0,0 +1,4 @@ +#include + +#define AT_USE_JITERATOR() true +#define jiterator_stringify(...) std::string(#__VA_ARGS__); \ No newline at end of file diff --git a/aten/src/ATen/zoom/jit/thread_constants.h b/aten/src/ATen/zoom/jit/thread_constants.h new file mode 100644 index 00000000000000..0df30f8d5a45e8 --- /dev/null +++ b/aten/src/ATen/zoom/jit/thread_constants.h @@ -0,0 +1,16 @@ +#pragma once +#include + +// Marks a lambda as executable on both the host and device. The __host__ +// attribute is important so that we can access static type information from +// the host, even if the function is typically only executed on the device. 
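+// Illustrative use: a functor declared as
+//   auto op = [] GPU_LAMBDA (float a, float b) { return a + b; };
+// can have decltype(op(1.f, 1.f)) inspected in host code, even though the lambda body is
+// only ever executed inside a kernel.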
+#ifndef GPU_LAMBDA +#define GPU_LAMBDA __host__ __device__ +#endif + +constexpr int num_threads() { + return 256; +} + +constexpr int thread_work_size() { return 4; } +constexpr int block_work_size() { return thread_work_size() * num_threads(); } \ No newline at end of file diff --git a/build_variables.bzl b/build_variables.bzl index 3f16f9b847c1cc..6bd1898db6310b 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -773,6 +773,19 @@ libtorch_python_cuda_sources = libtorch_python_cuda_core_sources + [ "torch/csrc/cuda/Tensor.cpp", ] +libtorch_python_zoom_sources = [ + "torch/csrc/zoom/Module.cpp", + "torch/csrc/zoom/Event.cpp", + "torch/csrc/zoom/python_comm.cpp", + "torch/csrc/zoom/Stream.cpp", + "torch/csrc/zoom/Graph.cpp", + "torch/csrc/zoom/utils.cpp", + "torch/csrc/zoom/ZoomPluggableAllocator.cpp", + "torch/csrc/zoom/comm.cpp", + "torch/csrc/zoom/memory_snapshot.cpp", + "torch/csrc/zoom/shared/hiprt.cpp", +] + libtorch_python_xpu_sources = [ "torch/csrc/xpu/Event.cpp", "torch/csrc/xpu/Module.cpp", @@ -952,6 +965,7 @@ def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"): aten_cpu_non_globed_sources = [ "aten/src/ATen/detail/CUDAHooksInterface.cpp", "aten/src/ATen/detail/HIPHooksInterface.cpp", + "aten/src/ATen/detail/ZoomHooksInterface.cpp", "aten/src/ATen/detail/MPSHooksInterface.cpp", "aten/src/ATen/detail/MAIAHooksInterface.cpp", "aten/src/ATen/detail/PrivateUse1HooksInterface.cpp", @@ -970,6 +984,7 @@ aten_cpu_non_globed_headers = [ "aten/src/ATen/detail/CUDAHooksInterface.h", "aten/src/ATen/detail/MPSHooksInterface.h", "aten/src/ATen/detail/HIPHooksInterface.h", + "aten/src/ATen/detail/ZoomHooksInterface.h", "aten/src/ATen/detail/MAIAHooksInterface.h", "aten/src/ATen/detail/PrivateUse1HooksInterface.h", "aten/src/ATen/detail/XPUHooksInterface.h", diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt index 1f742f4c17683d..c8f74102099a87 100644 --- a/c10/CMakeLists.txt +++ b/c10/CMakeLists.txt @@ -140,6 +140,10 @@ if(USE_ROCM) add_subdirectory(hip) endif() +if(USE_ZOOM) + add_subdirectory(zoom) +endif() + if(USE_XPU) add_subdirectory(xpu) endif() diff --git a/c10/core/Allocator.cpp b/c10/core/Allocator.cpp index 491c85b081e885..e855f870a759b8 100644 --- a/c10/core/Allocator.cpp +++ b/c10/core/Allocator.cpp @@ -41,6 +41,17 @@ C10_API at::Allocator* allocator_array[at::COMPILE_TIME_MAX_DEVICE_TYPES]; // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) C10_API uint8_t allocator_priority[at::COMPILE_TIME_MAX_DEVICE_TYPES] = {0}; +/* + (Arham) This holds functor that enables getting the PU1 allocator from a function rather than statically registering + a pointer to a static global variable, which is useful when we want to create a global allocator that is thread safe + (e.g. using std::atomic). 
See the usage below in GetAllocator and REGISTER_PU1_ALLOCATOR in Allocator.h +*/ +C10_API at::Allocator* (*getPrivateUse1Allocator)() = nullptr; + +void SetPrivateUse1GetAllocator(at::Allocator* (*getAllocatorFunc)()) { + getPrivateUse1Allocator = getAllocatorFunc; +} + void SetAllocator(at::DeviceType t, at::Allocator* alloc, uint8_t priority) { if (priority >= allocator_priority[static_cast(t)]) { allocator_array[static_cast(t)] = alloc; @@ -49,6 +60,10 @@ void SetAllocator(at::DeviceType t, at::Allocator* alloc, uint8_t priority) { } at::Allocator* GetAllocator(const at::DeviceType& t) { + // if registered, use the functor registration for the PU1 allocator, else use the traditional static registration + if(t == DeviceType::PrivateUse1 && getPrivateUse1Allocator != nullptr) { + return getPrivateUse1Allocator(); + } auto* alloc = allocator_array[static_cast(t)]; TORCH_INTERNAL_ASSERT_DEBUG_ONLY(alloc, "Allocator for ", t, " is not set."); return alloc; diff --git a/c10/core/Allocator.h b/c10/core/Allocator.h index 412412557a0d11..936c19469af905 100644 --- a/c10/core/Allocator.h +++ b/c10/core/Allocator.h @@ -255,6 +255,15 @@ struct C10_API InefficientStdFunctionContext { C10_API void SetAllocator(DeviceType t, Allocator* alloc, uint8_t priority = 0); C10_API Allocator* GetAllocator(const DeviceType& t); +// set a functor that can retrieve the PrivateUse1 Allocator at will +C10_API void SetPrivateUse1GetAllocator(at::Allocator* (*getAllocatorFunc)()); + +struct PrivateUse1AllocatorRegisterer { + explicit PrivateUse1AllocatorRegisterer(at::Allocator* (*getAllocatorFunc)()) { + SetPrivateUse1GetAllocator(getAllocatorFunc); + } +}; + template struct AllocatorRegisterer { explicit AllocatorRegisterer(Allocator* alloc) { @@ -267,6 +276,11 @@ struct AllocatorRegisterer { static c10::AllocatorRegisterer g_allocator_d(f); \ } +#define REGISTER_PU1_ALLOCATOR(f) \ + namespace { \ + static PrivateUse1AllocatorRegisterer g_allocator_d(f); \ + } + // An interface for reporting thread local memory usage // per device struct C10_API MemoryReportingInfoBase : public c10::DebugInfoBase { diff --git a/c10/macros/Export.h b/c10/macros/Export.h index cb68060ed8129d..84438c6eead37d 100644 --- a/c10/macros/Export.h +++ b/c10/macros/Export.h @@ -140,8 +140,10 @@ #if defined(TORCH_HIP_BUILD_MAIN_LIB) #define TORCH_HIP_API C10_EXPORT +#define TORCH_ZOOM_API C10_EXPORT #else #define TORCH_HIP_API C10_IMPORT +#define TORCH_ZOOM_API C10_EXPORT #endif #if defined(TORCH_XPU_BUILD_MAIN_LIB) diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index f28e526a0431a9..a704e55142f52e 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -310,7 +310,7 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; #define C10_HIP_HOST_DEVICE #endif -#if defined(USE_ROCM) +#if defined(USE_ROCM) || defined(USE_ZOOM) #define C10_WARP_SIZE warpSize // = 64 or 32 (Defined in hip_runtime.h) #else #define C10_WARP_SIZE 32 diff --git a/c10/util/generic_math.h b/c10/util/generic_math.h index adfdbfd9955c00..579aee2e2d83e0 100644 --- a/c10/util/generic_math.h +++ b/c10/util/generic_math.h @@ -8,7 +8,11 @@ #include #define C10_COMPAT_COPYSIGN c10::cuda::compat::copysign #elif defined(__HIPCC__) -#include + #ifdef USE_ZOOM + #include + #else + #include + #endif #define C10_COMPAT_COPYSIGN c10::hip::compat::copysign #else #include diff --git a/c10/zoom/CMakeLists.txt b/c10/zoom/CMakeLists.txt new file mode 100644 index 00000000000000..f055a8d824cddf --- /dev/null +++ b/c10/zoom/CMakeLists.txt @@ -0,0 +1,60 @@ 
+include(../../cmake/public/utils.cmake) + +# ---[ Configure macro file. +set(C10_ZOOM_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) # used in cmake_macros.h.in +# configure_file( +# ${CMAKE_CURRENT_LIST_DIR}/impl/hip_cmake_macros.h.in +# ${CMAKE_BINARY_DIR}/c10/hip/impl/hip_cmake_macros.h) + +# NB: All previous cu files are renamed into cc files. This isn't tested at the +# moment. +file(GLOB C10_ZOOM_SRCS + *.cpp + *.cu + impl/*.cpp + impl/*.cu + ) + +# Mark the cc files as HIP files, so we call the compiler. (They have to be +# suffixed with cc, because the hcc compiler won't accept them otherwise.) +file(GLOB __c10_zoom_srcs_cpp *.cu impl/*.cu) +set_source_files_properties(${__c10_zoom_srcs_cpp} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) +set_source_files_properties(${C10_ZOOM_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + +file(GLOB_RECURSE C10_ZOOM_HEADERS *.h) +hip_add_library(c10_zoom ${C10_ZOOM_SRCS} ${C10_ZOOM_HEADERS}) + +# Propagate HIP_CXX_FLAGS that were set from Dependencies.cmake +target_compile_options(c10_zoom PRIVATE ${HIP_CXX_FLAGS}) + +# caffe2_hip adds a bunch of dependencies like rocsparse, but c10/hip is supposed to be +# minimal. I'm not sure if we need hip_hcc or not; for now leave it out + +# If building shared library, set dllimport/dllexport proper. +target_compile_options(c10_zoom PRIVATE "-DC10_ZOOM_BUILD_MAIN_LIB") +# Enable hidden visibility if compiler supports it. +if(${COMPILER_SUPPORTS_HIDDEN_VISIBILITY}) + target_compile_options(c10_zoom PRIVATE "-fvisibility=hidden") +endif() + +# ---[ Dependency of c10_zoom +target_link_libraries(c10_zoom PUBLIC c10) + +target_link_libraries(c10_zoom PUBLIC ${PYTORCH_HIP_LIBRARIES}) + +target_include_directories( + c10_zoom PUBLIC + $ + $ + $ + $) + +# add_subdirectory(test) + +# ---[ Installation +install(TARGETS c10_zoom EXPORT Caffe2Targets DESTINATION lib) +install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR} + DESTINATION include + FILES_MATCHING PATTERN "*.h") +# install(FILES ${CMAKE_BINARY_DIR}/c10/hip/impl/hip_cmake_macros.h +# DESTINATION include/c10/hip/impl) diff --git a/c10/zoom/HIPGraphsC10Utils.h b/c10/zoom/HIPGraphsC10Utils.h new file mode 100644 index 00000000000000..9e423df8bd0250 --- /dev/null +++ b/c10/zoom/HIPGraphsC10Utils.h @@ -0,0 +1,77 @@ +#pragma once + +#include +#include +#include + +// CUDA Graphs utils used by c10 and aten. +// aten/cuda/CUDAGraphsUtils.cuh adds utils used by aten only. + +namespace c10::zoom { + +using CaptureId_t = unsigned long long; + +// first is set if the instance is created by CUDAGraph::capture_begin. +// second is set if the instance is created by at::zoom::graph_pool_handle. +using MempoolId_t = std::pair; + +// RAII guard for "hipStreamCaptureMode", a thread-local value +// that controls the error-checking strictness of a capture. +struct ZoomStreamCaptureModeGuard { + ZoomStreamCaptureModeGuard(hipStreamCaptureMode desired) + : strictness_(desired) { + C10_ZOOM_CHECK(hipThreadExchangeStreamCaptureMode(&strictness_)); + } + ~ZoomStreamCaptureModeGuard() { + C10_ZOOM_CHECK_WARN(hipThreadExchangeStreamCaptureMode(&strictness_)); + } + + private: + hipStreamCaptureMode strictness_; +}; + +// Protects against enum hipStreamCaptureStatus implementation changes. +// Some compilers seem not to like static_assert without the messages. 
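+// Typical (assumed) usage of ZoomStreamCaptureModeGuard above: relax capture checking around
+// work that is known to be capture-safe, e.g.
+//   {
+//     ZoomStreamCaptureModeGuard guard(hipStreamCaptureModeRelaxed);
+//     // ... calls that would otherwise invalidate a stricter ongoing capture ...
+//   }
+// The previous thread-local capture mode is restored when the guard goes out of scope.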
+static_assert( + int(hipStreamCaptureStatus::hipStreamCaptureStatusNone) == 0, + "unexpected int(hipStreamCaptureStatusNone) value"); +static_assert( + int(hipStreamCaptureStatus::hipStreamCaptureStatusActive) == 1, + "unexpected int(hipStreamCaptureStatusActive) value"); +static_assert( + int(hipStreamCaptureStatus::hipStreamCaptureStatusInvalidated) == 2, + "unexpected int(hipStreamCaptureStatusInvalidated) value"); + +enum class CaptureStatus : int { + None = int(hipStreamCaptureStatus::hipStreamCaptureStatusNone), + Active = int(hipStreamCaptureStatus::hipStreamCaptureStatusActive), + Invalidated = int(hipStreamCaptureStatus::hipStreamCaptureStatusInvalidated) +}; + +inline std::ostream& operator<<(std::ostream& os, CaptureStatus status) { + switch (status) { + case CaptureStatus::None: + os << "hipStreamCaptureStatusNone"; + break; + case CaptureStatus::Active: + os << "hipStreamCaptureStatusActive"; + break; + case CaptureStatus::Invalidated: + os << "hipStreamCaptureStatusInvalidated"; + break; + default: + TORCH_INTERNAL_ASSERT( + false, "Unknown HIP graph CaptureStatus", int(status)); + } + return os; +} + +// Use this version where you're sure a HIP context exists already. +inline CaptureStatus currentStreamCaptureStatusMayInitCtx() { + hipStreamCaptureStatus is_capturing{hipStreamCaptureStatusNone}; + C10_ZOOM_CHECK( + hipStreamIsCapturing(c10::zoom::getCurrentZoomStream(), &is_capturing)); + return CaptureStatus(is_capturing); +} + +} // namespace c10::zoom \ No newline at end of file diff --git a/c10/zoom/HIPMathCompat.h b/c10/zoom/HIPMathCompat.h new file mode 100644 index 00000000000000..12c08d2a8a13b4 --- /dev/null +++ b/c10/zoom/HIPMathCompat.h @@ -0,0 +1,152 @@ +#pragma once + +/* This file defines math functions compatible across different gpu + * platforms (currently CUDA and HIP). + */ +#if defined(__CUDACC__) || defined(__HIPCC__) + +#include +#include + +#ifdef __HIPCC__ +#define __MATH_FUNCTIONS_DECL__ inline C10_DEVICE +#else /* __HIPCC__ */ +#ifdef __CUDACC_RTC__ +#define __MATH_FUNCTIONS_DECL__ C10_HOST_DEVICE +#else /* __CUDACC_RTC__ */ +#define __MATH_FUNCTIONS_DECL__ static inline C10_HOST_DEVICE +#endif /* __CUDACC_RTC__ */ +#endif /* __HIPCC__ */ + +namespace c10::hip::compat { + +__MATH_FUNCTIONS_DECL__ float abs(float x) { + return ::fabsf(x); +} +__MATH_FUNCTIONS_DECL__ double abs(double x) { + return ::fabs(x); +} + +__MATH_FUNCTIONS_DECL__ float exp(float x) { + return ::expf(x); +} +__MATH_FUNCTIONS_DECL__ double exp(double x) { + return ::exp(x); +} + +__MATH_FUNCTIONS_DECL__ float ceil(float x) { + return ::ceilf(x); +} +__MATH_FUNCTIONS_DECL__ double ceil(double x) { + return ::ceil(x); +} + +__MATH_FUNCTIONS_DECL__ float copysign(float x, float y) { +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) + return ::copysignf(x, y); +#else + // std::copysign gets ICE/Segfaults with gcc 7.5/8 on arm64 + // (e.g. 
Jetson), see PyTorch PR #51834 + // This host function needs to be here for the compiler but is never used + TORCH_INTERNAL_ASSERT( + false, "HIPMathCompat copysign should not run on the CPU"); +#endif +} +__MATH_FUNCTIONS_DECL__ double copysign(double x, double y) { +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) + return ::copysign(x, y); +#else + // see above + TORCH_INTERNAL_ASSERT( + false, "HIPMathCompat copysign should not run on the CPU"); +#endif +} + +__MATH_FUNCTIONS_DECL__ float floor(float x) { + return ::floorf(x); +} +__MATH_FUNCTIONS_DECL__ double floor(double x) { + return ::floor(x); +} + +__MATH_FUNCTIONS_DECL__ float log(float x) { + return ::logf(x); +} +__MATH_FUNCTIONS_DECL__ double log(double x) { + return ::log(x); +} + +__MATH_FUNCTIONS_DECL__ float log1p(float x) { + return ::log1pf(x); +} + +__MATH_FUNCTIONS_DECL__ double log1p(double x) { + return ::log1p(x); +} + +__MATH_FUNCTIONS_DECL__ float max(float x, float y) { + return ::fmaxf(x, y); +} +__MATH_FUNCTIONS_DECL__ double max(double x, double y) { + return ::fmax(x, y); +} + +__MATH_FUNCTIONS_DECL__ float min(float x, float y) { + return ::fminf(x, y); +} +__MATH_FUNCTIONS_DECL__ double min(double x, double y) { + return ::fmin(x, y); +} + +__MATH_FUNCTIONS_DECL__ float pow(float x, float y) { + return ::powf(x, y); +} +__MATH_FUNCTIONS_DECL__ double pow(double x, double y) { + return ::pow(x, y); +} + +__MATH_FUNCTIONS_DECL__ void sincos(float x, float* sptr, float* cptr) { + return ::sincosf(x, sptr, cptr); +} +__MATH_FUNCTIONS_DECL__ void sincos(double x, double* sptr, double* cptr) { + return ::sincos(x, sptr, cptr); +} + +__MATH_FUNCTIONS_DECL__ float sqrt(float x) { + return ::sqrtf(x); +} +__MATH_FUNCTIONS_DECL__ double sqrt(double x) { + return ::sqrt(x); +} + +__MATH_FUNCTIONS_DECL__ float rsqrt(float x) { + return ::rsqrtf(x); +} +__MATH_FUNCTIONS_DECL__ double rsqrt(double x) { + return ::rsqrt(x); +} + +__MATH_FUNCTIONS_DECL__ float tan(float x) { + return ::tanf(x); +} +__MATH_FUNCTIONS_DECL__ double tan(double x) { + return ::tan(x); +} + +__MATH_FUNCTIONS_DECL__ float tanh(float x) { + return ::tanhf(x); +} +__MATH_FUNCTIONS_DECL__ double tanh(double x) { + return ::tanh(x); +} + +__MATH_FUNCTIONS_DECL__ float normcdf(float x) { + return ::normcdff(x); +} +__MATH_FUNCTIONS_DECL__ double normcdf(double x) { + return ::normcdf(x); +} + +} // namespace c10::hip::compat + +#endif \ No newline at end of file diff --git a/c10/zoom/ZoomAllocatorConfig.cpp b/c10/zoom/ZoomAllocatorConfig.cpp new file mode 100644 index 00000000000000..7ff6e6955e98c3 --- /dev/null +++ b/c10/zoom/ZoomAllocatorConfig.cpp @@ -0,0 +1,350 @@ +#include +#include +#include + +namespace c10::zoom::ZoomCachingAllocator { + +constexpr size_t kRoundUpPowerOfTwoIntervals = 16; + +ZoomAllocatorConfig::ZoomAllocatorConfig() + : m_max_split_size(std::numeric_limits::max()), + m_garbage_collection_threshold(0), + m_pinned_num_register_threads(1), + m_expandable_segments(false), + m_release_lock_on_hipMalloc(false), + m_pinned_use_zoom_host_register(false), + m_last_allocator_settings("") { + m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0); +} + +size_t ZoomAllocatorConfig::roundup_power2_divisions(size_t size) { + size_t log_size = (63 - llvm::countLeadingZeros(size)); + + // Our intervals start at 1MB and end at 64GB + const size_t interval_start = + 63 - llvm::countLeadingZeros(static_cast(1048576)); + const size_t interval_end = + 63 - llvm::countLeadingZeros(static_cast(68719476736)); + TORCH_CHECK( + 
(interval_end - interval_start == kRoundUpPowerOfTwoIntervals), + "kRoundUpPowerOfTwoIntervals mismatch"); + + int index = static_cast(log_size) - static_cast(interval_start); + + index = std::max(0, index); + index = std::min(index, static_cast(kRoundUpPowerOfTwoIntervals) - 1); + return instance().m_roundup_power2_divisions[index]; +} + +void ZoomAllocatorConfig::lexArgs( + const char* env, + std::vector& config) { + std::vector buf; + + size_t env_length = strlen(env); + for (size_t i = 0; i < env_length; i++) { + if (env[i] == ',' || env[i] == ':' || env[i] == '[' || env[i] == ']') { + if (!buf.empty()) { + config.emplace_back(buf.begin(), buf.end()); + buf.clear(); + } + config.emplace_back(1, env[i]); + } else if (env[i] != ' ') { + buf.emplace_back(static_cast(env[i])); + } + } + if (!buf.empty()) { + config.emplace_back(buf.begin(), buf.end()); + } +} + +void ZoomAllocatorConfig::consumeToken( + const std::vector& config, + size_t i, + const char c) { + TORCH_CHECK( + i < config.size() && config[i] == std::string(1, c), + "Error parsing CachingAllocator settings, expected ", + c, + ""); +} + +size_t ZoomAllocatorConfig::parseMaxSplitSize( + const std::vector& config, + size_t i) { + consumeToken(config, ++i, ':'); + constexpr int mb = 1024 * 1024; + if (++i < config.size()) { + size_t val1 = stoi(config[i]); + TORCH_CHECK( + val1 > kLargeBuffer / mb, + "CachingAllocator option max_split_size_mb too small, must be > ", + kLargeBuffer / mb, + ""); + val1 = std::max(val1, kLargeBuffer / mb); + val1 = std::min(val1, (std::numeric_limits::max() / mb)); + m_max_split_size = val1 * 1024 * 1024; + } else { + TORCH_CHECK(false, "Error, expecting max_split_size_mb value", ""); + } + return i; +} + +size_t ZoomAllocatorConfig::parseGarbageCollectionThreshold( + const std::vector& config, + size_t i) { + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + double val1 = stod(config[i]); + TORCH_CHECK( + val1 > 0, "garbage_collect_threshold too small, set it 0.0~1.0", ""); + TORCH_CHECK( + val1 < 1.0, "garbage_collect_threshold too big, set it 0.0~1.0", ""); + m_garbage_collection_threshold = val1; + } else { + TORCH_CHECK( + false, "Error, expecting garbage_collection_threshold value", ""); + } + return i; +} + +size_t ZoomAllocatorConfig::parseRoundUpPower2Divisions( + const std::vector& config, + size_t i) { + consumeToken(config, ++i, ':'); + bool first_value = true; + + if (++i < config.size()) { + if (std::string_view(config[i]) == "[") { + size_t last_index = 0; + while (++i < config.size() && std::string_view(config[i]) != "]") { + const std::string& val1 = config[i]; + size_t val2 = 0; + + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + val2 = stoi(config[i]); + } else { + TORCH_CHECK( + false, "Error parsing roundup_power2_divisions value", ""); + } + TORCH_CHECK( + llvm::isPowerOf2_64(val2), + "For roundups, the divisons has to be power of 2 ", + ""); + + if (std::string_view(val1) == ">") { + std::fill( + std::next( + m_roundup_power2_divisions.begin(), + static_cast::difference_type>( + last_index)), + m_roundup_power2_divisions.end(), + val2); + } else { + size_t val1_long = stoul(val1); + TORCH_CHECK( + llvm::isPowerOf2_64(val1_long), + "For roundups, the intervals have to be power of 2 ", + ""); + + size_t index = 63 - llvm::countLeadingZeros(val1_long); + index = std::max((size_t)0, index); + index = std::min(index, m_roundup_power2_divisions.size() - 1); + + if (first_value) { + std::fill( + m_roundup_power2_divisions.begin(), + std::next( + 
m_roundup_power2_divisions.begin(), + static_cast::difference_type>( + index)), + val2); + first_value = false; + } + if (index < m_roundup_power2_divisions.size()) { + m_roundup_power2_divisions[index] = val2; + } + last_index = index; + } + + if (std::string_view(config[i + 1]) != "]") { + consumeToken(config, ++i, ','); + } + } + } else { // Keep this for backwards compatibility + size_t val1 = stoi(config[i]); + TORCH_CHECK( + llvm::isPowerOf2_64(val1), + "For roundups, the divisons has to be power of 2 ", + ""); + std::fill( + m_roundup_power2_divisions.begin(), + m_roundup_power2_divisions.end(), + val1); + } + } else { + TORCH_CHECK(false, "Error, expecting roundup_power2_divisions value", ""); + } + return i; +} + +size_t ZoomAllocatorConfig::parseAllocatorConfig( + const std::vector& config, + size_t i, + bool& used_zoomMallocAsync) { + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + TORCH_CHECK( + ((config[i] == "native") || (config[i] == "zoomMallocAsync")), + "Unknown allocator backend, " + "options are native and zoomMallocAsync"); + + // HIP supports hipMallocAsync and does not need to check versions unlike CUDA + used_zoomMallocAsync = (config[i] == "zoomMallocAsync"); + + TORCH_INTERNAL_ASSERT( + config[i] == get()->name(), + "Allocator backend parsed at runtime != " + "allocator backend parsed at load time"); + } else { + TORCH_CHECK(false, "Error parsing backend value", ""); + } + return i; +} + +void ZoomAllocatorConfig::parseArgs(const char* env) { + // If empty, set the default values + m_max_split_size = std::numeric_limits::max(); + m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0); + m_garbage_collection_threshold = 0; + bool used_zoomMallocAsync = false; + bool used_native_specific_option = false; + + if (env == nullptr) { + return; + } + { + std::lock_guard lock(m_last_allocator_settings_mutex); + m_last_allocator_settings = env; + } + + std::vector config; + lexArgs(env, config); + + for (size_t i = 0; i < config.size(); i++) { + std::string_view config_item_view(config[i]); + if (config_item_view == "max_split_size_mb") { + i = parseMaxSplitSize(config, i); + used_native_specific_option = true; + } else if (config_item_view == "garbage_collection_threshold") { + i = parseGarbageCollectionThreshold(config, i); + used_native_specific_option = true; + } else if (config_item_view == "roundup_power2_divisions") { + i = parseRoundUpPower2Divisions(config, i); + used_native_specific_option = true; + } else if (config_item_view == "backend") { + i = parseAllocatorConfig(config, i, used_zoomMallocAsync); + } else if (config_item_view == "expandable_segments") { + used_native_specific_option = true; + consumeToken(config, ++i, ':'); + ++i; + TORCH_CHECK( + i < config.size() && + (std::string_view(config[i]) == "True" || + std::string_view(config[i]) == "False"), + "Expected a single True/False argument for expandable_segments"); + config_item_view = config[i]; + m_expandable_segments = (config_item_view == "True"); + } else if ( + // ROCm build's hipify step will change "cuda" to "hip", but for ease of + // use, accept both. We must break up the string to prevent hipify here. 
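+        // As an illustration (values arbitrary), both spellings select the
+        // same option handled just below:
+        //   PYTORCH_ZOOM_ALLOC_CONF="release_lock_on_hipMalloc:True"
+        //   PYTORCH_ZOOM_ALLOC_CONF="release_lock_on_cudamalloc:True"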
+ config_item_view == "release_lock_on_hipMalloc" || + config_item_view == + "release_lock_on_c" + "udamalloc") { + used_native_specific_option = true; + consumeToken(config, ++i, ':'); + ++i; + TORCH_CHECK( + i < config.size() && + (std::string_view(config[i]) == "True" || + std::string_view(config[i]) == "False"), + "Expected a single True/False argument for release_lock_on_hipMalloc"); + config_item_view = config[i]; + m_release_lock_on_hipMalloc = (config_item_view == "True"); + } else if ( + // ROCm build's hipify step will change "cuda" to "hip", but for ease of + // use, accept both. We must break up the string to prevent hipify here. + config_item_view == "pinned_use_hip_host_register" || + config_item_view == + "pinned_use_c" + "uda_host_register") { + i = parsePinnedUseZoomHostRegister(config, i); + used_native_specific_option = true; + } else if (config_item_view == "pinned_num_register_threads") { + i = parsePinnedNumRegisterThreads(config, i); + used_native_specific_option = true; + } else { + TORCH_CHECK( + false, "Unrecognized CachingAllocator option: ", config_item_view); + } + + if (i + 1 < config.size()) { + consumeToken(config, ++i, ','); + } + } + + if (used_zoomMallocAsync && used_native_specific_option) { + TORCH_WARN( + "backend:zoomMallocAsync ignores max_split_size_mb," + "roundup_power2_divisions, and garbage_collect_threshold."); + } +} + +size_t ZoomAllocatorConfig::parsePinnedUseZoomHostRegister( + const std::vector& config, + size_t i) { + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + TORCH_CHECK( + (config[i] == "True" || config[i] == "False"), + "Expected a single True/False argument for pinned_use_zoom_host_register"); + m_pinned_use_zoom_host_register = (config[i] == "True"); + } else { + TORCH_CHECK( + false, "Error, expecting pinned_use_zoom_host_register value", ""); + } + return i; +} + +size_t ZoomAllocatorConfig::parsePinnedNumRegisterThreads( + const std::vector& config, + size_t i) { + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + size_t val2 = stoi(config[i]); + TORCH_CHECK( + llvm::isPowerOf2_64(val2), + "Number of register threads has to be power of 2 ", + ""); + auto maxThreads = ZoomAllocatorConfig::pinned_max_register_threads(); + TORCH_CHECK( + val2 <= maxThreads, + "Number of register threads should be less than or equal to " + + std::to_string(maxThreads), + ""); + m_pinned_num_register_threads = val2; + } else { + TORCH_CHECK( + false, "Error, expecting pinned_num_register_threads value", ""); + } + return i; +} + +// General caching allocator utilities +void setAllocatorSettings(const std::string& env) { + ZoomCachingAllocator::ZoomAllocatorConfig::instance().parseArgs(env.c_str()); +} + +} // namespace c10::zoom::ZoomCachingAllocator \ No newline at end of file diff --git a/c10/zoom/ZoomAllocatorConfig.h b/c10/zoom/ZoomAllocatorConfig.h new file mode 100644 index 00000000000000..86a2d5a6e10c4c --- /dev/null +++ b/c10/zoom/ZoomAllocatorConfig.h @@ -0,0 +1,128 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +namespace c10::zoom::ZoomCachingAllocator { + +// Environment config parser +class ZoomAllocatorConfig { + public: + static size_t max_split_size() { + return instance().m_max_split_size; + } + static double garbage_collection_threshold() { + return instance().m_garbage_collection_threshold; + } + + static bool expandable_segments() { + // for now, we don't support expanable segments + if (instance().m_expandable_segments) { + 
TORCH_WARN_ONCE("expandable_segments not supported on this platform") + } + return false; +// #ifndef PYTORCH_C10_DRIVER_API_SUPPORTED +// if (instance().m_expandable_segments) { +// TORCH_WARN_ONCE("expandable_segments not supported on this platform") +// } +// return false; +// #else +// return instance().m_expandable_segments; +// #endif + } + + static bool release_lock_on_hipMalloc() { + return instance().m_release_lock_on_hipMalloc; + } + + /** Pinned memory allocator settings */ + static bool pinned_use_zoom_host_register() { + return instance().m_pinned_use_zoom_host_register; + } + + static size_t pinned_num_register_threads() { + return instance().m_pinned_num_register_threads; + } + + static size_t pinned_max_register_threads() { + // Based on the benchmark results, we see better allocation performance + // with 8 threads. However on future systems, we may need more threads + // and limiting this to 128 threads. + return 128; + } + + // This is used to round-up allocation size to nearest power of 2 divisions. + // More description below in function roundup_power2_next_division + // As ane example, if we want 4 divisions between 2's power, this can be done + // using env variable: PYTORCH_ZOOM_ALLOC_CONF=roundup_power2_divisions:4 + static size_t roundup_power2_divisions(size_t size); + + static std::vector roundup_power2_divisions() { + return instance().m_roundup_power2_divisions; + } + + static std::string last_allocator_settings() { + std::lock_guard lock( + instance().m_last_allocator_settings_mutex); + return instance().m_last_allocator_settings; + } + + static ZoomAllocatorConfig& instance() { + static ZoomAllocatorConfig* s_instance = ([]() { + auto inst = new ZoomAllocatorConfig(); + const char* env = getenv("PYTORCH_ZOOM_ALLOC_CONF"); + inst->parseArgs(env); + return inst; + })(); + return *s_instance; + } + + void parseArgs(const char* env); + + private: + ZoomAllocatorConfig(); + + static void lexArgs(const char* env, std::vector& config); + static void consumeToken( + const std::vector& config, + size_t i, + const char c); + size_t parseMaxSplitSize(const std::vector& config, size_t i); + size_t parseGarbageCollectionThreshold( + const std::vector& config, + size_t i); + size_t parseRoundUpPower2Divisions( + const std::vector& config, + size_t i); + size_t parseAllocatorConfig( + const std::vector& config, + size_t i, + bool& used_zoomMallocAsync); + size_t parsePinnedUseZoomHostRegister( + const std::vector& config, + size_t i); + size_t parsePinnedNumRegisterThreads( + const std::vector& config, + size_t i); + + std::atomic m_max_split_size; + std::vector m_roundup_power2_divisions; + std::atomic m_garbage_collection_threshold; + std::atomic m_pinned_num_register_threads; + std::atomic m_expandable_segments; + std::atomic m_release_lock_on_hipMalloc; + std::atomic m_pinned_use_zoom_host_register; + std::string m_last_allocator_settings; + std::mutex m_last_allocator_settings_mutex; +}; + +// General caching allocator utilities +void setAllocatorSettings(const std::string& env); + +} // namespace c10::zoom::ZoomCachingAllocator \ No newline at end of file diff --git a/c10/zoom/ZoomCachingAllocator.cpp b/c10/zoom/ZoomCachingAllocator.cpp new file mode 100644 index 00000000000000..c28541f862c3f7 --- /dev/null +++ b/c10/zoom/ZoomCachingAllocator.cpp @@ -0,0 +1,3104 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// #if !defined(USE_ROCM) && 
defined(PYTORCH_C10_DRIVER_API_SUPPORTED) +// #include +// #include +// #include +// #endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +TORCH_SDT_DEFINE_SEMAPHORE(malloc) +TORCH_SDT_DEFINE_SEMAPHORE(free) + +namespace c10 { + +C10_DEFINE_REGISTRY(FreeZoomMemoryCallbacksRegistry, FreeMemoryCallback); + +namespace zoom::ZoomCachingAllocator { + +// Included here as this is externally used in ZoomAllocatorConfig +const size_t kLargeBuffer = + 20971520; // "large" allocations may be packed in 20 MiB blocks + +namespace Native { + +// +// Yet another caching allocator for HIP device allocations. +// +// - Allocations are associated with a stream. Once freed, blocks can be +// re-allocated on the same stream, but not on any other stream. +// - The allocator attempts to find the smallest cached block that will fit the +// requested size. If the block is larger than the requested size, it may be +// split. If no block is found, the allocator will delegate to hipMalloc. +// - If the hipMalloc fails, the allocator will attempt to free one cached +// block of sufficient size that is not split and retry the allocation. +// If this also fails, the allocator will attempt to free all cached blocks +// that are not split and retry the allocation. +// - Large (>1MB) and small allocations are stored in separate pools. +// Small requests are packed into 2MB buffers. Large requests will use the +// smallest available free block or allocate a new block using hipMalloc. +// - To reduce fragmentation, requests between 1MB and 10MB will allocate and +// split a 20MB block, if no free block of sufficient size is available. +// - To further reduce fragmentation, blocks >= max_split_size are not allowed +// to be split. These oversize cached blocks will still satisfy requests +// within 1MB of the oversize cached block size. +// +// With this allocator, allocations and frees should logically be considered +// "usages" of the memory segment associated with streams, just like kernel +// launches. The programmer must insert the proper synchronization if memory +// segments are used from multiple streams. +// +// The library provides a recordStream() function to help insert the correct +// synchronization when allocations are used on multiple streams. This will +// ensure that the block is not reused before each recorded stream completes +// work. +// + +/** + * Note [Interaction with HIP graph capture] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * Graph capture performs a dry run of a region of execution, freezing all HIP + * work (and virtual addresses used during that work) into a "graph." The graph + * may be "replayed" like a single giant kernel, with greatly reduced CPU + * overhead as well as modestly improved GPU performance. + * + * Because capture bakes in memory addresses, the memory used during capture + * must be available for the graph to use during replay. DeviceCachingAllocator + * assigns and frees memory eagerly and dynamically, so if we're not careful + * about managing graphs' memory, at replay time those memory addresses could be + * used by other tensors. + * + * To guarantee a graph's baked in addresses are safe to reuse in replay, + * DeviceAllocator satisfies allocations from a graph-private memory pool during + * capture, and doesn't begin hipFreeing those addresses until the graph is + * destroyed. + * + * Within the private pool, allocations are freed and reassigned as usual during + * capture. 
Memory regions will be used in a consistent order during replay. So + * a private pool doesn't use memory more wastefully than the default pools + * during capture, but it does reserve its high-water mark of used memory away + * from the default pools as long as the capture(s) it served survive + * (regardless whether those captures are idle or replaying). + * + * CUDAGraph's requests for private pools are mediated by + * DeviceAllocator::notifyCaptureBegin, + * notifyCaptureAboutToEnd, + * notifyCaptureEnded, + * notifyCaptureDestroy. + */ + +constexpr size_t kMinBlockSize = + 512; // all sizes are rounded to at least 512 bytes +constexpr size_t kSmallSize = 1048576; // largest "small" allocation is 1 MiB +constexpr size_t kSmallBuffer = + 2097152; // "small" allocations are packed in 2 MiB blocks +constexpr size_t kMinLargeAlloc = + 10485760; // allocations between 1 and 10 MiB may use kLargeBuffer +constexpr size_t kRoundLarge = 2097152; // round up large allocations to 2 MiB + +namespace { + +using stream_set = ska::flat_hash_set; + +using StatTypes = std::array(StatType::NUM_TYPES)>; + +void increase_stat(Stat& stat, size_t amount) { + stat.current += static_cast(amount); + stat.peak = std::max(stat.current, stat.peak); + stat.allocated += static_cast(amount); +} + +void decrease_stat(Stat& stat, size_t amount) { + stat.current -= static_cast(amount); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + stat.current >= 0, + "Negative tracked stat in HIP allocator (likely logic error)."); + stat.freed += static_cast(amount); +} + +void reset_accumulated_stat(Stat& stat) { + stat.allocated = 0; + stat.freed = 0; +} + +void reset_peak_stat(Stat& stat) { + stat.peak = stat.current; +} + +template +void for_each_selected_stat_type(const StatTypes& stat_types, Func f) { + for (const auto stat_type : c10::irange(stat_types.size())) { + if (stat_types[stat_type]) { + f(stat_type); + } + } +} + +void decrease_stat_array( + StatArray& stat_array, + size_t amount, + const StatTypes& stat_types) { + for_each_selected_stat_type( + stat_types, [&stat_array, amount](size_t stat_type) { + decrease_stat(stat_array[stat_type], amount); + }); +} + +struct Block; +struct PrivatePool; +typedef bool (*Comparison)(const Block*, const Block*); +static bool BlockComparatorSize(const Block* a, const Block* b); +static bool BlockComparatorAddress(const Block* a, const Block* b); + +struct BlockPool { + BlockPool(bool small, PrivatePool* private_pool = nullptr) + : blocks(BlockComparatorSize), + unmapped(BlockComparatorAddress), + is_small(small), + owner_PrivatePool(private_pool) {} + + // Do not insert a Block to blocks directly; use insert_into_blocks(), + // instead. + std::set blocks; + std::set unmapped; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) + const bool is_small; + PrivatePool* owner_PrivatePool; + int64_t get_free_blocks_call_count{0}; + + // Add a Block into blocks set with updating gc counter. + std::pair::iterator, bool> insert_into_blocks( + Block* block); +}; + +struct ExpandableSegment; + +struct Block { + c10::DeviceIndex device; // gpu + hipStream_t stream; // allocation stream + stream_set stream_uses; // streams on which the block was used + size_t size; // block size in bytes + size_t requested_size; // memory originally requested + BlockPool* pool{nullptr}; // owning memory pool + void* ptr{nullptr}; // memory address + bool allocated{false}; // in-use flag + bool mapped{true}; // is the virtual address range this Block references + // backed by physical pages. 
Always true when + // expandable_segment_ is null. When false + // This Block will be aligned to the segment size + // of its expandable_segment_. + Block* prev{nullptr}; // prev block if split from a larger allocation + Block* next{nullptr}; // next block if split from a larger allocation + int event_count{0}; // number of outstanding HIP events + int64_t gc_count_base{0}; // get_free_blocks_call_count when Block is inserted + std::shared_ptr context_when_allocated; + // only set for the first block in the segment (when prev == null) + // this records the frame information when hipMalloc was called + // whereas context_when_allocated records the last time we handed this + // memory out from our cache. + std::shared_ptr context_when_segment_allocated; + + ExpandableSegment* expandable_segment_{nullptr}; + + Block( + c10::DeviceIndex device, + hipStream_t stream, + size_t size, + BlockPool* pool, + void* ptr) + : device(device), + stream(stream), + stream_uses(), + size(size), + requested_size(0), + pool(pool), + ptr(ptr) {} + + // constructor for search key + Block(c10::DeviceIndex device, hipStream_t stream, size_t size) + : device(device), + stream(stream), + stream_uses(), + size(size), + requested_size(0) {} + + size_t gc_count() { + TORCH_INTERNAL_ASSERT(pool); + return static_cast(pool->get_free_blocks_call_count - gc_count_base); + } + + bool is_split() const { + return (prev != nullptr) || (next != nullptr); + } + void splice(Block* before, Block* after) { + if (before) { + TORCH_INTERNAL_ASSERT(before->next == after); + before->next = this; + } + prev = before; + if (after) { + TORCH_INTERNAL_ASSERT(after->prev == before); + after->prev = this; + } + next = after; + } +}; + +std::pair::iterator, bool> BlockPool:: + insert_into_blocks(Block* block) { + block->gc_count_base = get_free_blocks_call_count; + return blocks.insert(block); +} + +struct SegmentRange { + char* ptr; + size_t size; + SegmentRange(void* p, size_t s) : ptr(static_cast(p)), size(s) {} +}; + +// For now we don't support expandable segments +struct ExpandableSegment { + ExpandableSegment( + c10::DeviceIndex device, + hipStream_t stream, + size_t size, + const std::vector& peers) { + TORCH_INTERNAL_ASSERT(false, "expandable segment not supported"); + } + SegmentRange map(SegmentRange range) { + return SegmentRange(nullptr, 0); + } + SegmentRange unmap(SegmentRange range) { + return SegmentRange(nullptr, 0); + } + char* ptr() const { + return nullptr; + } + size_t size() const { + return 0; + } + void addPeer(c10::DeviceIndex device) {} +}; + +// BlockState, BlockPoolState, and PrivatePoolState contain the information +// needed to reconstruct a private pool to a previous state. 
See note +// [Checkpointing PrivatePoolState] +struct BlockState { + c10::DeviceIndex device = 0; + hipStream_t stream = nullptr; + stream_set stream_uses = {}; + size_t size = 0; + void* ptr = nullptr; + bool allocated = false; + int64_t gc_count_base = 0; + // maintain invariant that event_count == 0 ; + // history will be left alone in checkpoint + + BlockState(Block* block); +}; + +struct SegmentState { + std::vector blocks; + bool is_small = false; + + SegmentState(Block* head); +}; + +struct PrivatePoolState : AllocatorState { + // omitting use_count, and hipMalloc_count as they remain the same + MempoolId_t owner_id = {0, 0}; + + std::vector segments; + + PrivatePoolState( + MempoolId_t pool_id, + const std::vector& private_pool_head_blocks); +}; + +struct RestoreResult { + std::vector allocations_freed; + std::vector allocations_created; +}; + +static bool BlockComparatorSize(const Block* a, const Block* b) { + if (a->stream != b->stream) { + return (uintptr_t)a->stream < (uintptr_t)b->stream; + } + if (a->size != b->size) { + return a->size < b->size; + } + return (uintptr_t)a->ptr < (uintptr_t)b->ptr; +} +static bool BlockComparatorAddress(const Block* a, const Block* b) { + if (a->stream != b->stream) { + return (uintptr_t)a->stream < (uintptr_t)b->stream; + } + return (uintptr_t)a->ptr < (uintptr_t)b->ptr; +} + +struct AllocParams { + AllocParams( + c10::DeviceIndex device, + size_t size, + hipStream_t stream, + BlockPool* pool, + size_t alloc_size, + DeviceStats& stats) + : search_key(device, stream, size), pool(pool), alloc_size(alloc_size) {} + + c10::DeviceIndex device() const { + return search_key.device; + } + hipStream_t stream() const { + return search_key.stream; + } + size_t size() const { + return search_key.size; + } + + Block search_key; + BlockPool* pool; + size_t alloc_size; + Block* block{nullptr}; + StatTypes stat_types = {false}; + hipError_t err{hipSuccess}; +}; + +// Note: cudaEventCreate when concurrently invoked from multiple threads can be +// very expensive (at least on certain device/driver combinations). Thus, we a) +// serialize event creation at a per-device level, and b) pool the events to +// avoid constantly calling cudaEventCreate/cudaEventDestroy. This results in +// significant improvements in multithreaded workloads with high allocation +// rates. +class EventPool { + public: + using Event = std::unique_ptr>; + // TODO: Explicit device count + EventPool() : pools_(c10::zoom::device_count()) {} + + Event get(c10::DeviceIndex device) { + TORCH_INTERNAL_ASSERT(0 <= device); + TORCH_INTERNAL_ASSERT(device < static_cast(pools_.size())); + auto& pool = pools_[device]; + auto destructor = [&pool](hipEvent_t* event) { + std::lock_guard g(pool.mutex_); + pool.event_pool_.push_back(std::unique_ptr(event)); + }; + + // Try to acquire an event from the per-device pool. + { + std::lock_guard g(pool.mutex_); + if (!pool.event_pool_.empty()) { + auto* event = pool.event_pool_.back().release(); + pool.event_pool_.pop_back(); + return Event(event, destructor); + } + } + // otherwise, allocate a new event that will be returned to the pool on + // destruction. 
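+    // The recycling trick used here, sketched in plain C++ without HIP (the
+    // names and types below are illustrative only): a unique_ptr whose custom
+    // deleter hands the object back to the pool instead of destroying it.
+    //
+    //   std::vector<std::unique_ptr<int>> spare;              // stands in for event_pool_
+    //   auto recycle = [&spare](int* p) { spare.emplace_back(p); };
+    //   {
+    //     std::unique_ptr<int, decltype(recycle)> pooled(new int(0), recycle);
+    //   } // `pooled` leaves scope: the int returns to `spare`, it is not deleted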
+ auto new_ptr = std::make_unique(); + C10_ZOOM_CHECK( + hipEventCreateWithFlags(new_ptr.get(), hipEventDisableTiming)); + + return Event(new_ptr.release(), destructor); + } + + void empty_cache() { + for (auto& pool : pools_) { + std::lock_guard g(pool.mutex_); + pool.event_pool_.clear(); + } + } + + private: + struct PerDevicePool { + alignas(64) std::mutex mutex_; + std::vector> event_pool_; + }; + std::vector pools_; +}; + +// HIP graphs helper +struct PrivatePool { + PrivatePool() + : large_blocks(/*small=*/false, this), + small_blocks(/*small=*/true, this) {} + PrivatePool(const PrivatePool&) = delete; + PrivatePool(PrivatePool&&) = delete; + PrivatePool& operator=(const PrivatePool&) = delete; + // Number of live graphs using this pool + int use_count{1}; + // Number of unfreed hipMallocs made for this pool. When use_count and + // hipMalloc_count drop to zero, we can delete this PrivatePool from + // graph_pools. + int hipMalloc_count{0}; + // Instead of maintaining private BlockPools here, I could stuff all blocks + // (private or no) into the top-level large_blocks and small_blocks, and + // distinguish private blocks by adding a "pool id" check above the stream + // check in BlockComparator. BlockComparator is performance- critical though, + // I'd rather not add more logic to it. + BlockPool large_blocks; + BlockPool small_blocks; +}; + +BlockState::BlockState(Block* block) + : stream(block->stream), + stream_uses(block->stream_uses), + size(block->size), + ptr(block->ptr), + allocated(block->allocated), + gc_count_base(block->gc_count_base) { + TORCH_CHECK( + block->event_count == 0, + "Events should have synchronized when checkpointing block"); +}; + +SegmentState::SegmentState(Block* head) { + TORCH_INTERNAL_ASSERT(head->prev == nullptr && head->pool != nullptr); + is_small = head->pool->is_small; + + for (Block* curr = head; curr != nullptr; curr = curr->next) { + blocks.emplace_back(curr); + } +} + +PrivatePoolState::PrivatePoolState( + MempoolId_t pool_id, + const std::vector& private_pool_head_blocks) + : owner_id(std::move(pool_id)) { + for (Block* head : private_pool_head_blocks) { + segments.emplace_back(head); + } +} + +struct MempoolIdHash { + std::size_t operator()(const MempoolId_t& mempool_id) const noexcept { + return mempool_id.first != 0 ? mempool_id.first : mempool_id.second; + } +}; + +hipError_t hipMallocMaybeCapturing(void** p, size_t size) { + if (c10::zoom::currentStreamCaptureStatusMayInitCtx() == + c10::zoom::CaptureStatus::None) { + return C10_ZOOM_ERROR_HANDLED(hipMalloc(p, size)); + } else { + // It's ok to capture hipMallocs, as long as we never hipFree those + // addresses before replay. + // Capturing hipMalloc behaves nicely: it gives the graph new VA, + // but is ignored (won't leakily allocate new memory) in replays. + c10::zoom::ZoomStreamCaptureModeGuard g{hipStreamCaptureModeRelaxed}; + return C10_ZOOM_ERROR_HANDLED(hipMalloc(p, size)); + } +} + +} // anonymous namespace +} // namespace Native + +static std::string reportProcessMemoryInfo(c10::DeviceIndex device) { + return ""; +} + +namespace Native { + +class DeviceCachingAllocator { + private: + // lock around all operations + mutable std::recursive_mutex mutex; + + // device statistics + DeviceStats stats; + + // unallocated cached blocks larger than 1 MB + BlockPool large_blocks; + + // unallocated cached blocks 1 MB or smaller + BlockPool small_blocks; + + // allocated or in use by a stream. 
Holds all active allocations, + // whether they came from graph_pools or one of the BlockPools above. + ska::flat_hash_set active_blocks; + + // captures_underway tracks if we are diverting some + // allocations to a specific pool. + // Most of the time it's empty, in which case malloc can avoid calling + // hipStreamGetCaptureInfo in the hot path. + std::vector>> + captures_underway; + + // See free() for this thing's purpose + std::vector needs_events_deferred_until_no_capture; + // outstanding hip events + ska::flat_hash_map< + zoom::ZoomStream, + std::deque>> + hip_events; + + // record used memory. + size_t total_allocated_memory = 0; + + size_t allowed_memory_maximum = 0; + + // all live expandable segments + std::vector expandable_segments_; + std::vector devices_with_peer_access_; + + bool set_fraction = false; + + bool record_history = false; + + std::atomic context_recorder_; + size_t alloc_trace_next = 0; + RecordContext record_context_ = RecordContext::NEVER; + size_t alloc_trace_max_entries_ = 1; + std::vector* + alloc_trace; // pointer because we need to intentionally leak this on + // deallocation it can hold references to Python state which + // will already be destroyed when we are in exit handlers + + // Members specific to HIP graphs + + // Private pools for HIP graphs + ska::flat_hash_map, MempoolIdHash> + graph_pools; + // Pools no longer referenced by any graph. Their BlockPools are eligible for + // free_blocks. Can't be a vector or deque because we might erase entries in + // any order. Could be an std::list, but we don't care much, access and + // insert/erase are rare. + ska::flat_hash_map + graph_pools_freeable; + + // XXX - maybe we should generalize and have multiple events + std::vector oom_observers_; + + std::vector trace_trackers_; + + public: + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) + DeviceCachingAllocator() + : large_blocks(/*small=*/false), + small_blocks(/*small=*/true), + alloc_trace(new std::vector()) { + stats.max_split_size = + static_cast(ZoomAllocatorConfig::max_split_size()); + context_recorder_.store(nullptr); + } + + void recordHistory( + bool enabled, + CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + RecordContext when) { + std::unique_lock lock(mutex); + TORCH_CHECK(when == RecordContext::NEVER || context_recorder); + record_history = enabled; + context_recorder_.store(record_history ? context_recorder : nullptr); + alloc_trace_max_entries_ = std::max(size_t(1), alloc_trace_max_entries); + record_context_ = enabled ? 
when : RecordContext::NEVER; + if (!enabled) { + alloc_trace_next = 0; + alloc_trace->clear(); + } + } + + bool isHistoryEnabled() { + return record_history; + } + + bool checkPoolLiveAllocations( + MempoolId_t mempool_id, + const std::unordered_set& expected_live_allocations) { + std::unique_lock lock(mutex); + + PrivatePool* pool = nullptr; + auto pool_it = graph_pools.find(mempool_id); + TORCH_CHECK(pool_it != graph_pools.end(), "Could not find pool of id"); + pool = pool_it->second.get(); + + TORCH_INTERNAL_ASSERT(pool != nullptr); + + size_t allocated_pool_blocks = 0; + + for (Block* b : active_blocks) { + TORCH_INTERNAL_ASSERT(b != nullptr); + TORCH_INTERNAL_ASSERT(b->pool != nullptr); + if (b->allocated && b->pool->owner_PrivatePool == pool) { + if (!expected_live_allocations.count(b->ptr)) { + return false; + } + + allocated_pool_blocks += 1; + } + } + + return allocated_pool_blocks == expected_live_allocations.size(); + } + + void attachOutOfMemoryObserver(OutOfMemoryObserver observer) { + oom_observers_.emplace_back(std::move(observer)); + } + + void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) { + std::unique_lock lock(mutex); + trace_trackers_.emplace_back(std::move(tracker)); + } + + // Must be called outside of `mutex` or deadlocks are possible with Python + std::shared_ptr maybeGatherContext(RecordContext level) { + if (record_context_ < level) { + return nullptr; + } + return context_recorder_.load()(); + } + + // All public methods (except the above) acquire the allocator mutex. + // Thus, do not call a public method from another public method. + + Block* malloc( + c10::DeviceIndex device, + size_t orig_size, + hipStream_t stream) { + // done outside the lock because we don't know what locks the recorder needs + // to have... + auto context = maybeGatherContext(RecordContext::STATE); + + std::unique_lock lock(mutex); + + if (C10_LIKELY(captures_underway.empty())) { + // Processes end-of-life events for outstanding allocations used on + // multiple streams (checks if their GPU-side uses are complete and + // recycles their memory if so) + // + // Q. Why skip process_events if a capture might be underway? + // A. process_events involves hipEventQueries, illegal during HIP graph + // capture. + // Dumb simple solution: defer reclaiming these allocations until after + // capture. Cross-stream memory use is uncommon, so the deferral's + // effect on memory use during capture should be small. + process_events(context); + } + size_t size = round_size(orig_size); + auto& pool = get_pool(size, stream); + const size_t alloc_size = get_allocation_size(size); + AllocParams params(device, size, stream, &pool, alloc_size, stats); + params.stat_types = get_stat_types_for_pool(pool); + + // First, try to get a block from the existing pool. + bool block_found = + // Search pool + get_free_block(params) + // Trigger callbacks and retry search + || (trigger_free_memory_callbacks(params) && get_free_block(params)); + + // Can't reuse an existing block; try to get a new one. + if (!block_found) { + // Do garbage collection if the flag is set. + if (C10_UNLIKELY( + set_fraction && + ZoomAllocatorConfig::garbage_collection_threshold() > 0.0)) { + garbage_collect_cached_blocks(context); + } + // Attempt allocate + // WARNING: alloc_block may release the allocator lock when calling + // hipMalloc. So far this function has not modified allocator state, but + // keep in mind that any observed allocator state may change across calls + // to alloc_block since it may release the lock. 
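+      // Rough shape of the fallback chain below (short-circuit ||, each step
+      // runs only if the previous one failed to produce a block):
+      //   1. alloc_block(params, false, ...)  - try a fresh hipMalloc
+      //   2. release_available_cached_blocks, then alloc_block(params, false, ...)
+      //   3. release_cached_blocks, then alloc_block(params, true, ...)
+      //      (step 3 is skipped while a graph capture is underway)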
+ block_found = alloc_block(params, false, context, lock) + // Free enough available cached blocks to satisfy alloc and retry + // alloc. + || (release_available_cached_blocks(params, context) && + alloc_block(params, false, context, lock)) + // Free all non-split cached blocks and retry alloc. + || (C10_LIKELY(captures_underway.empty()) && + release_cached_blocks(context) && + alloc_block(params, true, context, lock)); + } + + if (!block_found) { + // For any error code other than hipErrorMemoryAllocation, + // alloc_block should have thrown an exception already. + TORCH_INTERNAL_ASSERT(params.err == hipErrorMemoryAllocation); + + size_t device_free = 0; + size_t device_total = 0; + C10_ZOOM_CHECK(hipMemGetInfo(&device_free, &device_total)); + std::string allowed_info; + + if (set_fraction) { + allowed_info = format_size(allowed_memory_maximum) + " allowed; "; + } + + std::string proc_info = reportProcessMemoryInfo(device); + + record_trace( + TraceEntry::OOM, + device_free, + params.size(), + params.stream(), + params.device(), + std::move(context)); + stats.num_ooms += 1; + + c10::reportOutOfMemoryToProfiler( + static_cast(size), + stats.allocated_bytes[static_cast(StatType::AGGREGATE)] + .current, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)] + .current, + c10::Device(c10::DeviceType::PrivateUse1, device)); + + auto allocated_bytes = + stats.allocated_bytes[static_cast(StatType::AGGREGATE)] + .current; + auto reserved_bytes = + stats.reserved_bytes[static_cast(StatType::AGGREGATE)] + .current; + auto observers_local = oom_observers_; + + size_t allocated_in_private_pools = 0; + auto get_size_block = [](const BlockPool& pool) { + size_t res = 0; + for (const auto& block : pool.blocks) { + res += block->size; + } + return res; + }; + for (const auto& p : graph_pools) { + allocated_in_private_pools += get_size_block(p.second->large_blocks); + allocated_in_private_pools += get_size_block(p.second->small_blocks); + } + + std::string private_pool_msg; + + if (allocated_in_private_pools > 0) { + private_pool_msg = "with " + format_size(allocated_in_private_pools) + + " allocated in private pools (e.g., HIP Graphs), "; + } + + // Make sure we do not have the device lock before calling our + // observers which might need hold the GIL + // It is safe to release at this point because will no longer + // be reading any allocator state. + + lock.unlock(); + + for (const auto& obs : observers_local) { + obs(device, + alloc_size, + set_fraction ? allowed_memory_maximum : device_total, + device_free); + } + + // "total capacity": total global memory on GPU + // "allowed": memory is allowed to use, which set by fraction. + // "already allocated": memory allocated by the program using the + // caching allocator + // "free": free memory as reported by the HIP API + // "cached": memory held by the allocator but not used by the program + // + // The "allocated" amount does not include memory allocated outside + // of the caching allocator, such as memory allocated by other programs + // or memory held by the driver. + // + // The sum of "allocated" + "free" + "cached" may be less than the + // total capacity due to memory held by the driver and usage by other + // programs. + // + // Note that at this point free_cached_blocks has already returned all + // possible "cached" memory to the driver. The only remaining "cached" + // memory is split from a larger block that is partially in-use. + TORCH_CHECK_WITH( + OutOfMemoryError, + false, + "HIP out of memory. 
Tried to allocate ", + format_size(alloc_size), + ". GPU ", + static_cast(device), + " has a total capacity of ", + format_size(device_total), + " of which ", + format_size(device_free), + " is free. ", + proc_info, + "Of the allocated memory ", + format_size(allocated_bytes + allocated_in_private_pools), + " is allocated by PyTorch, ", + private_pool_msg, + "and ", + format_size( + reserved_bytes - allocated_bytes - allocated_in_private_pools), + " is reserved by PyTorch but unallocated.", + " If reserved but unallocated memory is large try setting", + " PYTORCH_ZOOM_ALLOC_CONF=expandable_segments:True to avoid" + " fragmentation. See documentation for Memory Management " + " (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)"); + } + + bool split_remainder = should_split(params.block, params.size()); + return alloc_found_block( + params, orig_size, std::move(context), split_remainder); + } + + Block* alloc_found_block( + const AllocParams& params, + size_t orig_size, + std::shared_ptr context, + bool split_remainder) { + auto size = params.size(); + auto device = params.device(); + auto pool = params.pool; + auto stream = params.stream(); + + TORCH_INTERNAL_ASSERT( + params.err == hipSuccess && params.block != nullptr && + params.block->ptr != nullptr); + Block* block = params.block; + Block* remaining = nullptr; + + const bool already_split = block->is_split(); + if (split_remainder) { + remaining = block; + + block = new Block(device, stream, size, pool, block->ptr); + block->expandable_segment_ = remaining->expandable_segment_; + block->prev = remaining->prev; + if (block->prev) { + block->prev->next = block; + } + block->next = remaining; + + remaining->prev = block; + remaining->ptr = static_cast(remaining->ptr) + size; + remaining->size -= size; + // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores) + bool inserted = pool->insert_into_blocks(remaining).second; + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inserted); + + if (already_split && !block->expandable_segment_) { + // An already-split inactive block is being shrunk by size bytes. + decrease_stat_array( + stats.inactive_split_bytes, block->size, params.stat_types); + } else if (!block->expandable_segment_) { + // A new split inactive block is being created from a previously unsplit + // block, size remaining->size bytes. 
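+      // Worked example (sizes illustrative): a 5 MiB request served from a
+      // fresh 20 MiB kLargeBuffer segment ends up here with
+      //   block:     5 MiB, about to be handed out, and
+      //   remaining: 15 MiB, inactive, counted below as a new inactive split.
+      // A later request that fits in 15 MiB on the same stream can then be
+      // carved out of `remaining` instead of calling hipMalloc again.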
+ for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) { + increase_stat(stats.inactive_split_bytes[stat_type], remaining->size); + increase_stat(stats.inactive_split[stat_type], 1); + }); + } + + } else if (already_split && !block->expandable_segment_) { + // An already-split block is becoming active + for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) { + decrease_stat(stats.inactive_split_bytes[stat_type], block->size); + decrease_stat(stats.inactive_split[stat_type], 1); + }); + } + + block->allocated = true; + block->requested_size = orig_size; + + block->context_when_allocated = std::move(context); + record_trace( + TraceEntry::ALLOC, + int64_t(block->ptr), + orig_size, + block->stream, + block->device, + block->context_when_allocated); + + // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores) + bool inserted = active_blocks.insert(block).second; + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inserted); + + for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) { + increase_stat(stats.allocation[stat_type], 1); + increase_stat(stats.allocated_bytes[stat_type], block->size); + increase_stat(stats.active[stat_type], 1); + increase_stat(stats.active_bytes[stat_type], block->size); + increase_stat(stats.requested_bytes[stat_type], block->requested_size); + }); + if (block->size >= ZoomAllocatorConfig::max_split_size()) + increase_stat(stats.oversize_allocations, 1); + + c10::reportMemoryUsageToProfiler( + block->ptr, + static_cast(block->size), + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + c10::Device(c10::DeviceType::PrivateUse1, device)); + + return block; + } + + void free(Block* block) { + std::shared_ptr context = + maybeGatherContext(RecordContext::ALL); + std::lock_guard lock(mutex); + + block->allocated = false; + + // following logic might modifying underlaying Block, causing the size + // changed. We store ahead for reporting + auto orig_block_ptr = block->ptr; + auto orig_block_size = block->size; + + StatTypes stat_types = get_stat_types_for_pool(*block->pool); + for_each_selected_stat_type(stat_types, [&](size_t stat_type) { + decrease_stat(stats.allocation[stat_type], 1); + decrease_stat(stats.allocated_bytes[stat_type], block->size); + }); + + record_trace( + TraceEntry::FREE_REQUESTED, + int64_t(block->ptr), + block->requested_size, + block->stream, + block->device, + context ? context : block->context_when_allocated); + + if (block->size >= ZoomAllocatorConfig::max_split_size()) + decrease_stat(stats.oversize_allocations, 1); + + if (!block->stream_uses.empty()) { + if (C10_UNLIKELY(!captures_underway.empty())) { + // It's forbidden to hipEventQuery an event recorded during HIP graph + // capture. 
We conservatively defer recording end-of-life events until + // the next call to process_events() (which won't happen until no + // captures are underway) + needs_events_deferred_until_no_capture.push_back(block); + } else { + insert_events(block); + } + } else { + free_block(block, context); + } + + c10::reportMemoryUsageToProfiler( + orig_block_ptr, + -static_cast(orig_block_size), + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + c10::Device(c10::DeviceType::PrivateUse1, block->device)); + } + + void* getBaseAllocation(Block* block, size_t* outSize) { + std::lock_guard lock(mutex); + TORCH_CHECK( + !block->expandable_segment_, + "Tensors allocated with expandable_segments:True cannot be shared between processes. Consider using expandable_segments:False in data loading workers via torch.cuda.memory._set_allocator_settings('expandable_segments:False')"); + while (block->prev) { + block = block->prev; + } + void* basePtr = block->ptr; + if (outSize) { + size_t size = 0; + while (block) { + size += block->size; + block = block->next; + } + *outSize = size; + } + return basePtr; + } + + void recordStream(Block* block, zoom::ZoomStream stream) { + std::lock_guard lock(mutex); + if (stream.stream() == block->stream) { + // ignore uses on the allocation stream, since those don't require any + // special synchronization + return; + } + block->stream_uses.insert(stream); + } + + /** set memory fraction to limit maximum allocated memory **/ + void setMemoryFraction(double fraction) { + size_t device_free = 0; + size_t device_total = 0; + C10_ZOOM_CHECK(hipMemGetInfo(&device_free, &device_total)); + allowed_memory_maximum = + static_cast(fraction * static_cast(device_total)); + set_fraction = true; + } + + /** returns cached blocks to the system allocator **/ + void emptyCache() { + auto context = maybeGatherContext(RecordContext::ALL); + std::lock_guard lock(mutex); + release_cached_blocks(context); + } + + /** Retrieves size of largest unused block held by the memory cache **/ + void cacheInfo(size_t* largest) { + std::lock_guard lock(mutex); + if (*largest == + 0) { // make an initial guess if a zero *largest is passed in + size_t tmp_bytes = 0; + C10_ZOOM_CHECK(hipMemGetInfo( + largest, // Use free memory as an optimistic initial guess of *largest + &tmp_bytes)); + } + cache_info_aux(large_blocks, largest); + cache_info_aux(small_blocks, largest); + for (const auto& gp : graph_pools) { + cache_info_aux(gp.second->large_blocks, largest); + cache_info_aux(gp.second->small_blocks, largest); + } + } + + /** Returns a copy of the memory allocator stats **/ + DeviceStats getStats() { + std::lock_guard lock(mutex); + return stats; + } + + /** Resets the historical accumulation stats for the device **/ + void resetAccumulatedStats() { + std::lock_guard lock(mutex); + + for (const auto statType : + c10::irange(static_cast(StatType::NUM_TYPES))) { + reset_accumulated_stat(stats.allocation[statType]); + reset_accumulated_stat(stats.segment[statType]); + reset_accumulated_stat(stats.active[statType]); + reset_accumulated_stat(stats.inactive_split[statType]); + reset_accumulated_stat(stats.allocated_bytes[statType]); + reset_accumulated_stat(stats.reserved_bytes[statType]); + reset_accumulated_stat(stats.active_bytes[statType]); + reset_accumulated_stat(stats.inactive_split_bytes[statType]); + reset_accumulated_stat(stats.requested_bytes[statType]); + } + + stats.num_alloc_retries = 0; + stats.num_ooms = 0; + 
stats.num_sync_all_streams = 0; + stats.num_device_alloc = 0; + stats.num_device_free = 0; + reset_accumulated_stat(stats.oversize_allocations); + reset_accumulated_stat(stats.oversize_segments); + } + + /** Resets the historical peak stats for the device **/ + void resetPeakStats() { + std::lock_guard lock(mutex); + + for (const auto statType : + c10::irange(static_cast(StatType::NUM_TYPES))) { + reset_peak_stat(stats.allocation[statType]); + reset_peak_stat(stats.segment[statType]); + reset_peak_stat(stats.active[statType]); + reset_peak_stat(stats.inactive_split[statType]); + reset_peak_stat(stats.allocated_bytes[statType]); + reset_peak_stat(stats.reserved_bytes[statType]); + reset_peak_stat(stats.active_bytes[statType]); + reset_peak_stat(stats.inactive_split_bytes[statType]); + reset_peak_stat(stats.requested_bytes[statType]); + } + reset_peak_stat(stats.oversize_allocations); + reset_peak_stat(stats.oversize_segments); + } + + /* Checkpoint the state of a private pool necessary to return it to its + * current state */ + std::unique_ptr getCheckpointState(MempoolId_t id) { + std::lock_guard lock(mutex); + + auto pool = graph_pools.find(id); + if (pool != graph_pools.end()) { + auto private_pool_head_blocks = + get_private_pool_head_blocks(pool->second.get()); + return std::make_unique(id, private_pool_head_blocks); + } else if (graph_pools_freeable.count(id)) { + TORCH_CHECK(false, "Not expected to checkpoint freeable graph"); + } else { + TORCH_CHECK(false, "Could not find pool of id"); + } + } + + void freeBlocksAllocatedToPool(PrivatePool* private_pool, RestoreResult& rr) { + auto pool_blocks = get_private_pool_head_blocks(private_pool); + + std::vector head_blocks; + for (Block* block : pool_blocks) { + if (block->prev == nullptr) { + head_blocks.push_back(block); + } + } + + for (Block* block : head_blocks) { + Block* curr = block; + + while (curr) { + // When we free a block, its pointer should never change + // only its adjacent blocks, so free, then look at pointer + if (curr->allocated) { + TORCH_CHECK( + curr->event_count == 0, + "Events should have synchronized when setting checkpointed block"); + rr.allocations_freed.push_back(curr->ptr); + free(curr); + TORCH_CHECK(!curr->allocated) + } + curr = curr->next; + } + } + + for (Block* b : get_private_pool_head_blocks(private_pool)) { + Block* curr = b; + while (curr) { + TORCH_CHECK(!curr->allocated); + curr = curr->next; + } + } + } + + // checkpoint the state of an allocation that may have been + // split into multiple blocks + void setSegmentStateToCheckpoint( + Block* block, + SegmentState& segment, + const std::shared_ptr& context, + RestoreResult& rr) { + Block* curr_block = block; + Block* last_block = block; + + TORCH_INTERNAL_ASSERT(block->pool); + BlockPool& pool = *block->pool; + const auto segment_len = segment.blocks.size(); + + // allocate all blocks in the segment + for (size_t i = 0; i < segment_len; ++i) { + auto& block_state = segment.blocks.at(i); + AllocParams params( + block_state.device, + block_state.size, + block_state.stream, + &pool, + block_state.size, + stats); + pool.blocks.erase(curr_block); + params.block = curr_block; + params.stat_types = get_stat_types_for_pool(pool); + + // splitting a block depends on `max_split_size`, which may have changed + // between whe checkpoint was taken and now, so we make sure to recreate + // the behavior from the checkpoint. 
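+      // Concretely: a segment checkpointed as three blocks [A][B][C] is
+      // re-created by allocating A and B with split=true and C (the last
+      // block) with split=false, reproducing the original layout even if
+      // the current max_split_size setting would forbid splitting today.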
+ bool split = (i + 1) < segment.blocks.size(); + + // curr_block will become next pointer if it is split, so reassign with + // the returned value + curr_block = alloc_found_block(params, block_state.size, context, split); + + TORCH_CHECK(curr_block->ptr == block_state.ptr); + TORCH_CHECK(curr_block->size == block_state.size); + + last_block = curr_block; + curr_block = curr_block->next; + + TORCH_CHECK((curr_block != nullptr) == ((i + 1) < (segment_len))); + } + + while (last_block->prev) { + last_block = last_block->prev; + } + + // free blocks that are not allocated in the checkpoint + curr_block = last_block; + + for (size_t i = 0; i < segment_len; ++i, curr_block = curr_block->next) { + auto& block_state = segment.blocks.at(i); + TORCH_INTERNAL_ASSERT(curr_block != nullptr); + + if (block_state.allocated) { + rr.allocations_created.push_back(curr_block); + continue; + } + + free(curr_block); + + TORCH_CHECK(curr_block->ptr == block_state.ptr); + TORCH_CHECK(curr_block->allocated == block_state.allocated); + TORCH_CHECK(curr_block->size == block_state.size); + } + } + + /** + * Note [Checkpointing PrivatePoolState] + * + * Refer above to Note [Interaction with HIP graph capture]. Allocations made + * during graph capture are made from a separate private pool. During graph + * capture allocations behave as usual. During graph replay the allocator + * state does not change even as new tensors are created. The private pool + * will not free its blocks to the main caching allocator until cuda graph use + * is finished to prevent an allocation from eager clobbering the memory from + * a live but unaccounted for tensor that was created during replay. + * + * `make_graphed_callables`, a series of separate callables chained in + * successive cuda graphs, can share a memory pool because after a cuda graph + * recording the allocations in the shared private pool exactly reflect the + * tensors that are allocated. + * + * We would like to extend callable chaining to support a graphed callable + * tree. In this scenario, we have a tree of callable chains which will be + * captured with cuda graphs. In the diagram below, we have a tree with four + * callables, A, B, C, and D. Suppose we have captured, and subsequently + * replayed, A, B, and C. Then on a new invocation, we replay A and B, but + * would now like to record D. At this point the private pool will not reflect + * any of the live tensors created during graph replay. Allocations made + * during a new recording with the pool could overwrite those live tensors. + * + * In order to record a new graph capture after replaying prior callables in + * the tree, we need the allocator to reflect the state of the live tensors. + * We checkpoint the state of the private pool after each recording, and then + * reapply it when we are starting a new recording chain. Additionally, we + * must free the allocations for any tensors that died between the end of our + * previous graph replaying and our new recording. All of the allocated + * segments that existed in the checkpointed state must still exist in the + * pool. There may also exist new allocated blocks. + * (TODO : link note [live tensors between iterations] when it exists). For + * every block that is currently allocated but no allocated in the snapshot, + * we will return a pointer to their block. + *. 
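+ * Roughly, in terms of the entry points defined further down in this file:
+ *   - after recording B, save    chk = getCheckpointState(pool_id)
+ *   - replay A and B as usual
+ *   - before recording D, call   setCheckpointPoolState(*chk), which frees
+ *     whatever the pool currently holds and re-creates the allocations that
+ *     were live at the checkpoint (the returned RestoreResult reports what
+ *     was freed and what was re-created so the caller can fix up its tensors).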
+ * + * + * ---------------> A ---------------> B ---------------> C + * | + * | + * | + * | + * ╰ ---------------> D + */ + RestoreResult setCheckpointPoolState(PrivatePoolState& pps) { + // To reset the caching allocator state we will + // - Free all the blocks currently allocated to the pool (see [live tensors + // between iterations]) + // - Allocate all the blocks in a checkpointed segment, whether they are + // live or not + // - Free the blocks in a checkpointed segment which are not live + // This could be optimized, but it nicely reuses exiting apis, and this + // is not on the hot path. + + // following `done outside the lock because we don't know what locks the + // recorder needs to have...` + + std::shared_ptr context = + maybeGatherContext(RecordContext::STATE); + + std::lock_guard lock(mutex); + + RestoreResult rr; + + TORCH_CHECK( + !graph_pools_freeable.count(pps.owner_id), + "Not expected to checkpoint freeable graph"); + + auto pool = graph_pools.find(pps.owner_id); + TORCH_CHECK(pool != graph_pools.end(), "Could not find private pool id"); + + PrivatePool* private_pool = pool->second.get(); + + freeBlocksAllocatedToPool(private_pool, rr); + + std::unordered_map ptrs_to_blocks; + // at this point, all of the blocks should be free, so they will all be in + // the block set + for (Block* block : private_pool->small_blocks.blocks) { + ptrs_to_blocks[block->ptr] = block; + } + for (Block* block : private_pool->large_blocks.blocks) { + ptrs_to_blocks[block->ptr] = block; + } + + for (auto& segment : pps.segments) { + auto ptr = segment.blocks.at(0).ptr; + TORCH_CHECK(ptrs_to_blocks.count(ptr), " could not find ", ptr) + auto block = ptrs_to_blocks[ptr]; + + setSegmentStateToCheckpoint(block, segment, context, rr); + } + return rr; + } + + /** Dump a complete snapshot of the memory held by the allocator. Potentially + * VERY expensive. 
**/ + std::vector snapshot() { + std::lock_guard lock(mutex); + + std::unordered_map pool_to_id; + pool_to_id.reserve(graph_pools.size() + graph_pools_freeable.size()); + for (const auto& pair : graph_pools) { + pool_to_id[pair.second.get()] = pair.first; + } + for (const auto& pair : graph_pools_freeable) { + pool_to_id[pair.second] = pair.first; + } + + size_t total_active = 0; + std::vector result; + const auto all_blocks = get_all_blocks(); + + for (const Block* const head_block : all_blocks) { + // For expandable segments, we report one segment for each contiguous + // mapped range of memory + if (head_block->prev && head_block->prev->mapped) { + continue; + } + result.emplace_back(); + SegmentInfo& segment_info = result.back(); + segment_info.device = head_block->device; + segment_info.address = reinterpret_cast(head_block->ptr); + segment_info.stream = head_block->stream; + segment_info.is_large = (!head_block->pool->is_small); + segment_info.is_expandable = head_block->expandable_segment_; + segment_info.context_when_allocated = + head_block->context_when_segment_allocated; + auto mempool_id = pool_to_id.find(head_block->pool->owner_PrivatePool); + if (mempool_id != pool_to_id.end()) { + segment_info.owner_private_pool_id = mempool_id->second; + } + + const Block* block = head_block; + while (block != nullptr && block->mapped) { + segment_info.blocks.emplace_back(); + BlockInfo& block_info = segment_info.blocks.back(); + + block_info.size = block->size; + block_info.requested_size = block->requested_size; + block_info.allocated = block->allocated; + block_info.active = block->allocated || (block->event_count > 0) || + !block->stream_uses.empty(); + + segment_info.total_size += block_info.size; + if (block_info.allocated) { + segment_info.allocated_size += block_info.size; + } + if (block_info.active) { + segment_info.active_size += block_info.size; + segment_info.requested_size += block_info.requested_size; + } + block_info.context_when_allocated = block->context_when_allocated; + block = block->next; + } + total_active += segment_info.active_size; + } + + std::sort( + result.begin(), + result.end(), + [](const SegmentInfo& a, const SegmentInfo& b) { + return a.address < b.address; + }); + + record_trace(TraceEntry::SNAPSHOT, 0, total_active, nullptr, 0, nullptr); + return result; + } + + std::vector trace( + const std::function& tsc_to_us) { + std::lock_guard lock(mutex); + std::vector result; + result.reserve(alloc_trace->size()); + result.insert( + result.end(), + alloc_trace->begin() + + static_cast::difference_type>( + alloc_trace_next), + alloc_trace->end()); + result.insert( + result.end(), + alloc_trace->begin(), + alloc_trace->begin() + + static_cast::difference_type>( + alloc_trace_next)); + + // Convert all the timestamps from tsc to epoch time in microseconds. + for (auto& te : result) { + te.time_.t_ = tsc_to_us(te.time_.approx_t_); + } + return result; + } + + // This function takes the size and number of divisions argument and rounds + // up the size argument for the nearest power-of-2 division. + // For example, if we need to round-up 1200 and number of divisions is 4, + // the size 1200 lies between 1024 and 2048 and if we do 4 divisions between + // them, the values are 1024, 1280, 1536, and 1792. So the function will + // return 1280 as the nearest ceiling of power-2 divison. 
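+  //
+  // For concreteness, a sketch of the intermediate values for the example
+  // above (assuming llvm::PowerOf2Floor and llvm::countLeadingZeros behave
+  // as documented):
+  //   roundup_power2_next_division(1200, 4)
+  //     power2_floor     = 1024                         // largest power of 2 <= 1200
+  //     power2_divison   = 1024 >> (63 - 61) = 256      // one of 4 equal steps toward 2048
+  //     round_size_floor = 1200 & ~(256 - 1) = 1024
+  //     result           = 1024 + 256 = 1280            // smallest division boundary >= 1200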
+ static size_t roundup_power2_next_division(size_t size, size_t divisions) { + if (C10_UNLIKELY(size <= 4 || divisions <= 1)) { + return size; + } + if (llvm::isPowerOf2_64(size)) { + return size; + } + + // divide the space between these 2's power into equal divisions + // If division is zero, return the power-of-2 ceiling. + size_t power2_floor = llvm::PowerOf2Floor(size); + size_t power2_divison = + power2_floor >> (63 - llvm::countLeadingZeros(divisions)); + if (C10_UNLIKELY(power2_divison == 0)) { + return (power2_floor << 1); + } + size_t round_size_floor = size & (~(power2_divison - 1)); + return (round_size_floor == size) ? size + : round_size_floor + power2_divison; + } + + static size_t round_size(size_t size) { + if (size < kMinBlockSize) { + return kMinBlockSize; + } else { + auto divisions = ZoomAllocatorConfig::roundup_power2_divisions(size); + if (divisions > 0 && size > (kMinBlockSize * divisions)) { + return roundup_power2_next_division(size, divisions); + } else { + return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize); + } + } + } + + // See Note [Interaction with HIP graph capture] + + // Called by CUDAGraph::capture_begin + void beginAllocateToPool( + MempoolId_t mempool_id, + std::function filter) { + std::lock_guard lock(mutex); + auto it = graph_pools.find(mempool_id); + if (it == graph_pools.end()) { + // mempool_id does not reference an existing pool. Make a new pool for + // this capture. + graph_pools.emplace(mempool_id, std::make_unique()); + } else { + // mempool_id references an existing pool, which the current capture will + // share. Check this pool is live (at least one other capture already + // references it). + TORCH_INTERNAL_ASSERT(it->second->use_count > 0); + it->second->use_count++; + } + for (auto it2 = captures_underway.begin(); it2 != captures_underway.end(); + ++it2) { + TORCH_CHECK( + it2->first != mempool_id, + "beginAllocateToPool: already recording to mempool_id"); + } + captures_underway.emplace_back(mempool_id, std::move(filter)); + } + + // Called by CUDAGraph::capture_end + void endAllocateToPool(MempoolId_t mempool_id) { + std::lock_guard lock(mutex); + for (auto it = captures_underway.begin(); it != captures_underway.end(); + ++it) { + if (it->first == mempool_id) { + captures_underway.erase(it); + return; + } + } + TORCH_CHECK( + false, "endAllocatePool: not currently recording to mempool_id"); + } + + // Called by CUDAGraph::reset + void releasePool(MempoolId_t mempool_id) { + std::lock_guard lock(mutex); + // The instantiated cudaGraphExec_t has been destroyed. We can't blindly + // delete and hipFree the mempool its capture used, because + // 1. other graph(s) might share the same pool + // 2. the user might still hold references to output tensors allocated + // during capture. + // To handle 1 and 2, we track the number of graphs using this particular + // mempool. When the count reaches 0, we tell free_cached_blocks it may now + // hipFree blocks from this graph's pool when it discovers they're unused + // (unsplit). + auto it = graph_pools.find(mempool_id); + TORCH_INTERNAL_ASSERT(it != graph_pools.end()); + auto uc = --(it->second->use_count); + TORCH_INTERNAL_ASSERT(uc >= 0); + if (uc == 0) { + // Allows free_cached_blocks to begin hipFreeing this pool's memory, + // and makes sure this pool wasn't somehow made freeable already. 
+ // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores) + bool inserted = + graph_pools_freeable.insert({mempool_id, it->second.get()}).second; + TORCH_INTERNAL_ASSERT(inserted); + } + } + + void addPeerAccess(c10::DeviceIndex dev_to_access) { + if (std::find( + devices_with_peer_access_.begin(), + devices_with_peer_access_.end(), + dev_to_access) != devices_with_peer_access_.end()) { + return; + } + devices_with_peer_access_.push_back(dev_to_access); + for (auto& es : expandable_segments_) { + es->addPeer(dev_to_access); + } + } + + bool hasAllocatedExpandableSegments() const { + return !expandable_segments_.empty(); + } + + private: + // All private methods do not acquire the allocator mutex. + + std::vector get_all_blocks() const { + std::vector blocks; + blocks.insert( + blocks.end(), small_blocks.blocks.begin(), small_blocks.blocks.end()); + blocks.insert( + blocks.end(), large_blocks.blocks.begin(), large_blocks.blocks.end()); + for (const auto& gp : graph_pools) { + blocks.insert( + blocks.end(), + gp.second->small_blocks.blocks.begin(), + gp.second->small_blocks.blocks.end()); + blocks.insert( + blocks.end(), + gp.second->large_blocks.blocks.begin(), + gp.second->large_blocks.blocks.end()); + } + blocks.insert(blocks.end(), active_blocks.begin(), active_blocks.end()); + return blocks; + } + + std::vector get_private_pool_head_blocks(PrivatePool* pool) const { + std::vector blocks; + for (Block* b : active_blocks) { + if ((b->pool == &pool->small_blocks || b->pool == &pool->large_blocks) && + b->prev == nullptr) { + blocks.push_back(b); + } + } + + for (Block* b : pool->small_blocks.blocks) { + if (b->prev == nullptr) { + blocks.push_back(b); + } + } + for (Block* b : pool->large_blocks.blocks) { + if (b->prev == nullptr) { + blocks.push_back(b); + } + } + + return blocks; + } + + // returns the smallest possible address in any segment + // where there is enough free address space to fit size + // may be composed of free and unmapped segments + Block* find_expandable_block( + c10::DeviceIndex device, + hipStream_t stream, + BlockPool* pool, + size_t size) { + Block key(device, stream, 0); + + auto allocatable = [](Block* b) { + return b && !b->allocated && b->event_count == 0 && + b->stream_uses.empty(); + }; + auto has_available_address_space = [&](Block* b) { + size_t bytes = 0; + while (bytes < size && allocatable(b)) { + bytes += b->size; + b = b->next; + } + return bytes >= size; + }; + for (auto it = pool->unmapped.lower_bound(&key); + it != pool->unmapped.end() && (*it)->stream == stream; + ++it) { + Block* c = *it; + // we found the lowest address of an unmapped segment + // but there might be a free segment we can also use + // right before it + if (allocatable(c->prev)) { + c = c->prev; + } + if (has_available_address_space(c)) { + return c; + } + } + auto segment_size = pool->is_small ? 
kSmallBuffer : kLargeBuffer; + expandable_segments_.emplace_back(new ExpandableSegment( + device, stream, segment_size, devices_with_peer_access_)); + + ExpandableSegment* es = expandable_segments_.back(); + Block* candidate = new Block(device, stream, es->size(), pool, es->ptr()); + candidate->mapped = false; + candidate->expandable_segment_ = es; + pool->unmapped.insert(candidate); + return candidate; + } + + bool map_block( + Block* to_map, + size_t size, + const std::shared_ptr& ctx) { + TORCH_INTERNAL_ASSERT(!to_map->mapped && size <= to_map->size); + TORCH_INTERNAL_ASSERT( + !to_map->context_when_allocated); // unmapped blocks should not keep + // history + auto mapped_range = + to_map->expandable_segment_->map(SegmentRange{to_map->ptr, size}); + // failed to map the memory + if (mapped_range.size == 0) { + return false; + } + TORCH_INTERNAL_ASSERT( + mapped_range.ptr == to_map->ptr && mapped_range.size >= size); + + BlockPool& pool = *to_map->pool; + pool.unmapped.erase(to_map); + to_map->mapped = true; + + if (mapped_range.size < to_map->size) { + // to_map -> remaining -> to_map->next(?) + Block* remaining = new Block( + to_map->device, + to_map->stream, + to_map->size - mapped_range.size, + &pool, + static_cast(to_map->ptr) + mapped_range.size); + remaining->mapped = false; + remaining->expandable_segment_ = to_map->expandable_segment_; + remaining->splice(to_map, to_map->next); + pool.unmapped.insert(remaining); + to_map->size = mapped_range.size; + } + + try_merge_blocks(to_map, to_map->prev, pool); + try_merge_blocks(to_map, to_map->next, pool); + + pool.insert_into_blocks(to_map); + + // update statistics + total_allocated_memory += mapped_range.size; + StatTypes stat_types = get_stat_types_for_pool(*to_map->pool); + for_each_selected_stat_type(stat_types, [&](size_t stat_type) { + increase_stat(stats.reserved_bytes[stat_type], mapped_range.size); + }); + + stats.num_device_alloc++; + record_trace( + TraceEntry::SEGMENT_MAP, + int64_t(mapped_range.ptr), + mapped_range.size, + to_map->stream, + to_map->device, + ctx); + if (!to_map->prev && !to_map->context_when_segment_allocated) { + to_map->context_when_segment_allocated = ctx; + } + + return true; + } + + Block* try_allocate_expandable_block( + c10::DeviceIndex device, + hipStream_t stream, + BlockPool* pool, + size_t size, + const std::shared_ptr& ctx) { + Block* candidate = find_expandable_block(device, stream, pool, size); + // Candidate is now a list free/unmapped blocks with at least size room: + // unmapped -> null + // unmapped -> free -> * + // free -> unmapped -> * + + if (!candidate->mapped && + !map_block(candidate, std::min(candidate->size, size), ctx)) { + return nullptr; + } + TORCH_INTERNAL_ASSERT(candidate->mapped); + + while (candidate->size < size) { + // invariant: free -> unmapped -> * + // map_block will map some of unmapped and merge with free + auto remaining = size - candidate->size; + auto new_candidate = candidate->next; + if (!map_block( + new_candidate, std::min(remaining, candidate->next->size), ctx)) { + return nullptr; + } + candidate = new_candidate; + } + pool->blocks.erase(candidate); + return candidate; + } + + /** moves a block into a pool of cached free blocks */ + void free_block( + Block* block, + const std::shared_ptr& context) { + TORCH_INTERNAL_ASSERT( + !block->allocated && block->event_count == 0 && + block->stream_uses.empty()); + + record_trace( + TraceEntry::FREE_COMPLETED, + int64_t(block->ptr), + block->requested_size, + block->stream, + block->device, + context ? 
context : block->context_when_allocated); + + block->context_when_allocated = nullptr; + size_t original_block_size = block->size; + size_t requested_size = block->requested_size; + + auto& pool = *block->pool; + int64_t net_change_inactive_split_blocks = 0; + int64_t net_change_inactive_split_size = 0; + + const std::array merge_candidates = {block->prev, block->next}; + for (Block* merge_candidate : merge_candidates) { + const auto subsumed_size = try_merge_blocks(block, merge_candidate, pool); + if (subsumed_size > 0) { + net_change_inactive_split_blocks -= 1; + net_change_inactive_split_size -= static_cast(subsumed_size); + } + } + + active_blocks.erase(block); + // Makes sure the Block* isn't already present in the pool we're freeing it + // back into. + // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores) + bool inserted = pool.insert_into_blocks(block).second; + TORCH_INTERNAL_ASSERT(inserted); + + if (block->is_split()) { + net_change_inactive_split_blocks += 1; + net_change_inactive_split_size += static_cast(block->size); + } + + StatTypes stat_types = get_stat_types_for_pool(pool); + + for_each_selected_stat_type(stat_types, [&](size_t stat_type) { + // inactive_split tries to capture the idea that blocks + // cannot be freed when requested, but fully free pages + // of expandable blocks can always be freed. + // The logic to track this as statistic is pretty involved, + // so we simply just exclude expandable segments from + // inactive_split + if (!block->expandable_segment_) { + if (net_change_inactive_split_blocks > 0) { + increase_stat( + stats.inactive_split[stat_type], + static_cast(net_change_inactive_split_blocks)); + } else if (net_change_inactive_split_blocks < 0) { + decrease_stat( + stats.inactive_split[stat_type], + static_cast(-net_change_inactive_split_blocks)); + } + if (net_change_inactive_split_size > 0) { + increase_stat( + stats.inactive_split_bytes[stat_type], + static_cast(net_change_inactive_split_size)); + } else if (net_change_inactive_split_size < 0) { + decrease_stat( + stats.inactive_split_bytes[stat_type], + static_cast(-net_change_inactive_split_size)); + } + } + decrease_stat(stats.active[stat_type], 1); + decrease_stat(stats.active_bytes[stat_type], original_block_size); + decrease_stat(stats.requested_bytes[stat_type], requested_size); + }); + } + + /** combine previously split blocks. returns the size of the subsumed block, + * or 0 on failure. */ + size_t try_merge_blocks(Block* dst, Block* src, BlockPool& pool) { + if (!src || src->allocated || src->event_count > 0 || + !src->stream_uses.empty() || dst->mapped != src->mapped) { + return 0; + } + + AT_ASSERT(dst->is_split() && src->is_split()); + + if (dst->prev == src) { // [src dst] + dst->ptr = src->ptr; + dst->prev = src->prev; + if (dst->prev) { + dst->prev->next = dst; + } + dst->context_when_segment_allocated = + std::move(src->context_when_segment_allocated); + } else { // [dest src] + dst->next = src->next; + if (dst->next) { + dst->next->prev = dst; + } + } + const size_t subsumed_size = src->size; + dst->size += subsumed_size; + // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores) + auto erased = + src->mapped ? pool.blocks.erase(src) : pool.unmapped.erase(src); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(erased == 1); + delete src; + + return subsumed_size; + } + + BlockPool& get_pool(size_t size, hipStream_t stream) { + // captures_underway is a conservative guess that the current stream may be + // capturing. 
It's only non-empty if some thread has begun and not yet ended + // a capture, so it's usually 0, and we can short-circuit + // hipStreamCaptureStatus (which does a TLS lookup). + if (C10_UNLIKELY(!captures_underway.empty())) { + for (auto& entry : captures_underway) { + if (entry.second(stream)) { + auto it1 = graph_pools.find(entry.first); + TORCH_INTERNAL_ASSERT(it1 != graph_pools.end()); + if (size <= kSmallSize) { + return it1->second->small_blocks; + } else { + return it1->second->large_blocks; + } + } + } + } + if (size <= kSmallSize) { + return small_blocks; + } else { + return large_blocks; + } + } + + StatTypes get_stat_types_for_pool(const BlockPool& pool) { + StatTypes stat_types = {false}; + stat_types[static_cast(StatType::AGGREGATE)] = true; + stat_types[static_cast( + pool.is_small ? StatType::SMALL_POOL : StatType::LARGE_POOL)] = true; + return stat_types; + } + + bool should_split(const Block* block, size_t size) { + size_t remaining = block->size - size; + if (block->pool->is_small || ZoomAllocatorConfig::expandable_segments()) { + return remaining >= kMinBlockSize; + } else { + return (size < ZoomAllocatorConfig::max_split_size()) && + (remaining > kSmallSize); + } + } + + static size_t get_allocation_size(size_t size) { + if (size <= kSmallSize) { + return kSmallBuffer; + } else if (size < kMinLargeAlloc) { + return kLargeBuffer; + } else { + return kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge); + } + } + + bool get_free_block(AllocParams& p) { + BlockPool& pool = *p.pool; + + if (C10_UNLIKELY( + set_fraction && + ZoomAllocatorConfig::garbage_collection_threshold() > 0.0)) { + // Track block reuse interval only when garbage collection is enabled. + ++pool.get_free_blocks_call_count; + } + auto it = pool.blocks.lower_bound(&p.search_key); + if (it == pool.blocks.end() || (*it)->stream != p.stream()) + return false; + + if ((*it)->expandable_segment_) { + if (ZoomAllocatorConfig::expandable_segments()) { + // if we are allocated to the part of the block that is expandable + // for the purposes of "best fit" we consider its size to be the size it + // can expand to, not the size it currently is. This means that we + // sometimes have to search for blocks with bigger 'size' before + // choosing this segment. + auto expandable_size = [](Block* b) { + return b->size + (b->next && !b->next->mapped ? b->next->size : 0); + }; + auto next = it; + next++; + while ((*it)->expandable_segment_ && next != pool.blocks.end() && + (*next)->stream == p.stream() && + expandable_size(*next) < expandable_size(*it)) { + it = next++; + } + } else { + // Rarely expandable segments has been turned off after we have + // already allocated some blocks as expandable. For instance, + // since we cannot share expandable memory via IPC, someone might + // temporarily disable it. 
+        // In this case we need to honor this request
+        // by only finding non-expandable blocks
+        do {
+          it++;
+        } while (it != pool.blocks.end() && (*it)->expandable_segment_ &&
+                 (*it)->stream == p.stream());
+        if (it == pool.blocks.end() || (*it)->stream != p.stream()) {
+          return false;
+        }
+      }
+    }
+
+    // Do not return an oversized block for a large request
+    if ((p.size() < ZoomAllocatorConfig::max_split_size()) &&
+        ((*it)->size >= ZoomAllocatorConfig::max_split_size()))
+      return false;
+    // Allow oversized block size to be rounded up but within a limit
+    if ((p.size() >= ZoomAllocatorConfig::max_split_size()) &&
+        ((*it)->size >= p.size() + kLargeBuffer))
+      return false;
+    p.block = *it;
+    pool.blocks.erase(it);
+    return true;
+  }
+
+  bool trigger_free_memory_callbacks(AllocParams& p) {
+    bool freed_memory = false;
+    for (const auto& name : FreeZoomMemoryCallbacksRegistry()->Keys()) {
+      freed_memory |=
+          FreeZoomMemoryCallbacksRegistry()->Create(name)->Execute();
+    }
+    return freed_memory;
+  }
+
+  void garbage_collect_cached_blocks(
+      const std::shared_ptr<GatheredContext>& context) {
+    // Free unused cached blocks to reclaim GPU memory.
+    // Unlike release_cached_blocks(), this does not enforce synchronization
+    // and therefore should have less overhead.
+
+    size_t gc_threshold = static_cast<size_t>(
+        ZoomAllocatorConfig::garbage_collection_threshold() *
+        static_cast<double>(allowed_memory_maximum));
+    // No need to trigger GC yet
+    if (total_allocated_memory <= gc_threshold) {
+      return;
+    }
+    const auto target_size = total_allocated_memory - gc_threshold;
+    size_t gc_reclaimed = 0;
+
+    // Calculate the total age of the free-able blocks. We'll use it later to
+    // get "avg age" threshold.
+    size_t total_age = 0;
+    int freeable_block_count = 0;
+    for (auto& b : large_blocks.blocks) {
+      if (!b->is_split()) {
+        total_age += b->gc_count();
+        ++freeable_block_count;
+      }
+    }
+    // No free-able blocks?
+    if (freeable_block_count == 0) {
+      return;
+    }
+
+    // Repeat GC until we reach reclaim > target size.
+    bool block_freed = true;
+    while (gc_reclaimed < target_size && block_freed == true &&
+           freeable_block_count > 0) {
+      // Free blocks exceeding this age threshold first.
+      double age_threshold =
+          static_cast<double>(total_age) / freeable_block_count;
+      // Stop iteration if we can no longer free a block.
+      block_freed = false;
+
+      // Free blocks of > avg age. Don't stop upon reaching the target_size,
+      // we don't want this GC to be triggered frequently.
+      auto it = large_blocks.blocks.begin();
+      while (it != large_blocks.blocks.end()) {
+        Block* block = *it;
+        ++it;
+        if (!block->is_split() &&
+            static_cast<double>(block->gc_count()) >= age_threshold) {
+          block_freed = true;
+          gc_reclaimed += block->size;
+          total_age -= block->gc_count(); // Decrement the age
+          freeable_block_count--; // One less block that can be freed
+          release_block(block, context);
+        }
+      }
+    }
+  }
+
+  // This function assumes that the global lock has been taken while calling
+  // into this function. We do a hipMalloc sync call in this function which
+  // can be expensive while holding the lock. Hence, we pass in the lock to the
+  // function to temporarily release the lock before the hipMalloc call and
+  // acquire it back again after the call so that other threads don't get
+  // blocked.
+  bool alloc_block(
+      AllocParams& p,
+      bool isRetry,
+      const std::shared_ptr<GatheredContext>& ctx,
+      std::unique_lock<std::recursive_mutex>& lock) {
+    // Defensively checks for preexisting HIP error state.
+ C10_ZOOM_CHECK(hipGetLastError()); + + size_t size = p.alloc_size; + void* ptr = nullptr; + + if (isRetry) { + stats.num_alloc_retries += 1; + } + + if (set_fraction && + total_allocated_memory + size > allowed_memory_maximum) { + p.err = hipErrorMemoryAllocation; + return false; + } else if ( + ZoomAllocatorConfig::expandable_segments() && + // our checkpointing logic for private pools doesn't support + // the expandable_segments_ structure yet + !p.pool->owner_PrivatePool) { + p.block = try_allocate_expandable_block( + p.device(), p.stream(), p.pool, p.size(), ctx); + if (p.block) { + p.err = hipSuccess; + } else { + p.err = hipErrorMemoryAllocation; + } + return bool(p.block); + } else { + if (ZoomAllocatorConfig::release_lock_on_hipMalloc()) { + // At scope exit, acquire the lock again. This provides safety against + // any potential exceptions in the hipMallocMaybeCapturing function. + auto sg = c10::make_scope_exit([&]() { lock.lock(); }); + lock.unlock(); + p.err = hipMallocMaybeCapturing(&ptr, size); + } else { + p.err = hipMallocMaybeCapturing(&ptr, size); + } + if (ZoomAllocatorConfig::release_lock_on_hipMalloc()) { + TORCH_CHECK( + lock.owns_lock(), "Failed to acquire lock after hipMalloc"); + } + + if (p.err != hipSuccess) { + if (p.err == hipErrorMemoryAllocation) { + // If this is the first attempt (!isRetry), we can forgive and clear + // HIP's internal error state. + // + // If this is the second attempt (isRetry), malloc's TORCH_CHECK_WITH + // will take over to throw a helpful exception. The user can choose + // to catch the exception, free some stuff in their script, and + // attempt the allocation again. In this case, we can also forgive and + // clear HIP's internal error state. + (void)hipGetLastError(); + } else { + // If the error's unrelated to memory allocation, we should throw + // immediately. + C10_ZOOM_CHECK(p.err); + } + return false; + } + } + + if (p.pool->owner_PrivatePool) { + // The block is for a HIP graph's PrivatePool. + p.pool->owner_PrivatePool->hipMalloc_count++; + } + + total_allocated_memory += size; + p.block = new Block(p.device(), p.stream(), size, p.pool, (char*)ptr); + for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) { + increase_stat(stats.segment[stat_type], 1); + increase_stat(stats.reserved_bytes[stat_type], size); + }); + if (size >= ZoomAllocatorConfig::max_split_size()) + increase_stat(stats.oversize_segments, 1); + + // p.block came from new, not hipMalloc. It should not be nullptr here. + TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr); + stats.num_device_alloc++; + record_trace( + TraceEntry::SEGMENT_ALLOC, + int64_t(p.block->ptr), + p.block->size, + p.stream(), + p.device(), + ctx); + p.block->context_when_segment_allocated = ctx; + return true; + } + + /** Free one or more oversize blocks to the system allocator. But only enough + * **/ + /** to satisfy the target size **/ + bool release_available_cached_blocks( + const AllocParams& p, + const std::shared_ptr& context) { + if (ZoomAllocatorConfig::max_split_size() == + std::numeric_limits::max()) + return false; + BlockPool& pool = *p.pool; + + // because of std::unique_ptr, block cannot be trivially copied + // Use constructor for search key. + Block key(p.search_key.device, p.search_key.stream, p.search_key.size); + key.size = (key.size < ZoomAllocatorConfig::max_split_size()) + ? 
ZoomAllocatorConfig::max_split_size() + : key.size; + auto it = pool.blocks.lower_bound(&key); + if (it == pool.blocks.end() || (*it)->stream != p.stream()) { + // No single block is large enough; free multiple oversize blocks, + // starting with the largest + if (it == pool.blocks.begin()) + return false; + size_t totalReleased = 0; + --it; // Back up one item. Now on the largest block for the correct + // stream + while ((totalReleased < key.size) && + ((*it)->size >= ZoomAllocatorConfig::max_split_size()) && + ((*it)->stream == p.stream())) { + auto cur = it; + totalReleased += (*it)->size; + if (it != pool.blocks.begin()) { + --it; + release_block(*cur, context); + } else { + release_block(*cur, context); + break; + } + } + if (totalReleased < key.size) + return false; + } else { + release_block(*it, context); + } + return true; + } + + bool release_cached_blocks(const std::shared_ptr& context) { + // First ensure that all blocks that can't currently be allocated due to + // outstanding events are returned to the pool. + synchronize_and_free_events(context); + + // Free all non-split cached blocks to system allocator + release_blocks(large_blocks, context); + release_blocks(small_blocks, context); + + for (auto it = graph_pools_freeable.begin(); + it != graph_pools_freeable.end();) { + // See notifyCaptureDestroy for the strategy here. + TORCH_INTERNAL_ASSERT(it->second->use_count == 0); + release_blocks(it->second->small_blocks, context); + release_blocks(it->second->large_blocks, context); + if (it->second->hipMalloc_count == 0) { + auto erase_count = graph_pools.erase(it->first); + TORCH_INTERNAL_ASSERT(erase_count == 1); + it = graph_pools_freeable.erase(it); + } else { + ++it; + } + } + + return true; + } + + void release_expandable_segment(Block* block) { + TORCH_INTERNAL_ASSERT( + block->size == block->expandable_segment_->size(), + "block disagrees with segment"); + TORCH_INTERNAL_ASSERT(!block->mapped); + auto it = std::find( + expandable_segments_.begin(), + expandable_segments_.end(), + block->expandable_segment_); + TORCH_INTERNAL_ASSERT(it != expandable_segments_.end()); + expandable_segments_.erase(it); + block->pool->unmapped.erase(block); + delete block->expandable_segment_; + delete block; + } + + void release_block( + Block* block, + const std::shared_ptr& context) { + TORCH_INTERNAL_ASSERT(!block->expandable_segment_); + stats.num_device_free++; + record_trace( + TraceEntry::SEGMENT_FREE, + int64_t(block->ptr), + block->size, + block->stream, + block->device, + context ? context : block->context_when_segment_allocated); + + C10_ZOOM_CHECK(hipFree((void*)block->ptr)); + total_allocated_memory -= block->size; + + auto* pool = block->pool; + if (pool->owner_PrivatePool) { + // The hipFreed block belonged to a HIP graph's PrivatePool. 
+ TORCH_INTERNAL_ASSERT(pool->owner_PrivatePool->hipMalloc_count > 0); + pool->owner_PrivatePool->hipMalloc_count--; + } + + StatTypes stat_types = get_stat_types_for_pool(*pool); + for_each_selected_stat_type(stat_types, [&](size_t stat_type) { + decrease_stat(stats.segment[stat_type], 1); + decrease_stat(stats.reserved_bytes[stat_type], block->size); + }); + + if (block->size >= ZoomAllocatorConfig::max_split_size()) + decrease_stat(stats.oversize_segments, 1); + pool->blocks.erase(block); + delete block; + } + + void unmap_block( + Block* block, + const std::shared_ptr& context) { + auto unmapped = block->expandable_segment_->unmap( + SegmentRange{block->ptr, block->size}); + if (unmapped.size == 0) { + return; + } + block->pool->blocks.erase(block); + + ptrdiff_t before_size = + static_cast(unmapped.ptr) - static_cast(block->ptr); + if (before_size > 0) { + // prev? -> before_free -> block + Block* before_free = new Block( + block->device, block->stream, before_size, block->pool, block->ptr); + before_free->expandable_segment_ = block->expandable_segment_; + before_free->splice(block->prev, block); + block->pool->insert_into_blocks(before_free); + } + + auto after_size = block->size - (before_size + unmapped.size); + if (after_size > 0) { + // block -> after_free -> next? + Block* after_free = new Block( + block->device, + block->stream, + after_size, + block->pool, + static_cast(unmapped.ptr) + unmapped.size); + after_free->expandable_segment_ = block->expandable_segment_; + after_free->splice(block, block->next); + block->pool->insert_into_blocks(after_free); + } + + block->ptr = unmapped.ptr; + block->size = unmapped.size; + block->mapped = false; + + try_merge_blocks(block, block->prev, *block->pool); + try_merge_blocks(block, block->next, *block->pool); + block->pool->unmapped.insert(block); + + // update statistics + total_allocated_memory -= unmapped.size; + StatTypes stat_types = get_stat_types_for_pool(*block->pool); + for_each_selected_stat_type(stat_types, [&](size_t stat_type) { + decrease_stat(stats.reserved_bytes[stat_type], unmapped.size); + }); + + stats.num_device_free++; + record_trace( + TraceEntry::SEGMENT_UNMAP, + int64_t(unmapped.ptr), + unmapped.size, + block->stream, + block->device, + context ? context : block->context_when_segment_allocated); + } + void release_blocks( + BlockPool& pool, + const std::shared_ptr& context) { + std::vector to_unmap; + // Frees all non-split blocks + auto it = pool.blocks.begin(); + while (it != pool.blocks.end()) { + Block* block = *it; + ++it; + if (block->expandable_segment_) { + // unmapping will mutate the free pool + // so just gather what needs to be freed + // to avoid invalidating the iterator + to_unmap.push_back(block); + } else if (!block->prev && !block->next) { + release_block(block, context); + } + } + for (Block* block : to_unmap) { + unmap_block(block, context); + if (!block->prev && !block->next) { + release_expandable_segment(block); + } + } + } + + EventPool::Event create_event_internal(c10::DeviceIndex idx) { + // Leak the event pool to avoid shutdown issues. + static auto* event_pool = new EventPool(); + return event_pool->get(idx); + } + + void synchronize_and_free_events( + const std::shared_ptr& context) { + // Synchronize on outstanding events and then free associated blocks. + stats.num_sync_all_streams++; + + // This function syncs, so capture should not be underway. Might as well + // make sure capture-deferred end of life events get processed too. 
+ TORCH_INTERNAL_ASSERT(captures_underway.empty()); + insert_events_deferred_until_no_capture(); + + for (auto& st : hip_events) { + for (auto& e : st.second) { + EventPool::Event event = std::move(e.first); + Block* block = e.second; + + C10_ZOOM_CHECK(hipEventSynchronize(*event)); + + block->event_count--; + if (block->event_count == 0) { + free_block(block, context); + } + } + } + + hip_events.clear(); + } + + void insert_events(Block* block) { + c10::DeviceIndex prev_device = 0; + C10_ZOOM_CHECK(c10::zoom::GetDevice(&prev_device)); + + stream_set streams(std::move(block->stream_uses)); + AT_ASSERT(block->stream_uses.empty()); + for (auto& stream : streams) { + C10_ZOOM_CHECK(c10::zoom::SetDevice(stream.device_index())); + + EventPool::Event event = create_event_internal(stream.device_index()); + C10_ZOOM_CHECK(hipEventRecord(*event, stream.stream())); + + block->event_count++; + hip_events[stream].emplace_back(std::move(event), block); + } + + C10_ZOOM_CHECK(c10::zoom::MaybeSetDevice(prev_device)); + } + + void insert_events_deferred_until_no_capture() { + if (C10_UNLIKELY(!needs_events_deferred_until_no_capture.empty())) { + for (auto* block : needs_events_deferred_until_no_capture) { + TORCH_INTERNAL_ASSERT(!block->stream_uses.empty()); + insert_events(block); + } + needs_events_deferred_until_no_capture.clear(); + } + } + + void process_events(const std::shared_ptr& context) { + insert_events_deferred_until_no_capture(); + + // Process outstanding hipEvents. Events that are completed are + // removed from the queue, and the 'event_count' for the + // corresponding allocation is decremented. We maintain a separate + // list of events per stream to avoid head-of-line delays if one + // or more streams has long-running operations. + + // Iterate over different streams. + for (auto it = hip_events.begin(); it != hip_events.end();) { + // Iterate over this stream's (event, block) pairs. + while (!it->second.empty()) { + auto& e = it->second.front(); + EventPool::Event event = std::move(e.first); + Block* block = e.second; + + hipError_t err = C10_ZOOM_ERROR_HANDLED(hipEventQuery(*event)); + if (err == hipErrorNotReady) { + // ignore and clear the error if not ready + (void)hipGetLastError(); + // Return the ownership of the Event (unique ptr) + e.first = std::move(event); + break; + } else if (err != hipSuccess) { + C10_ZOOM_CHECK(err); + } + + block->event_count--; + if (block->event_count == 0) { + free_block(block, context); + } + it->second.pop_front(); + } + + if (it->second.empty()) { + it = hip_events.erase(it); + } else { + it++; + } + } + } + + // Iterates over sizes of all memory blocks for given device in given pool + void cache_info_aux(const BlockPool& pool, size_t* largest) { + for (const auto& block : pool.blocks) { + const auto blocksize = block->size; + if (blocksize > *largest) { + *largest = blocksize; + } + } + } + + void record_trace( + TraceEntry::Action action, + size_t addr, + size_t size, + hipStream_t stream, + c10::DeviceIndex device, + std::shared_ptr context) { + if (!record_history && trace_trackers_.empty()) + return; + + auto te = TraceEntry( + action, + device, + addr, + size, + stream, + getApproximateTime(), + record_context_ >= RecordContext::ALLOC ? 
std::move(context) : nullptr); + + // Callbacks should not include any Pytorch call + for (const auto& cb : trace_trackers_) { + cb(te); + } + + if (record_history) { + if (alloc_trace->size() < alloc_trace_max_entries_) { + alloc_trace->emplace_back(te); + } else { + (*alloc_trace)[alloc_trace_next++] = te; + if (alloc_trace_next == alloc_trace_max_entries_) { + alloc_trace_next = 0; + } + } + } + } +}; + +// Returns whether to force all allocations to bypass the caching allocator and +// go straight to hipMalloc. This setting is useful when debugging GPU memory +// errors, since the caching allocator foils cuda-memcheck. +bool forceUncachedAllocator() { + static bool force_uncached = + getenv("PYTORCH_NO_ZOOM_MEMORY_CACHING") != nullptr; + return force_uncached; +} + +static void uncached_delete(void* ptr) { + if (TORCH_SDT_IS_ENABLED(free)) { + TORCH_SDT_WITH_SEMAPHORE(free, ptr); + } + + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_memory_deallocation( + c10::DeviceType::PrivateUse1, reinterpret_cast(ptr)); + } + C10_ZOOM_CHECK(hipFree(ptr)); +} + +void local_raw_delete(void* ptr); + +class NativeCachingAllocator : public ZoomAllocator { + private: + // Shard allocation region to have independent mutexes to reduce contention. + static constexpr size_t kNumMutexShard = 67; + + // TODO: use std::hardware_destructive_interference_size once available + struct alignas(64) AlignedMutex { + std::mutex m; + }; + + std::array mutex; + + // allocated blocks by device pointer + std::array, kNumMutexShard> + allocated_blocks; + + static size_t get_mutex_shard_id(void* ptr) { + return twang_mix64((size_t)ptr) % kNumMutexShard; + } + + void add_allocated_block(Block* block) { + // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) + const auto mutex_shard_id = get_mutex_shard_id(block->ptr); + std::lock_guard lock(mutex[mutex_shard_id].m); + allocated_blocks[mutex_shard_id][block->ptr] = block; + } + + c10::ApproximateClockToUnixTimeConverter clock_converter; + + public: + std::vector> device_allocator; + + Block* get_allocated_block(void* ptr, bool remove = false) { + const auto mutex_shard_id = get_mutex_shard_id(ptr); + std::lock_guard lock(mutex[mutex_shard_id].m); + auto it = allocated_blocks[mutex_shard_id].find(ptr); + if (it == allocated_blocks[mutex_shard_id].end()) { + return nullptr; + } + Block* block = it->second; + if (remove) { + allocated_blocks[mutex_shard_id].erase(it); + } + return block; + } + + void init(int device_count) override { + const auto size = static_cast(device_allocator.size()); + if (size < device_count) { + device_allocator.resize(device_count); + for (const auto i : c10::irange(size, device_count)) { + device_allocator[i] = std::make_unique(); + } + } + } + + bool initialized() override { + return !device_allocator.empty(); + } + + /** allocates a block which is safe to use from the provided stream */ + void malloc( + void** devPtr, + c10::DeviceIndex device, + size_t size, + hipStream_t stream) { + TORCH_INTERNAL_ASSERT( + 0 <= device && static_cast(device) < device_allocator.size(), + "Allocator not initialized for device ", + device, + ": did you call init?"); + Block* block = device_allocator[device]->malloc(device, size, stream); + add_allocated_block(block); + *devPtr = (void*)block->ptr; + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_memory_allocation( + c10::DeviceType::PrivateUse1, 
reinterpret_cast(*devPtr)); + } + } + + void free(void* ptr) { + if (!ptr) { + return; + } + Block* block = get_allocated_block(ptr, true /* remove */); + if (!block) { + TORCH_CHECK(false, "invalid device pointer: ", ptr); + } + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_memory_deallocation( + c10::DeviceType::PrivateUse1, reinterpret_cast(block->ptr)); + } + device_allocator[block->device]->free(block); + } + + void setMemoryFraction(double fraction, c10::DeviceIndex device) override { + TORCH_INTERNAL_ASSERT( + 0 <= device && static_cast(device) < device_allocator.size(), + "Allocator not initialized for device ", + device, + ": did you call init?"); + TORCH_INTERNAL_ASSERT( + 0 <= fraction && fraction <= 1, + "invalid fraction:", + fraction, + ". Please set within (0, 1)."); + C10_ZOOM_CHECK(c10::zoom::SetDevice(device)); + device_allocator[device]->setMemoryFraction(fraction); + } + + void recordHistory( + bool enabled, + CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + RecordContext when) override { + for (auto& allocator : device_allocator) { + allocator->recordHistory( + enabled, context_recorder, alloc_trace_max_entries, when); + } + } + + bool isHistoryEnabled() override { + c10::DeviceIndex device = 0; + C10_ZOOM_CHECK(c10::zoom::GetDevice(&device)); + return device_allocator[device]->isHistoryEnabled(); + } + + bool checkPoolLiveAllocations( + c10::DeviceIndex device, + MempoolId_t mempool_id, + const std::unordered_set& expected_live_allocations) override { + return device_allocator[device]->checkPoolLiveAllocations( + mempool_id, expected_live_allocations); + } + + void attachOutOfMemoryObserver(OutOfMemoryObserver observer) override { + for (auto& allocator : device_allocator) { + allocator->attachOutOfMemoryObserver(observer); + } + } + + void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) override { + for (auto& allocator : device_allocator) { + allocator->attachAllocatorTraceTracker(tracker); + } + } + + void emptyCache() override { + for (auto& da : device_allocator) + da->emptyCache(); + } + + void* getBaseAllocation(void* ptr, size_t* outSize) override { + Block* block = get_allocated_block(ptr); + if (!block) { + TORCH_CHECK(false, "invalid device pointer: ", ptr); + } + return device_allocator[block->device]->getBaseAllocation(block, outSize); + } + + void recordStream(const DataPtr& ptr, zoom::ZoomStream stream) override { + // Empty tensor's storage().data() might be a null ptr. As there is no + // blocks associated with those tensors, it is fine to do nothing here. + if (!ptr.get()) { + return; + } + + // If a tensor is not allocated by this instance, simply skip + // This usually happens when HIP tensors are shared across processes, + // we have implemented reference counting based sharing mechanism to + // guarantee tensors won't be accidentally freed by one process while + // they are still being used in another + if (ptr.get_deleter() != &local_raw_delete) + return; + + Block* block = get_allocated_block(ptr.get()); + // block must not be null reaching here + TORCH_INTERNAL_ASSERT(block != nullptr, "No allocated block can be found"); + device_allocator[block->device]->recordStream(block, stream); + } + + SnapshotInfo snapshot() override { + // Set-up converter to convert timestamps from tsc to microseconds. 
+ auto tsc_to_ns = clock_converter.makeConverter(); + auto tsc_to_us = [=](approx_time_t t_approx) { + return tsc_to_ns(t_approx) / 1000; + }; + + SnapshotInfo result; + for (auto& da : device_allocator) { + result.device_traces.emplace_back(da->trace(tsc_to_us)); + auto snap = da->snapshot(); + result.segments.insert(result.segments.end(), snap.begin(), snap.end()); + } + + auto& md = result.config_metadata; + md.garbage_collection_threshold = + ZoomAllocatorConfig::garbage_collection_threshold(); + md.max_split_size = ZoomAllocatorConfig::max_split_size(); + md.pinned_num_register_threads = + ZoomAllocatorConfig::pinned_num_register_threads(); + md.expandable_segments = ZoomAllocatorConfig::expandable_segments(); + md.release_lock_on_malloc = + ZoomAllocatorConfig::release_lock_on_hipMalloc(); + md.pinned_use_host_register = + ZoomAllocatorConfig::pinned_use_zoom_host_register(); + md.last_allocator_settings = ZoomAllocatorConfig::last_allocator_settings(); + md.roundup_power2_divisions = + ZoomAllocatorConfig::roundup_power2_divisions(); + + return result; + } + + std::shared_ptr getCheckpointState( + c10::DeviceIndex device, + MempoolId_t id) override { + return device_allocator[device]->getCheckpointState(id); + } + + /** + * @brief Checkpoint the private pool state identified in `as` to its prior + * state + * + * @param device - device of the pool to manipulate + * @param as - allocator state + * @param stale_live_storages - storages of tensors which are currently + * allocated but which will be not be allocated after the checkpoint is set. + * For these storages we will remove their deleter function. + * @return CheckpointDelta - Freed Pointers and DataPtrs that contain deleter + * functions for all allocated blocks in the new checkpoint state. + */ + CheckpointDelta setCheckpointPoolState( + c10::DeviceIndex device, + std::shared_ptr as) override { + std::shared_ptr pps = + std::dynamic_pointer_cast(as); + + TORCH_CHECK(pps, "Expected PrivatePoolState"); + + auto rr = device_allocator[device]->setCheckpointPoolState(*pps); + + CheckpointDelta cpd; + for (void* ptr : rr.allocations_freed) { + get_allocated_block(ptr, /*remove*/ true); + cpd.ptrs_freed.push_back(ptr); + } + for (Block* block : rr.allocations_created) { + add_allocated_block(block); + cpd.dataptrs_allocd.emplace_back( + block->ptr, + block->ptr, + &local_raw_delete, + Device(DeviceType::PrivateUse1, device)); + } + + return cpd; + } + + DataPtr allocate(size_t size) override { + constexpr size_t one_exa_bytes = 1152921504606846976ULL; + TORCH_CHECK_WITH( + OutOfMemoryError, + size < one_exa_bytes, + "HIP out of memory. Tried to allocate more than 1EB memory."); + c10::DeviceIndex device = 0; + C10_ZOOM_CHECK(c10::zoom::GetDevice(&device)); + void* devPtr = nullptr; + void (*deleteFunc)(void*) = &local_raw_delete; + ZoomStream stream = zoom::getCurrentZoomStream(device); + + if (forceUncachedAllocator()) { + deleteFunc = &uncached_delete; + + // Deliberately don't use hipMallocMaybeCapturing here, to force an error + // if someone tries to use forceUncachedAllocator while capturing. 
+ C10_ZOOM_CHECK(hipMalloc(&devPtr, size)); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_memory_allocation( + c10::DeviceType::PrivateUse1, reinterpret_cast(devPtr)); + } + } else { + if (size != 0) { + this->malloc(&devPtr, device, size, stream); + } + } + + if (size && TORCH_SDT_IS_ENABLED(malloc)) { + TORCH_SDT_WITH_SEMAPHORE(malloc, devPtr, device, size, stream.id()); + } + + return {devPtr, devPtr, deleteFunc, Device(DeviceType::PrivateUse1, device)}; + } + DeleterFnPtr raw_deleter() const override { + if (forceUncachedAllocator()) { + return &uncached_delete; + } else { + return &local_raw_delete; + } + } + void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) override { + device_allocator[device]->cacheInfo(largestBlock); + } + void assertValidDevice(c10::DeviceIndex device) { + const auto device_num = device_allocator.size(); + TORCH_CHECK( + 0 <= device && device < static_cast(device_num), + "Invalid device argument ", + device, + ": did you call init?"); + } + + DeviceStats getDeviceStats(c10::DeviceIndex device) override { + assertValidDevice(device); + return device_allocator[device]->getStats(); + } + + void resetAccumulatedStats(c10::DeviceIndex device) override { + assertValidDevice(device); + device_allocator[device]->resetAccumulatedStats(); + } + + void resetPeakStats(c10::DeviceIndex device) override { + assertValidDevice(device); + device_allocator[device]->resetPeakStats(); + } + // HIPGraph interactions + void beginAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + std::function filter) override { + assertValidDevice(device); + device_allocator[device]->beginAllocateToPool( + std::move(mempool_id), std::move(filter)); + } + + void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id) + override { + assertValidDevice(device); + device_allocator[device]->endAllocateToPool(mempool_id); + } + + void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) override { + assertValidDevice(device); + device_allocator[device]->releasePool(std::move(mempool_id)); + } + + void* raw_alloc(size_t nbytes) override { + if (nbytes == 0) { + return nullptr; + } + c10::DeviceIndex device = 0; + C10_ZOOM_CHECK(c10::zoom::GetDevice(&device)); + void* r = nullptr; + malloc(&r, device, nbytes, zoom::getCurrentZoomStream(device)); + return r; + } + + void* raw_alloc_with_stream(size_t nbytes, hipStream_t stream) override { + if (nbytes == 0) { + return nullptr; + } + c10::DeviceIndex device = 0; + C10_ZOOM_CHECK(c10::zoom::GetDevice(&device)); + void* r = nullptr; + malloc(&r, device, nbytes, stream); + return r; + } + + void enablePeerAccess(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) + override { + c10::zoom::ZoomGuard device_guard(dev); + hipError_t err = hipDeviceEnablePeerAccess(dev_to_access, 0); + if (err == hipErrorPeerAccessAlreadyEnabled) { + // ignore and clear the error if access was already enabled + (void)hipGetLastError(); + } else { + C10_ZOOM_CHECK(err); + } + device_allocator[dev_to_access]->addPeerAccess(dev); + } + + hipError_t memcpyAsync( + void* dst, + int dstDevice, + const void* src, + int srcDevice, + size_t count, + hipStream_t stream, + bool p2p_enabled) override { + if (p2p_enabled || // memcpy ok because memory is mapped in both devices + srcDevice == dstDevice || // memcpy ok on a single device + // memcpy ok because both dst and src must have come from hipMalloc + 
(!device_allocator[dstDevice]->hasAllocatedExpandableSegments() && + !device_allocator[srcDevice]->hasAllocatedExpandableSegments())) { + return hipMemcpyAsync(dst, src, count, hipMemcpyDeviceToDevice, stream); + } + // when p2p is not enabled, only hipMemcpyPeerAsync correctly handles + // memory not allocated via hipMalloc + return hipMemcpyPeerAsync(dst, dstDevice, src, srcDevice, count, stream); + } + + void raw_delete(void* ptr) override { + this->free(ptr); + } + + // In HIP IPC, sender sends a tensor to receiver, getIpcDevPtr + // is called by the receiving process to map the HIP memory from the sending + // process into its own address space. + // + // HIP IPC only allows sharing a big memory block associated with a + // hipIpcMemHandle_t and it can be opened only **once** per context per + // process. There can be multiple types of storage in the same IPC mem block, + // so we must cache the device ptr to construct typed storage as it comes. + // + // ipcMemHandle_to_devptr maps a hipIpcMemHandle_t to a device pointer in the + // process that can be used to access the memory block in the sender process. + // It only saves a weak_ptr of the device pointer in the map, the shared_ptr + // will be used to reconstruct all storages in this hipMalloc allocation. And + // it will deleted in cudaIpcCloseMemHandle when its reference count is 0. + // + std::mutex IpcMutex; + ska::flat_hash_map> ipcMemHandle_to_devptr; + std::shared_ptr getIpcDevPtr(std::string handle) override { + std::lock_guard lock(IpcMutex); + + auto iter = ipcMemHandle_to_devptr.find(handle); + if (iter != ipcMemHandle_to_devptr.end()) { + auto devptr = iter->second.lock(); + if (devptr) + return devptr; + } + // This ipcMemHandle hasn't been opened, or already expired, open it to + // enable IPC access to that mem block. + void* dev = nullptr; + auto ipc_handle = + reinterpret_cast(handle.c_str()); + C10_ZOOM_CHECK(hipIpcOpenMemHandle( + &dev, *ipc_handle, hipIpcMemLazyEnablePeerAccess)); + // devPtr has to be deleted in same device when created. + c10::DeviceIndex curr_device = 0; + C10_ZOOM_CHECK(c10::zoom::GetDevice(&curr_device)); + auto sp = + std::shared_ptr(dev, [handle, curr_device, this](void* ptr) { + zoom::ZoomGuard device_guard(curr_device); + std::lock_guard deleter_lock(IpcMutex); + C10_ZOOM_CHECK(hipIpcCloseMemHandle(ptr)); + ipcMemHandle_to_devptr.erase(handle); + }); + std::weak_ptr wp = sp; + // To eliminate an additional search, we can use insert(). + // It doesn't overwrite when key already exists(ptr expired). + // But in the deleter for sp we erased the entry, + // this should be safe to do now. 
+ ipcMemHandle_to_devptr.insert(iter, {handle, wp}); + + return sp; + } + std::string name() override { + return "native"; + } + void copy_data(void* dest, const void* src, std::size_t count) const final { + C10_ZOOM_CHECK( + hipMemcpy(dest, src, count, hipMemcpyKind::hipMemcpyDeviceToDevice)); + } +}; + +NativeCachingAllocator allocator; + +void local_raw_delete(void* ptr) { + if (TORCH_SDT_IS_ENABLED(free)) { + TORCH_SDT_WITH_SEMAPHORE(free, ptr); + } + + allocator.free(ptr); +} + +} // namespace Native +// Size pretty-printer +std::string format_size(uint64_t size) { + std::ostringstream os; + os.precision(2); + os << std::fixed; + if (size <= 1024) { + os << size << " bytes"; + } else if (size <= 1048576) { + os << (static_cast(size) / 1024.0); + os << " KiB"; + } else if (size <= 1073741824ULL) { + os << static_cast(size) / 1048576.0; + os << " MiB"; + } else { + os << static_cast(size) / 1073741824.0; + os << " GiB"; + } + return os.str(); +} + +namespace ZoomMallocAsync { +// If this is put in its own header file, it gets incorrectly renamed in HIPify. +ZoomAllocator* allocator(); + +} // namespace ZoomMallocAsync + +struct BackendStaticInitializer { + // Parses env for backend at load time, duplicating some logic from + // ZoomAllocatorConfig. ZoomAllocatorConfig double-checks it later (at + // runtime). Defers verbose exceptions and error checks, including Cuda + // version checks, to ZoomAllocatorConfig's runtime doublecheck. If this + // works, maybe we should move all of ZoomAllocatorConfig here? + ZoomAllocator* parseEnvForBackend() { + const char* val = getenv("PYTORCH_ZOOM_ALLOC_CONF"); + if (val != nullptr) { + const std::string config(val); + + std::regex exp("[\\s,]+"); + std::sregex_token_iterator it(config.begin(), config.end(), exp, -1); + std::sregex_token_iterator end; + std::vector options(it, end); + + for (auto option : options) { + std::regex exp2("[:]+"); + std::sregex_token_iterator it2(option.begin(), option.end(), exp2, -1); + std::sregex_token_iterator end2; + std::vector kv(it2, end2); + if (kv.size() >= 2) { + if (kv[0] == "backend") { + if (kv[1] == "hipMallocAsync") + return ZoomMallocAsync::allocator(); + if (kv[1] == "native") + return &Native::allocator; + } + } + } + } + return &Native::allocator; + } + + BackendStaticInitializer() { + auto r = parseEnvForBackend(); + allocator.store(r); + } +}; + +std::atomic allocator; +BackendStaticInitializer backend_static_initializer; + +Allocator* getZoomAllocator() { + return c10::zoom::ZoomCachingAllocator::get(); +} + +REGISTER_PU1_ALLOCATOR(getZoomAllocator); +// namespace { +// static PrivateUse1AllocatorRegisterer g_allocator_d(getZoomAllocator); +// } + +} // namespace zoom::ZoomCachingAllocator + +} // namespace c10 \ No newline at end of file diff --git a/c10/zoom/ZoomCachingAllocator.h b/c10/zoom/ZoomCachingAllocator.h new file mode 100644 index 00000000000000..5311a04726d9fb --- /dev/null +++ b/c10/zoom/ZoomCachingAllocator.h @@ -0,0 +1,480 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +// Caching allocator will execute every registered callback if it unable to find +// block inside of already allocated area. +class FreeMemoryCallback { + public: + virtual ~FreeMemoryCallback() = default; + virtual bool Execute() = 0; +}; + +C10_DECLARE_REGISTRY(FreeZoomMemoryCallbacksRegistry, FreeMemoryCallback); +#define REGISTER_FREE_MEMORY_CALLBACK(name, ...) 
\ + C10_REGISTER_CLASS(FreeZoomMemoryCallbacksRegistry, name, __VA_ARGS__); +} // namespace c10 + // +// TODO: Turn this into an honest to goodness class. I briefly attempted to do +// this, but it was a bit irritating to figure out how to also correctly +// apply pimpl pattern so I didn't have to leak any internal implementation +// details in the header (ZoomCachingAllocator could be made a pimpl, but +// you also need to appropriately define a class which is a subclass +// of Allocator. Not impossible, but required a bit more surgery than +// I wanted to do at the time.) +// +// Why is this using a namespace rather than old-style THCCachingAllocator_ +// prefix? Mostly because it made the HIPify rules easier to write; _ is +// not counted as a word boundary, so you would otherwise have to list each +// of these functions. + +namespace c10::zoom::ZoomCachingAllocator { + +extern const size_t kLargeBuffer; + +struct Stat { + int64_t current = 0; + int64_t peak = 0; + int64_t allocated = 0; + int64_t freed = 0; +}; + +enum struct StatType : uint64_t { + AGGREGATE = 0, + SMALL_POOL = 1, + LARGE_POOL = 2, + NUM_TYPES = 3 // remember to update this whenever a new stat type is added +}; + +typedef std::array(StatType::NUM_TYPES)> StatArray; + +// Struct containing memory allocator summary statistics for a device. +struct DeviceStats { + // COUNT: allocations requested by client code + StatArray allocation; + // COUNT: number of allocated segments from cudaMalloc(). + StatArray segment; + // COUNT: number of active memory blocks (allocated or used by stream) + StatArray active; + // COUNT: number of inactive, split memory blocks (unallocated but can't be + // released via cudaFree) + StatArray inactive_split; + + // SUM: bytes allocated by this memory alocator + StatArray allocated_bytes; + // SUM: bytes reserved by this memory allocator (both free and used) + StatArray reserved_bytes; + // SUM: bytes within active memory blocks + StatArray active_bytes; + // SUM: bytes within inactive, split memory blocks + StatArray inactive_split_bytes; + // SUM: bytes requested by client code + StatArray requested_bytes; + + // COUNT: total number of failed calls to CUDA malloc necessitating cache + // flushes. + int64_t num_alloc_retries = 0; + + // COUNT: total number of OOMs (i.e. failed calls to CUDA after cache flush) + int64_t num_ooms = 0; + + // COUNT: total number of oversize blocks allocated from pool + Stat oversize_allocations; + + // COUNT: total number of oversize blocks requiring malloc + Stat oversize_segments; + + // COUNT: total number of synchronize_and_free_events() calls + int64_t num_sync_all_streams = 0; + + // COUNT: total number of CUDA allocation calls. This includes both cuMemMap + // and cudaMalloc. + int64_t num_device_alloc = 0; + + // COUNT: total number of CUDA free calls. This includes both cuMemUnmap + // and cudaFree. + int64_t num_device_free = 0; + + // SIZE: maximum block size that is allowed to be split. + int64_t max_split_size = 0; +}; + +typedef std::shared_ptr (*CreateContextFn)(); + +// Struct containing info of an allocation block (i.e. a fractional part of a +// cudaMalloc).. +struct BlockInfo { + size_t size = 0; + size_t requested_size = 0; + int32_t gc_counter = 0; + bool allocated = false; + bool active = false; + std::shared_ptr + context_when_allocated; // per-watcher context +}; + +// Struct containing info of a memory segment (i.e. one contiguous cudaMalloc). 
+struct SegmentInfo { + c10::DeviceIndex device = 0; + size_t address = 0; + size_t total_size = 0; + size_t requested_size = 0; // unrounded, actually requested size + size_t allocated_size = 0; + size_t active_size = 0; + hipStream_t stream = nullptr; + bool is_large = false; + bool is_expandable = false; + MempoolId_t owner_private_pool_id = {0, 0}; + std::vector blocks; + std::shared_ptr context_when_allocated; +}; + +struct AllocatorState { + virtual ~AllocatorState() = default; +}; + +union trace_time_ { + time_t t_; + approx_time_t approx_t_; +}; + +struct TraceEntry { + enum Action { + ALLOC, // API made to the caching allocator for new memory + FREE_REQUESTED, // API call made to the caching allocator to free memory + FREE_COMPLETED, // The allocator might have to delay a free because + // it is still in use on another stream via record_stream + // This event is generated when a free actually completes. + SEGMENT_ALLOC, // a call to cudaMalloc to get more memory from the OS + SEGMENT_FREE, // a call to cudaFree to return memory to the OS (e.g. to + // defragment or empty_caches) + SEGMENT_MAP, // a call to cuMemMap (used with expandable_segments) + SEGMENT_UNMAP, // unmap part of a segment (used with expandable segments) + SNAPSHOT, // a call to snapshot, used to correlate memory snapshots to trace + // events + OOM // the allocator threw an OutOfMemoryError (addr_ is the amount of free + // bytes reported by cuda) + }; + TraceEntry( + Action action, + c10::DeviceIndex device, + size_t addr, + size_t size, + hipStream_t stream, + approx_time_t time, + std::shared_ptr context = nullptr) + : action_(action), + device_(device), + addr_(addr), + context_(std::move(context)), + stream_(stream), + size_(size) { + time_.approx_t_ = time; + } + Action action_; + c10::DeviceIndex device_; + size_t addr_; // for OOM, this is the amount of free bytes reported by cuda + std::shared_ptr context_; + hipStream_t stream_{}; + size_t size_; + trace_time_ time_{}; +}; + +struct AllocatorConfigInfo { + double garbage_collection_threshold; + size_t max_split_size; + size_t pinned_num_register_threads; + bool expandable_segments; + bool release_lock_on_malloc; + bool pinned_use_host_register; + std::string last_allocator_settings; + std::vector roundup_power2_divisions; +}; + +struct SnapshotInfo { + std::vector segments; + std::vector> device_traces; + AllocatorConfigInfo config_metadata; +}; + +// returns the pointers freed in the pool +// and the pointers allocated. 
Note: a pointer +// may appear in both freed and allocated +struct CheckpointDelta { + std::vector ptrs_freed; + std::vector dataptrs_allocd; +}; + +enum struct RecordContext { + NEVER = 0, + STATE = 1, // only keep stacks for active allocations + ALLOC = 2, // additionally keep stacks for allocations in the trace history + ALL = 3, // additionally record stacks for when something is freed +}; + +// Size pretty-printer +std::string format_size(uint64_t size); + +using OutOfMemoryObserver = std::function; + +using AllocatorTraceTracker = std::function; + +class ZoomAllocator : public Allocator { + public: + virtual void* raw_alloc(size_t nbytes) = 0; + virtual void* raw_alloc_with_stream(size_t nbytes, hipStream_t stream) = 0; + virtual void raw_delete(void* ptr) = 0; + virtual void init(int device_count) = 0; + virtual bool initialized() = 0; + virtual void setMemoryFraction(double fraction, c10::DeviceIndex device) = 0; + virtual void emptyCache() = 0; + virtual void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) = 0; + virtual void* getBaseAllocation(void* ptr, size_t* size) = 0; + virtual void recordStream(const DataPtr&, ZoomStream stream) = 0; + virtual DeviceStats getDeviceStats(c10::DeviceIndex device) = 0; + virtual void resetAccumulatedStats(c10::DeviceIndex device) = 0; + virtual void resetPeakStats(c10::DeviceIndex device) = 0; + virtual SnapshotInfo snapshot() = 0; + virtual void beginAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + std::function filter) = 0; + virtual void endAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id) = 0; + virtual void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) = 0; + // returns true if the allocated blocks are equal to expected live allocations + virtual bool checkPoolLiveAllocations( + c10::DeviceIndex device, + MempoolId_t mempool_id, + const std::unordered_set& expected_live_allocations) { + TORCH_CHECK( + false, + name(), + " does not yet support checkPoolLiveAllocations. " + "If you need it, please file an issue describing your use case."); + } + virtual std::shared_ptr getIpcDevPtr(std::string handle) = 0; + virtual bool isHistoryEnabled() { + TORCH_CHECK( + false, + name(), + " does not yet support recordHistory. " + "If you need it, please file an issue describing your use case."); + } + virtual void recordHistory( + bool enabled, + CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + RecordContext when) = 0; + virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0; + + // Attached AllocatorTraceTracker callbacks will be called while the + // per-device allocator lock is held. Any additional locks taken from within + // the callback must be proven to always have the lock order that never + // triggers a deadlock. In particular, Python's GIL may be held when + // calling the allocator so it is unsafe to try to acquire the GIL in this + // callback. + virtual void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) = 0; + + virtual void enablePeerAccess( + c10::DeviceIndex dev, + c10::DeviceIndex dev_to_access) = 0; + + // memory not allocated from cudaMalloc cannot be copied + // across devices using cudaMemcpyAsync if peer to peer access is disabled. 
+ // instead it requires cudaMemcpyAsyncPeer + // with P2P Enabled, all combinations work + // with P2P Disabled: + // cudaMalloc cudaMallocAsync/cuMemMap + // cudaMemcpyAsyncPeer works works + // cudaMemcpyAsync works error + + // This function performs chooses to use the Peer version of + // memcpy if required based on where the allocated put dst/src. + virtual hipError_t memcpyAsync( + void* dst, + int dstDevice, + const void* src, + int srcDevice, + size_t count, + hipStream_t stream, + bool p2p_enabled) = 0; + virtual std::shared_ptr getCheckpointState( + c10::DeviceIndex device, + MempoolId_t id) = 0; + virtual CheckpointDelta setCheckpointPoolState( + c10::DeviceIndex device, + std::shared_ptr pps) = 0; + virtual std::string name() = 0; +}; + +// Allocator object, statically initialized +// See BackendInitializer in ZoomCachingAllocator.cpp. +// Atomic loads on x86 are just normal loads, +// (atomic stores are different), so reading this value +// is no different than loading a pointer. +extern std::atomic allocator; + +inline ZoomAllocator* get() { + return allocator.load(); +} + +// Called directly by clients. +inline void* raw_alloc(size_t nbytes) { + return get()->raw_alloc(nbytes); +} + +inline void* raw_alloc_with_stream(size_t nbytes, hipStream_t stream) { + return get()->raw_alloc_with_stream(nbytes, stream); +} + +inline void raw_delete(void* ptr) { + return get()->raw_delete(ptr); +} + +inline void init(int device_count) { + return get()->init(device_count); +} + +inline void setMemoryFraction(double fraction, c10::DeviceIndex device) { + return get()->setMemoryFraction(fraction, device); +} + +inline void emptyCache() { + return get()->emptyCache(); +} + +inline void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) { + return get()->cacheInfo(device, largestBlock); +} + +inline void* getBaseAllocation(void* ptr, size_t* size) { + return get()->getBaseAllocation(ptr, size); +} + +inline void recordStream(const DataPtr& dataPtr, ZoomStream stream) { + return get()->recordStream(dataPtr, stream); +} + +inline DeviceStats getDeviceStats(c10::DeviceIndex device) { + return get()->getDeviceStats(device); +} + +inline void resetAccumulatedStats(c10::DeviceIndex device) { + return get()->resetAccumulatedStats(device); +} + +inline void resetPeakStats(c10::DeviceIndex device) { + return get()->resetPeakStats(device); +} + +inline SnapshotInfo snapshot() { + return get()->snapshot(); +} + +inline std::shared_ptr getCheckpointState( + c10::DeviceIndex device, + MempoolId_t id) { + return get()->getCheckpointState(device, id); +} + +inline CheckpointDelta setCheckpointPoolState( + c10::DeviceIndex device, + std::shared_ptr pps) { + return get()->setCheckpointPoolState(device, std::move(pps)); +} + +// CUDAGraph interactions +inline void beginAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + std::function filter) { + get()->beginAllocateToPool(device, mempool_id, std::move(filter)); +} + +inline void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id) { + get()->endAllocateToPool(device, mempool_id); +} + +inline void recordHistory( + bool enabled, + CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + RecordContext when) { + return get()->recordHistory( + enabled, context_recorder, alloc_trace_max_entries, when); +} + +inline bool isHistoryEnabled() { + return get()->isHistoryEnabled(); +} + +inline bool checkPoolLiveAllocations( + c10::DeviceIndex device, + MempoolId_t mempool_id, + const std::unordered_set& 
expected_live_allocations) { + return get()->checkPoolLiveAllocations( + device, mempool_id, expected_live_allocations); +} + +inline void attachOutOfMemoryObserver(OutOfMemoryObserver observer) { + return get()->attachOutOfMemoryObserver(std::move(observer)); +} + +inline void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) { + return get()->attachAllocatorTraceTracker(std::move(tracker)); +} + +inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) { + return get()->releasePool(device, mempool_id); +} +// Not part of CUDA_ALLOCATOR_BACKEND_INTERFACE +inline std::shared_ptr getIpcDevPtr(std::string handle) { + return get()->getIpcDevPtr(std::move(handle)); +} + +inline std::string name() { + return get()->name(); +} + +inline hipError_t memcpyAsync( + void* dst, + int dstDevice, + const void* src, + int srcDevice, + size_t count, + hipStream_t stream, + bool p2p_enabled) { + return get()->memcpyAsync( + dst, dstDevice, src, srcDevice, count, stream, p2p_enabled); +} + +inline void enablePeerAccess( + c10::DeviceIndex dev, + c10::DeviceIndex dev_to_access) { + return get()->enablePeerAccess(dev, dev_to_access); +} + +} // namespace c10::zoom::ZoomCachingAllocator \ No newline at end of file diff --git a/c10/zoom/ZoomDeviceAssertionHost.cpp b/c10/zoom/ZoomDeviceAssertionHost.cpp new file mode 100644 index 00000000000000..a3b2207df73a4b --- /dev/null +++ b/c10/zoom/ZoomDeviceAssertionHost.cpp @@ -0,0 +1,344 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#ifdef TORCH_USE_ZOOM_DSA +#include +#include +#endif + +#define C10_ZOOM_CHECK_WO_DSA(EXPR) \ + do { \ + const hipError_t __err = EXPR; \ + c10::zoom::c10_zoom_check_implementation( \ + static_cast(__err), \ + __FILE__, \ + __func__, /* Line number data type not well-defined between \ + compilers, so we perform an explicit cast */ \ + static_cast(__LINE__), \ + false); \ + } while (0) + +namespace c10::zoom { + +namespace { + +#ifdef TORCH_USE_ZOOM_DSA +/// Get current device id +/// We need our own implementation of this function to prevent +/// an infinite initialization loop for ZoomKernelLaunchRegistry +int dsa_get_device_id() { + c10::DeviceIndex device = -1; + C10_ZOOM_CHECK_WO_DSA(c10::zoom::GetDevice(&device)); + return device; +} + +/// Get a device's compute capability - note that this dangerously assumes +/// that if one CUDA GPU supports device-side assertions they all do. This is +/// probably fine since the latest CUDA GPU that doesn't support UVM is the +/// K80 released 2014-11-17. 
Mixing that GPU with a newer one is likely to be +/// rare enough that the defensive +/// We need our own implementation of this function to prevent +/// an infinite initialization loop for ZoomKernelLaunchRegistry +int dsa_get_device_compute_capability(const int device_num) { + int compute_capability = -1; + C10_ZOOM_CHECK_WO_DSA(hipDeviceGetAttribute( + &compute_capability, hipDevAttrComputeCapabilityMajor, device_num)); + return compute_capability; +} +#endif + +/// Get the number of HIP devices +/// We need our own implementation of this function to prevent +/// an infinite initialization loop for ZoomKernelLaunchRegistry +int dsa_get_device_count() { + int device_count = -1; + C10_ZOOM_CHECK_WO_DSA(c10::zoom::GetDeviceCount(&device_count)); + return device_count; +} + +bool dsa_check_if_all_devices_support_managed_memory() { +// It looks as though this'll work best on CUDA GPUs with Pascal +// architectures or newer, per +// https://developer.nvidia.com/blog/unified-memory-cuda-beginners/ +#ifdef TORCH_USE_ZOOM_DSA + for (const auto i : c10::irange(dsa_get_device_count())) { + if (dsa_get_device_compute_capability(i) < 6) { + return false; + } + } + return true; +#else + return false; +#endif +} + +bool env_flag_set(const char* env_var_name) { + const char* const env_string = std::getenv(env_var_name); + return (env_string == nullptr) ? false : std::strcmp(env_string, "0"); +} + +/// Deleter for UVM/managed memory pointers +void uvm_deleter(DeviceAssertionsData* uvm_assertions_ptr) { + // Ignore error in destructor + if (uvm_assertions_ptr) { + C10_ZOOM_IGNORE_ERROR(hipFree(uvm_assertions_ptr)); + } +} + +} // namespace + +/// Check that kernels ran correctly by checking the message buffer. BLOCKING. +std::string c10_retrieve_device_side_assertion_info() { +#ifdef TORCH_USE_ZOOM_DSA + const auto& launch_registry = ZoomKernelLaunchRegistry::get_singleton_ref(); + if (!launch_registry.enabled_at_runtime) { + return "Device-side assertion tracking was not enabled by user."; + } else if (!launch_registry.do_all_devices_support_managed_memory) { + return "Device-side assertions disabled because not all devices support managed memory."; + } + + // Hack that saves a lot of challenging sync logic. + // The GPU increments the number of errors it's observed and the CPU can see + // that happening immediately which means we can make it here before the GPU + // is done writing information about those errors to memory. + // A short pause gives it time to finish. Since something's gone wrong, this + // pause shouldn't affect perf. + std::this_thread::sleep_for(std::chrono::seconds(1)); + + // The snapshot causes a brief block. That's okay because this function only + // executes if something's gone wrong such that speed is no longer a priority. + const auto launch_data = launch_registry.snapshot(); + const auto& assertion_data = launch_data.first; + const auto& launch_infos = launch_data.second; + + std::stringstream oss; + + oss << "Looking for device-side assertion failure information...\n"; + + // Loop over each device that could be managed by the process + for (const auto device_num : c10::irange(assertion_data.size())) { + const auto& assertion_data_for_device = assertion_data.at(device_num); + + // Did anything fail? 
+ const auto failures_found = std::min( + assertion_data_for_device.assertion_count, + C10_ZOOM_DSA_ASSERTION_COUNT); + if (failures_found == 0) { + continue; + } + + // Something failed, let's talk about that + oss << failures_found + << " HIP device-side assertion failures were found on GPU #" + << device_num << "!" << std::endl; + if (assertion_data_for_device.assertion_count > + C10_ZOOM_DSA_ASSERTION_COUNT) { + oss << "But at least " << assertion_data_for_device.assertion_count + << " assertion failures occurred on the device" << std::endl; + oss << "Adjust `C10_ZOOM_DSA_ASSERTION_COUNT` if you need more assertion failure info" + << std::endl; + } + + for (const auto i : c10::irange(failures_found)) { + const auto& self = assertion_data_for_device.assertions[i]; + const auto& launch_info = launch_infos[self.caller % launch_infos.size()]; + oss << "Assertion failure " << i << std::endl; + oss << " GPU assertion failure message = " << self.assertion_msg + << std::endl; + oss << " File containing assertion = " << self.filename << ":" + << self.line_number << std::endl; + oss << " Device function containing assertion = " << self.function_name + << std::endl; + oss << " Thread ID that failed assertion = [" << self.thread_id[0] << "," + << self.thread_id[1] << "," << self.thread_id[2] << "]" << std::endl; + oss << " Block ID that failed assertion = [" << self.block_id[0] << "," + << self.block_id[1] << "," << self.block_id[2] << "]" << std::endl; + if (launch_info.generation_number == self.caller) { + oss << " File containing kernel launch = " + << launch_info.launch_filename << ":" << launch_info.launch_linenum + << std::endl; + oss << " Function containing kernel launch = " + << launch_info.launch_function << std::endl; + oss << " Name of kernel launched that led to failure = " + << launch_info.kernel_name << std::endl; + oss << " Device that launched kernel = " << launch_info.device + << std::endl; + oss << " Stream kernel was launched on = " << launch_info.stream + << std::endl; + oss << " Backtrace of kernel launch site = "; + if (launch_registry.gather_launch_stacktrace) { + oss << "Launch stacktracing disabled." << std::endl; + } else { + oss << "\n" << launch_info.launch_stacktrace << std::endl; + } + } else { + oss << " CPU launch site info: Unavailable, the circular queue wrapped around. Increase `ZoomKernelLaunchRegistry::max_size`." 
+ << std::endl; + } + } + } + return oss.str(); +#else + return "Compile with `TORCH_USE_ZOOM_DSA` to enable device-side assertions.\n"; +#endif +} + +ZoomKernelLaunchRegistry::ZoomKernelLaunchRegistry() + : do_all_devices_support_managed_memory( + dsa_check_if_all_devices_support_managed_memory()), + gather_launch_stacktrace(check_env_for_enable_launch_stacktracing()), + enabled_at_runtime(check_env_for_dsa_enabled()) { + for (C10_UNUSED const auto _ : c10::irange(dsa_get_device_count())) { + uvm_assertions.emplace_back(nullptr, uvm_deleter); + } + + kernel_launches.resize(max_kernel_launches); +} + +bool ZoomKernelLaunchRegistry::check_env_for_enable_launch_stacktracing() + const { + return env_flag_set("PYTORCH_ZOOM_DSA_STACKTRACING"); +} + +bool ZoomKernelLaunchRegistry::check_env_for_dsa_enabled() const { + return env_flag_set("PYTORCH_USE_ZOOM_DSA"); +} + +uint32_t ZoomKernelLaunchRegistry::insert( + const char* launch_filename, + const char* launch_function, + const uint32_t launch_linenum, + const char* kernel_name, + const int32_t stream_id) { +#ifdef TORCH_USE_ZOOM_DSA + if (!enabled_at_runtime) { + return 0; + } + + const auto backtrace = gather_launch_stacktrace ? c10::get_backtrace() : ""; + + const std::lock_guard lock(read_write_mutex); + + const auto my_gen_number = generation_number++; + // TODO: It would probably be good to get a stack trace here so that + // we can better indicate which launch caused the failure. + kernel_launches[my_gen_number % max_kernel_launches] = { + launch_filename, + launch_function, + launch_linenum, + backtrace, + kernel_name, + dsa_get_device_id(), + stream_id, + my_gen_number}; + return my_gen_number; +#else + return 0; +#endif +} + +std::pair, std::vector> +ZoomKernelLaunchRegistry::snapshot() const { + // This is likely to be the longest-lasting hold on the mutex, but + // we only expect it to be called in cases where we're already failing + // and speed is no longer important + const std::lock_guard lock(read_write_mutex); + + std::vector device_assertions_data; + for (const auto& x : uvm_assertions) { + if (x) { + device_assertions_data.push_back(*x); + } else { + device_assertions_data.emplace_back(); + } + } + + return std::make_pair(device_assertions_data, kernel_launches); +} + +DeviceAssertionsData* ZoomKernelLaunchRegistry:: + get_uvm_assertions_ptr_for_current_device() { +#ifdef TORCH_USE_ZOOM_DSA + if (!enabled_at_runtime) { + return nullptr; + } + + const auto device_num = dsa_get_device_id(); + + // If we've already set up this GPU with managed memory, return a pointer to + // the managed memory. This is a lock-free quick-return path. + if (uvm_assertions.at(device_num)) { + return uvm_assertions.at(device_num).get(); + } + + // Need a lock here so there's not race-condition on creating the new device + // assertions buffer + const std::lock_guard lock(gpu_alloc_mutex); + + // If we've already set up this GPU with managed memory, return a pointer to + // the managed memory. 
This locked path ensures that the device memory is + // allocated only once + if (uvm_assertions.at(device_num)) { + return uvm_assertions.at(device_num).get(); + } + + // Otherwise, set up the GPU to be able to use the device-side assertion + // system + DeviceAssertionsData* uvm_assertions_ptr = nullptr; + + C10_ZOOM_CHECK_WO_DSA( + hipMallocManaged(&uvm_assertions_ptr, sizeof(DeviceAssertionsData))); + + C10_ZOOM_CHECK_WO_DSA(hipMemAdvise( + uvm_assertions_ptr, + sizeof(DeviceAssertionsData), + hipMemAdviseSetPreferredLocation, + hipCpuDeviceId)); + + // GPU will establish direct mapping of data in CPU memory, no page faults + // will be generated + C10_ZOOM_CHECK_WO_DSA(hipMemAdvise( + uvm_assertions_ptr, + sizeof(DeviceAssertionsData), + hipMemAdviseSetAccessedBy, + hipCpuDeviceId)); + + // Initialize the memory from the CPU; otherwise, pages may have to be created + // on demand. We think that UVM documentation indicates that first access may + // not honor preferred location, which would be bad, if true, because we want + // this memory on the host so we can access it post-assertion. Initializing + // this on the CPU helps ensure that that's where the memory will live. + *uvm_assertions_ptr = DeviceAssertionsData(); + + // Ownership and lifetime management of `uvm_assertions_ptr` now passes to the + // uvm_assertions unique_ptr vector + uvm_assertions.at(device_num).reset(uvm_assertions_ptr); + + return uvm_assertions_ptr; +#else + return nullptr; +#endif +} + +ZoomKernelLaunchRegistry& ZoomKernelLaunchRegistry::get_singleton_ref() { + static ZoomKernelLaunchRegistry launch_registry; + return launch_registry; +} + +bool ZoomKernelLaunchRegistry::has_failed() const { + for (const auto& x : uvm_assertions) { + if (x && x->assertion_count > 0) { + return true; + } + } + return false; +} + +} // namespace c10::zoom \ No newline at end of file diff --git a/c10/zoom/ZoomDeviceAssertionHost.h b/c10/zoom/ZoomDeviceAssertionHost.h new file mode 100644 index 00000000000000..867c2e626a1370 --- /dev/null +++ b/c10/zoom/ZoomDeviceAssertionHost.h @@ -0,0 +1,164 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +#ifdef USE_ZOOM +#define TORCH_USE_ZOOM_DSA +#endif + +/// Number of assertion failure messages we can store. If this is too small +/// threads will fail silently. +constexpr int C10_ZOOM_DSA_ASSERTION_COUNT = 10; +constexpr int C10_ZOOM_DSA_MAX_STR_LEN = 512; + +namespace c10::zoom { + +/// Holds information about any device-side assertions that fail. +/// Held in managed memory and access by both the CPU and the GPU. +struct DeviceAssertionData { + /// Stringification of the assertion + // NOLINTNEXTLINE(*-c-arrays) + char assertion_msg[C10_ZOOM_DSA_MAX_STR_LEN]{}; + /// File the assertion was in + // NOLINTNEXTLINE(*-c-arrays) + char filename[C10_ZOOM_DSA_MAX_STR_LEN]{}; + /// Name of the function the assertion was in + // NOLINTNEXTLINE(*-c-arrays) + char function_name[C10_ZOOM_DSA_MAX_STR_LEN]{}; + /// Line number the assertion was at + int line_number{}; + /// Number uniquely identifying the kernel launch that triggered the assertion + uint32_t caller{}; + /// block_id of the thread that failed the assertion + // NOLINTNEXTLINE(*-c-arrays) + int32_t block_id[3]{}; + /// third_id of the thread that failed the assertion + // NOLINTNEXTLINE(*-c-arrays) + int32_t thread_id[3]{}; +}; + +/// Used to hold assertions generated by the device +/// Held in managed memory and access by both the CPU and the GPU. 
+struct DeviceAssertionsData { + /// Total number of assertions found; a subset of thse will be recorded + /// in `assertions` + int32_t assertion_count{}; + /// An array of assertions that will be written to in a race-free manner + // NOLINTNEXTLINE(*-c-arrays) + DeviceAssertionData assertions[C10_ZOOM_DSA_ASSERTION_COUNT]{}; +}; + +/// Use to hold info about kernel launches so that we can run kernels +/// asynchronously and still associate launches with device-side +/// assertion failures +struct ZoomKernelLaunchInfo { + /// Filename of the code where the kernel was launched from + const char* launch_filename; + /// Function from which the kernel was launched + const char* launch_function; + /// Line number of where the code was launched from + uint32_t launch_linenum; + /// Backtrace of where the kernel was launched from, only populated if + /// ZoomKernelLaunchRegistry::gather_launch_stacktrace is True + std::string launch_stacktrace; + /// Kernel that was launched + const char* kernel_name; + /// Device the kernel was launched on + int device; + /// Stream the kernel was launched on + int32_t stream; + /// A number that uniquely identifies the kernel launch + uint64_t generation_number; +}; + +/// Circular buffer used to hold information about kernel launches +/// this is later used to reconstruct how a device-side kernel assertion failure +/// occurred ZoomKernelLaunchRegistry is used as a singleton +class C10_ZOOM_API ZoomKernelLaunchRegistry { + private: + /// Assume that this is the max number of kernel launches that might ever be + /// enqueued across all streams on a single device + static constexpr int max_kernel_launches = 1024; + /// How many kernel launch infos we've inserted. Used to ensure that circular + /// queue doesn't provide false information by always increasing, but also to + /// mark where we are inserting into the queue +#ifdef TORCH_USE_ZOOM_DSA + uint64_t generation_number = 0; +#endif + /// Shared mutex between writer and accessor to ensure multi-threaded safety. + mutable std::mutex read_write_mutex; + /// Used to ensure prevent race conditions in GPU memory allocation + mutable std::mutex gpu_alloc_mutex; + /// Pointer to managed memory keeping track of device-side assertions. There + /// is one entry for each possible device the process might work with. Unused + /// entries are nullptrs. We could also use an unordered_set here, but this + /// vector design will be faster and the wasted memory is small since we + /// expect the number of GPUs per node will always be small + std::vector< + std::unique_ptr> + uvm_assertions; + /// A single circular buffer holds information about every kernel launch the + /// process makes across all devices. + std::vector kernel_launches; + bool check_env_for_enable_launch_stacktracing() const; + bool check_env_for_dsa_enabled() const; + + public: + ZoomKernelLaunchRegistry(); + /// Register a new kernel launch and obtain a generation number back to be + /// passed to the kernel + uint32_t insert( + const char* launch_filename, + const char* launch_function, + const uint32_t launch_linenum, + const char* kernel_name, + const int32_t stream_id); + /// Get copies of the kernel launch registry and each device's assertion + /// failure buffer so they can be inspected without raising race conditions + std:: + pair, std::vector> + snapshot() const; + /// Get a pointer to the current device's assertion failure buffer. If no such + /// buffer exists then one is created. 
This means that the first kernel launch + /// made on each device will be slightly slower because memory allocations are + /// required + DeviceAssertionsData* get_uvm_assertions_ptr_for_current_device(); + /// Gets the global singleton of the registry + static ZoomKernelLaunchRegistry& get_singleton_ref(); + /// If not all devices support DSA, we disable it + const bool do_all_devices_support_managed_memory = false; + /// Whether or not to gather stack traces when launching kernels + bool gather_launch_stacktrace = false; + /// Whether or not host-side DSA is enabled or disabled at run-time + /// Note: Device-side code cannot be enabled/disabled at run-time + bool enabled_at_runtime = false; + /// Whether or not a device has indicated a failure + bool has_failed() const; +#ifdef TORCH_USE_ZOOM_DSA + const bool enabled_at_compile_time = true; +#else + const bool enabled_at_compile_time = false; +#endif +}; + +std::string c10_retrieve_device_side_assertion_info(); + +} // namespace c10::zoom + +// Each kernel launched with TORCH_DSA_KERNEL_LAUNCH +// requires the same input arguments. We introduce the following macro to +// standardize these. +#define TORCH_DSA_KERNEL_ARGS \ + [[maybe_unused]] c10::zoom::DeviceAssertionsData *const assertions_data, \ + [[maybe_unused]] uint32_t assertion_caller_id + +// This macro can be used to pass the DSA arguments onward to another +// function +#define TORCH_DSA_KERNEL_ARGS_PASS assertions_data, assertion_caller_id \ No newline at end of file diff --git a/c10/zoom/ZoomException.cpp b/c10/zoom/ZoomException.cpp new file mode 100644 index 00000000000000..fe752478067c3b --- /dev/null +++ b/c10/zoom/ZoomException.cpp @@ -0,0 +1,88 @@ +#include +#include + +#include + +namespace c10::zoom { + +void c10_zoom_check_implementation( + const int32_t err, + const char* filename, + const char* function_name, + const int line_number, + const bool include_device_assertions) { + const auto hip_error = static_cast(err); + const auto hip_kernel_failure = include_device_assertions + ? 
c10::zoom::ZoomKernelLaunchRegistry::get_singleton_ref().has_failed()
+      : false;
+
+  if (C10_LIKELY(hip_error == hipSuccess && !hip_kernel_failure)) {
+    return;
+  }
+
+  auto error_unused C10_UNUSED = hipGetLastError();
+  (void)error_unused;
+
+  std::string check_message;
+#ifndef STRIP_ERROR_MESSAGES
+  check_message.append("Zoom error: ");
+  check_message.append(hipGetErrorString(hip_error));
+  // checks if HIP_LAUNCH_BLOCKING in HIP, unimplemented here for now
+  check_message.append(c10::zoom::get_hip_check_suffix());
+  check_message.append("\n");
+  if (include_device_assertions) {
+    check_message.append(c10_retrieve_device_side_assertion_info());
+  } else {
+    check_message.append(
+        "Device-side assertions were explicitly omitted for this error check; the error probably arose while initializing the DSA handlers.");
+  }
+#endif
+
+  TORCH_CHECK(false, check_message);
+}
+
+} // namespace c10::zoom
+
+
+namespace at::zoom {
+  namespace blas {
+    const char* _hipblasGetErrorEnum(hipblasStatus_t error) {
+      if (error == HIPBLAS_STATUS_SUCCESS) {
+        return "HIPBLAS_STATUS_SUCCESS";
+      }
+      if (error == HIPBLAS_STATUS_NOT_INITIALIZED) {
+        return "HIPBLAS_STATUS_NOT_INITIALIZED";
+      }
+      if (error == HIPBLAS_STATUS_ALLOC_FAILED) {
+        return "HIPBLAS_STATUS_ALLOC_FAILED";
+      }
+      if (error == HIPBLAS_STATUS_INVALID_VALUE) {
+        return "HIPBLAS_STATUS_INVALID_VALUE";
+      }
+      if (error == HIPBLAS_STATUS_ARCH_MISMATCH) {
+        return "HIPBLAS_STATUS_ARCH_MISMATCH";
+      }
+      if (error == HIPBLAS_STATUS_MAPPING_ERROR) {
+        return "HIPBLAS_STATUS_MAPPING_ERROR";
+      }
+      if (error == HIPBLAS_STATUS_EXECUTION_FAILED) {
+        return "HIPBLAS_STATUS_EXECUTION_FAILED";
+      }
+      if (error == HIPBLAS_STATUS_INTERNAL_ERROR) {
+        return "HIPBLAS_STATUS_INTERNAL_ERROR";
+      }
+      if (error == HIPBLAS_STATUS_NOT_SUPPORTED) {
+        return "HIPBLAS_STATUS_NOT_SUPPORTED";
+      }
+#ifdef HIPBLAS_STATUS_LICENSE_ERROR
+      if (error == HIPBLAS_STATUS_LICENSE_ERROR) {
+        return "HIPBLAS_STATUS_LICENSE_ERROR";
+      }
+#endif
+      return "";
+    }
+  } // namespace blas
+} // namespace at::zoom
\ No newline at end of file
diff --git a/c10/zoom/ZoomException.h b/c10/zoom/ZoomException.h
new file mode 100644
index 00000000000000..aaeb140f0d76e3
--- /dev/null
+++ b/c10/zoom/ZoomException.h
@@ -0,0 +1,185 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+// Note [CHECK macro]
+// ~~~~~~~~~~~~~~~~~~
+// This is a macro so that AT_ERROR can get accurate __LINE__
+// and __FILE__ information. We could split this into a short
+// macro and a function implementation if we pass along __LINE__
+// and __FILE__, but no one has found this worth doing.
+
+// Used to denote errors from the CUDA framework.
+// This needs to be declared here instead of in util/Exception.h for proper
+// conversion during hipify.
+namespace c10 { +class ZoomError : public c10::Error { + using Error::Error; +}; +} // namespace c10 + +#define C10_ZOOM_CHECK(EXPR) \ + do { \ + const hipError_t __err = EXPR; \ + c10::zoom::c10_zoom_check_implementation( \ + static_cast(__err), \ + __FILE__, \ + __func__, /* Line number data type not well-defined between \ + compilers, so we perform an explicit cast */ \ + static_cast(__LINE__), \ + true); \ + } while (0) + +#define C10_ZOOM_CHECK_WARN(EXPR) \ + do { \ + const hipError_t __err = EXPR; \ + if (C10_UNLIKELY(__err != hipSuccess)) { \ + auto error_unused C10_UNUSED = hipGetLastError(); \ + (void)error_unused; \ + TORCH_WARN("ZOOM warning: ", hipGetErrorString(__err)); \ + } \ + } while (0) + +// Indicates that a CUDA error is handled in a non-standard way +#define C10_ZOOM_ERROR_HANDLED(EXPR) EXPR + +// Intentionally ignore a CUDA error +#define C10_ZOOM_IGNORE_ERROR(EXPR) \ + do { \ + const hipError_t __err = EXPR; \ + if (C10_UNLIKELY(__err != hipSuccess)) { \ + hipError_t error_unused C10_UNUSED = hipGetLastError(); \ + (void)error_unused; \ + } \ + } while (0) + +// Clear the last CUDA error +#define C10_ZOOM_CLEAR_ERROR() \ + do { \ + hipError_t error_unused C10_UNUSED = hipGetLastError(); \ + (void)error_unused; \ + } while (0) + +// This should be used directly after every kernel launch to ensure +// the launch happened correctly and provide an early, close-to-source +// diagnostic if it didn't. +#define C10_ZOOM_KERNEL_LAUNCH_CHECK() C10_ZOOM_CHECK(hipGetLastError()) + +/// Launches a HIP kernel appending to it all the information need to handle +/// device-side assertion failures. Checks that the launch was successful. +#define TORCH_DSA_KERNEL_LAUNCH( \ + kernel, blocks, threads, shared_mem, stream, ...) \ + do { \ + auto& launch_registry = \ + c10::zoom::ZoomKernelLaunchRegistry::get_singleton_ref(); \ + kernel<<>>( \ + __VA_ARGS__, \ + launch_registry.get_uvm_assertions_ptr_for_current_device(), \ + launch_registry.insert( \ + __FILE__, __FUNCTION__, __LINE__, #kernel, stream.id())); \ + C10_ZOOM_KERNEL_LAUNCH_CHECK(); \ + } while (0) + +#define HIP_DRIVER_CHECK(EXPR) \ + do { \ + hipError_t __err = EXPR; \ + if (__err != hipSuccess) { \ + AT_ERROR("HIP driver error: ", static_cast(__err), "\nErrorName: ", hipGetErrorName(__err), "\nCause: ", hipGetErrorString(__err)); \ + } \ + } while (0) + +#define ZOOM_HIPRTC_CHECK(EXPR) \ + do { \ + hiprtcResult __err = EXPR; \ + if (__err != HIPRTC_SUCCESS) { \ + if (static_cast(__err) != 7) { \ + AT_ERROR("HIPRTC error: ", hiprtcGetErrorString(__err)); \ + } else { \ + AT_ERROR("HIPRTC error: HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE"); \ + } \ + } \ + } while (0) + + +#define ZOOM_KERNEL_ASSERT(cond) \ + if (C10_UNLIKELY(!(cond))) { \ + __assert_fail( \ + #cond, __FILE__, static_cast(__LINE__), __func__); \ + } + + +namespace at::zoom::blas { + const char* _hipblasGetErrorEnum(hipblasStatus_t error); +} + + +#define TORCH_HIPBLAS_CHECK(EXPR) \ +do { \ + hipblasStatus_t __err = EXPR; \ + TORCH_CHECK(__err == HIPBLAS_STATUS_SUCCESS, \ + "HIP error: ", \ + at::zoom::blas::_hipblasGetErrorEnum(__err), \ + " when calling `" #EXPR "`"); \ +} while (0) + +#define TORCH_WARN_DISABLE_HIPBLASLT TORCH_WARN_ONCE("hipblasLt temporarily disabled in Zoom backend, using hipblas instead") +#define TORCH_CHECK_DISABLE_HIPBLAS_LT TORCH_CHECK(false, "Error: hipblasLt routine called, but hipblasLt is disabled in the Zoom backend") + +const char *hipsparseGetErrorString(hipsparseStatus_t status); + +#define TORCH_HIPSPARSE_CHECK(EXPR) \ + do { \ 
+    hipsparseStatus_t __err = EXPR;                             \
+    TORCH_CHECK(__err == HIPSPARSE_STATUS_SUCCESS,              \
+                "HIP error: ",                                  \
+                hipsparseGetErrorString(__err),                 \
+                " when calling `" #EXPR "`");                   \
+  } while (0)
+
+#ifdef hipsolverVersionMajor
+
+namespace at::zoom::solver {
+C10_EXPORT const char* hipsolverGetErrorMessage(hipsolverStatus_t status);
+
+constexpr const char* _hipsolver_backend_suggestion = \
+    "If you keep seeing this error, you may use " \
+    "`torch.backends.zoom.preferred_linalg_library()` to try " \
+    "linear algebra operators with other supported backends. " \
+    "See https://pytorch.org/docs/stable/backends.html#torch.backends.cuda.preferred_linalg_library";
+
+} // namespace at::zoom::solver
+
+#define TORCH_HIPSOLVER_CHECK(EXPR)                                 \
+  do {                                                              \
+    hipsolverStatus_t __err = EXPR;                                 \
+    TORCH_CHECK(                                                    \
+        __err == HIPSOLVER_STATUS_SUCCESS,                          \
+        "hipsolver error: ",                                        \
+        at::zoom::solver::hipsolverGetErrorMessage(__err),          \
+        ", when calling `" #EXPR "`. ",                             \
+        at::zoom::solver::_hipsolver_backend_suggestion);           \
+  } while (0)
+
+#else
+#define TORCH_HIPSOLVER_CHECK(EXPR) EXPR
+#endif
+
+namespace c10::zoom {
+
+/// In the event of a HIP failure, formats a nice error message about that
+/// failure and also checks for device-side assertion failures
+void c10_zoom_check_implementation(
+    const int32_t err,
+    const char* filename,
+    const char* function_name,
+    const int line_number,
+    const bool include_device_assertions);
+
+} // namespace c10::zoom
diff --git a/c10/zoom/ZoomFunctions.cpp b/c10/zoom/ZoomFunctions.cpp
new file mode 100644
index 00000000000000..8169ec38d89e97
--- /dev/null
+++ b/c10/zoom/ZoomFunctions.cpp
@@ -0,0 +1,294 @@
+#include
+#include
+
+#include
+
+namespace c10::zoom {
+
+namespace {
+// returns -1 on failure
+int32_t driver_version() {
+  int driver_version = -1;
+  C10_ZOOM_IGNORE_ERROR(hipDriverGetVersion(&driver_version));
+  return driver_version;
+}
+
+int device_count_impl(bool fail_if_no_driver) {
+  int count = 0;
+  auto err = C10_ZOOM_ERROR_HANDLED(c10::zoom::GetDeviceCount(&count));
+  if (err == hipSuccess) {
+    return count;
+  }
+  // Clear out the error state, so we don't spuriously trigger someone else.
+  // (This shouldn't really matter, since we won't be running very much CUDA
+  // code in this regime.)
+  hipError_t last_err C10_UNUSED = hipGetLastError();
+  switch (err) {
+    case hipErrorNoDevice:
+      // Zero devices is ok here
+      count = 0;
+      break;
+    case hipErrorInsufficientDriver: {
+      auto version = driver_version();
+      if (version <= 0) {
+        if (!fail_if_no_driver) {
+          // No hip driver means no devices
+          count = 0;
+          break;
+        }
+        TORCH_CHECK(
+            false,
+            "Found no ROCm driver on your system. Please check that you "
+            "have an AMD GPU and installed a driver from "
+            "https://rocm.docs.amd.com/projects/install-on-linux/en/develop/tutorial/quick-start.html#rocm-install-quick");
+      } else {
+        TORCH_CHECK(
+            false,
+            "The ROCm driver on your system is too old (found version ",
+            version,
+            "). Please update your GPU driver by downloading and installing "
+            "a new version from the URL: "
+            "https://rocm.docs.amd.com/projects/install-on-linux/en/develop/tutorial/quick-start.html#rocm-install-quick");
+      }
+    } break;
+    case hipErrorInitializationError:
+      TORCH_CHECK(
+          false,
+          "ROCm driver initialization failed, you might not "
+          "have a ROCm GPU.");
+      break;
+    case hipErrorUnknown:
+      TORCH_CHECK(
+          false,
+          "ZOOM unknown error - this may be due to an "
+          "incorrectly set up environment, e.g. changing env "
+          "variable ZOOM_VISIBLE_DEVICES after program start. 
" + "Setting the available devices to be zero."); + break; +#if C10_ASAN_ENABLED + case hipErrorMemoryAllocation: + // In ASAN mode, we know that a hipErrorMemoryAllocation error will + // pop up if compiled with hipcc (clang-hip is fine) + TORCH_CHECK( + false, + "Got 'out of memory' error while trying to initialize ZOOM. " + "ZOOM with hipcc does not work well with ASAN and it's probably " + "the reason. We will simply shut down HIP support. If you " + "would like to use GPUs, turn off ASAN."); + break; +#endif // C10_ASAN_ENABLED + default: + TORCH_CHECK( + false, + "Unexpected error from hipGetDeviceCount(). Did you run " + "some hip functions before calling NumZoomDevices() " + "that might have already set an error? Error ", + err, + ": ", + hipGetErrorString(err)); + } + return count; +} +} // namespace + +DeviceIndex device_count() noexcept { + // initialize number of devices only once + static int count = []() { + try { + auto result = device_count_impl(/*fail_if_no_driver=*/false); + TORCH_INTERNAL_ASSERT( + result <= std::numeric_limits::max(), + "Too many ROCm devices, DeviceIndex overflowed"); + return result; + } catch (const c10::Error& ex) { + // We don't want to fail, but still log the warning + // msg() returns the message without the stack trace + TORCH_WARN("ZOOM initialization: ", ex.msg()); + return 0; + } + }(); + return static_cast(count); +} + +DeviceIndex device_count_ensure_non_zero() { + // Call the implementation every time to throw the exception + int count = device_count_impl(/*fail_if_no_driver=*/true); + // Zero gpus doesn't produce a warning in `device_count` but we fail here + TORCH_CHECK(count, "No ROCm GPUs are available"); + TORCH_INTERNAL_ASSERT( + count <= std::numeric_limits::max(), + "Too many ROCm devices, DeviceIndex overflowed"); + return static_cast(count); +} + +DeviceIndex current_device() { + DeviceIndex cur_device = -1; + C10_ZOOM_CHECK(c10::zoom::GetDevice(&cur_device)); + return cur_device; +} + +void set_device(DeviceIndex device) { + C10_ZOOM_CHECK(c10::zoom::SetDevice(device)); +} + +void device_synchronize() { + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_device_synchronization(c10::DeviceType::PrivateUse1); + } + C10_ZOOM_CHECK(hipDeviceSynchronize()); +} + +// this function has to be called from callers performing cuda synchronizing +// operations, to raise proper error or warning +void warn_or_error_on_sync() { + if (warning_state().get_sync_debug_mode() == SyncDebugMode::L_ERROR) { + TORCH_CHECK(false, "called a synchronizing HIP operation"); + } else if (warning_state().get_sync_debug_mode() == SyncDebugMode::L_WARN) { + TORCH_WARN("called a synchronizing HIP operation"); + } +} + +std::optional getDeviceIndexWithPrimaryContext() { + // check current device first + auto current_device_index = current_device(); + if (current_device_index >= 0) { + if (hasPrimaryContext(current_device_index)) { + return current_device_index; + } + } + for (const auto device_index : c10::irange(c10::zoom::device_count())) { + if (device_index == current_device_index) + continue; + if (hasPrimaryContext(device_index)) { + return device_index; + } + } + return c10::nullopt; +} + +namespace _internal { +bool dummyHasPrimaryContext(C10_UNUSED DeviceIndex device_index) { + TORCH_CHECK(false, "Should never been called - did you remember to lazyInitPrivateUse1()?"); +} +bool (*hasPrimaryContext)(DeviceIndex) = dummyHasPrimaryContext; + +// Private api to be called from 
CUDAHooks.cpp +void setHasPrimaryContext(bool (*func)(DeviceIndex)) { + hasPrimaryContext = func ? func : dummyHasPrimaryContext; +} +} // namespace _internal + +bool hasPrimaryContext(DeviceIndex device_index) { + return _internal::hasPrimaryContext(device_index); +} + +// Wrappers for raw CUDA device management functions +hipError_t GetDeviceCount(int* dev_count) { + return hipGetDeviceCount(dev_count); +} + +// This is a codepath for CUDA 12 that comes with a critical change in behavior +// of `cudaSetDevice`. Unlike to previous CUDA versions that allocate context +// lazily CUDA 12.x eagerly allocates primary context the moment `cudaSetDevice` +// is called. This can lead to dramatic consequences and pollute the device +// memory in distributed runs. To avoid unnecessary context creation a new +// function called `MaybeSetDevice` was introduced. This function is to be +// called in device guard destructor and at the exit of torch.cuda.device +// context manager. The behavior of `MaybeSetDevice` is quite simple, it calls +// to `cudaSetDevice` if context already exist or if context was not allocated +// on targeted device it simply saves the device index. This way we can keep +// PyTorch backward compatible for applications like this: +// +// ``` +// import torch +// x = torch.empty(1, device=“cuda:1”) # no CUDA context on cuda:0 after this +// call y = torch.empty(1, device=“cuda”) # CUDA context is created on cuda:0 +// ``` + +thread_local DeviceIndex targetDeviceIndex = -1; + +hipError_t GetDevice(DeviceIndex* device) { + if (targetDeviceIndex >= 0) { + *device = targetDeviceIndex; + return hipSuccess; + } + int tmp_device = -1; + auto err = hipGetDevice(&tmp_device); + if (err == hipSuccess) { + TORCH_INTERNAL_ASSERT( + tmp_device >= 0 && + tmp_device <= std::numeric_limits::max(), + "hipGetDevice returns invalid device ", + tmp_device); + *device = static_cast(tmp_device); + } + return err; +} + +hipError_t SetDevice(DeviceIndex device) { + TORCH_CHECK(device >= 0, "device id must be positive!", device); + targetDeviceIndex = -1; + int cur_device = -1; + C10_ZOOM_CHECK(hipGetDevice(&cur_device)); + if (device == cur_device) { + return hipSuccess; + } + return hipSetDevice(device); +} + +hipError_t MaybeSetDevice(DeviceIndex device) { + if (hasPrimaryContext(device)) { + return c10::zoom::SetDevice(device); + } + targetDeviceIndex = device; + return hipSuccess; +} + +// This function always initializes the CUDA context +// on to_device +DeviceIndex ExchangeDevice(DeviceIndex to_device) { + auto cur_device = targetDeviceIndex; + targetDeviceIndex = -1; + if (cur_device < 0) { + int tmp_device = -1; + C10_ZOOM_CHECK(hipGetDevice(&tmp_device)); + cur_device = static_cast(tmp_device); + if (to_device == cur_device) { + return cur_device; + } + } + C10_ZOOM_CHECK(hipSetDevice(to_device)); + return cur_device; +} + +// This function does not initialize the CUDA context +// on to_device if it does not already exist +DeviceIndex MaybeExchangeDevice(DeviceIndex to_device) { + int tmp_cur_device = -1; + C10_ZOOM_CHECK(hipGetDevice(&tmp_cur_device)); + TORCH_INTERNAL_ASSERT( + tmp_cur_device >= 0 && + tmp_cur_device <= std::numeric_limits::max(), + "hipGetDevice returns invalid device ", + tmp_cur_device); + auto cur_device = static_cast(tmp_cur_device); + if (to_device == tmp_cur_device) { + return cur_device; + } + if (hasPrimaryContext(to_device)) { + C10_ZOOM_CHECK(hipSetDevice(to_device)); + } else { + targetDeviceIndex = to_device; + } + return cur_device; +} + +void SetTargetDevice() { 
+  if (targetDeviceIndex >= 0) {
+    C10_ZOOM_CHECK(c10::zoom::SetDevice(targetDeviceIndex));
+  }
+}
+
+
+} // namespace c10::zoom
\ No newline at end of file
diff --git a/c10/zoom/ZoomFunctions.h b/c10/zoom/ZoomFunctions.h
new file mode 100644
index 00000000000000..eb5ffa640967cd
--- /dev/null
+++ b/c10/zoom/ZoomFunctions.h
@@ -0,0 +1,112 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+namespace c10::zoom {
+
+// NB: In the past, we were inconsistent about whether or not this reported
+// an error if there were driver problems or not. Based on experience
+// interacting with users, it seems that people basically ~never want this
+// function to fail; it should just return zero if things are not working.
+// Oblige them.
+// It still might log a warning for the user the first time it's invoked.
+C10_ZOOM_API DeviceIndex device_count() noexcept;
+
+// Version of device_count that throws if no devices are detected
+C10_ZOOM_API DeviceIndex device_count_ensure_non_zero();
+
+C10_ZOOM_API DeviceIndex current_device();
+
+C10_ZOOM_API void set_device(DeviceIndex device);
+
+C10_ZOOM_API void device_synchronize();
+
+C10_ZOOM_API void warn_or_error_on_sync();
+
+// Raw CUDA device management functions
+C10_ZOOM_API hipError_t GetDeviceCount(int* dev_count);
+
+C10_ZOOM_API hipError_t GetDevice(DeviceIndex* device);
+
+C10_ZOOM_API hipError_t SetDevice(DeviceIndex device);
+
+C10_ZOOM_API hipError_t MaybeSetDevice(DeviceIndex device);
+
+C10_ZOOM_API DeviceIndex ExchangeDevice(DeviceIndex device);
+
+C10_ZOOM_API DeviceIndex MaybeExchangeDevice(DeviceIndex device);
+
+C10_ZOOM_API void SetTargetDevice();
+
+enum class SyncDebugMode { L_DISABLED = 0, L_WARN, L_ERROR };
+
+// this is a holder for c10 global state (similar to at GlobalContext)
+// currently it's used to store cuda synchronization warning state,
+// but can be expanded to hold other related global state, e.g.
to +// record stream usage +class WarningState { + public: + void set_sync_debug_mode(SyncDebugMode l) { + sync_debug_mode = l; + } + + SyncDebugMode get_sync_debug_mode() { + return sync_debug_mode; + } + + private: + SyncDebugMode sync_debug_mode = SyncDebugMode::L_DISABLED; +}; + +C10_ZOOM_API __inline__ WarningState& warning_state() { + static WarningState warning_state_; + return warning_state_; +} +// the subsequent functions are defined in the header because for performance +// reasons we want them to be inline +C10_ZOOM_API void __inline__ memcpy_and_sync( + void* dst, + const void* src, + int64_t nbytes, + hipMemcpyKind kind, + hipStream_t stream) { + if (C10_UNLIKELY( + warning_state().get_sync_debug_mode() != SyncDebugMode::L_DISABLED)) { + warn_or_error_on_sync(); + } + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_stream_synchronization( + c10::DeviceType::PrivateUse1, reinterpret_cast(stream)); + } + + #if defined(TORCH_HIP_VERSION) && (TORCH_HIP_VERSION >= 301) + C10_ZOOM_CHECK(hipMemcpyWithStream(dst, src, nbytes, kind, stream)); + #else + C10_ZOOM_CHECK(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + C10_ZOOM_CHECK(hipStreamSynchronize(stream)); + #endif + +} + +C10_ZOOM_API void __inline__ stream_synchronize(hipStream_t stream) { + if (C10_UNLIKELY( + warning_state().get_sync_debug_mode() != SyncDebugMode::L_DISABLED)) { + warn_or_error_on_sync(); + } + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_stream_synchronization( + c10::DeviceType::PrivateUse1, reinterpret_cast(stream)); + } + C10_ZOOM_CHECK(hipStreamSynchronize(stream)); +} + +C10_ZOOM_API bool hasPrimaryContext(DeviceIndex device_index); +C10_ZOOM_API std::optional getDeviceIndexWithPrimaryContext(); + +} // namespace c10::zoom \ No newline at end of file diff --git a/c10/zoom/ZoomGuard.h b/c10/zoom/ZoomGuard.h new file mode 100644 index 00000000000000..1f549e9cd59bf2 --- /dev/null +++ b/c10/zoom/ZoomGuard.h @@ -0,0 +1,301 @@ +#pragma once + +#include +#include +#include +// #include +#include + +namespace c10::zoom { + +// This code is kind of boilerplatey. See Note [Whither the DeviceGuard +// boilerplate] + +/// A variant of DeviceGuard that is specialized for HIP. It accepts +/// integer indices (interpreting them as HIP devices) and is a little +/// more efficient than DeviceGuard (it compiles to straight line +/// hipSetDevice/hipGetDevice calls); however, it can only be used +/// from code that links against HIP directly. +struct ZoomGuard { + /// No default constructor; see Note [Omitted default constructor from RAII] + explicit ZoomGuard() = delete; + + /// Set the current HIP device to the passed device index. + explicit ZoomGuard(DeviceIndex device_index) : guard_(device_index) {} + + /// Sets the current HIP device to the passed device. Errors if the passed + /// device is not a HIP device. + explicit ZoomGuard(Device device) : guard_(device) {} + + // Copy is not allowed + ZoomGuard(const ZoomGuard&) = delete; + ZoomGuard& operator=(const ZoomGuard&) = delete; + + // Move is not allowed (there is no uninitialized state) + ZoomGuard(ZoomGuard&& other) = delete; + ZoomGuard& operator=(ZoomGuard&& other) = delete; + + /// Sets the HIP device to the given device. Errors if the given device + /// is not a HIP device. + void set_device(Device device) { + guard_.set_device(device); + } + + /// Sets the HIP device to the given device. 
Errors if the given device + /// is not a HIP device. (This method is provided for uniformity with + /// DeviceGuard). + void reset_device(Device device) { + guard_.reset_device(device); + } + + /// Sets the HIP device to the given device index. + void set_index(DeviceIndex device_index) { + guard_.set_index(device_index); + } + + /// Returns the device that was set upon construction of the guard + Device original_device() const { + return guard_.original_device(); + } + + /// Returns the last device that was set via `set_device`, if any, otherwise + /// the device passed during construction. + Device current_device() const { + return guard_.current_device(); + } + + private: + /// The guard for the current device. + c10::impl::InlineDeviceGuard guard_; +}; + +/// A variant of OptionalDeviceGuard that is specialized for HIP. See +/// ZoomGuard for when you can use this. +struct OptionalZoomGuard { + /// Create an uninitialized OptionalZoomGuard. + explicit OptionalZoomGuard() : guard_() {} + + /// Set the current HIP device to the passed Device, if it is not nullopt. + explicit OptionalZoomGuard(optional device_opt) + : guard_(device_opt) {} + + /// Set the current HIP device to the passed device index, if it is not + /// nullopt + explicit OptionalZoomGuard(optional device_index_opt) + : guard_(device_index_opt) {} + + // Copy is not allowed + OptionalZoomGuard(const OptionalZoomGuard&) = delete; + OptionalZoomGuard& operator=(const OptionalZoomGuard&) = delete; + + // See Note [Move construction for RAII guards is tricky] + OptionalZoomGuard(OptionalZoomGuard&& other) = delete; + + // See Note [Move assignment for RAII guards is tricky] + OptionalZoomGuard& operator=(OptionalZoomGuard&& other) = delete; + + /// Sets the HIP device to the given device, initializing the guard if it + /// is not already initialized. Errors if the given device is not a HIP + /// device. + void set_device(Device device) { + guard_.set_device(device); + } + + /// Sets the HIP device to the given device, initializing the guard if it is + /// not already initialized. Errors if the given device is not a HIP device. + /// (This method is provided for uniformity with OptionalDeviceGuard). + void reset_device(Device device) { + guard_.reset_device(device); + } + + /// Sets the HIP device to the given device index, initializing the guard if + /// it is not already initialized. + void set_index(DeviceIndex device_index) { + guard_.set_index(device_index); + } + + /// Returns the device that was set immediately prior to initialization of the + /// guard, or nullopt if the guard is uninitialized. + optional original_device() const { + return guard_.original_device(); + } + + /// Returns the most recent device that was set using this device guard, + /// either from construction, or via set_device, if the guard is initialized, + /// or nullopt if the guard is uninitialized. + optional current_device() const { + return guard_.current_device(); + } + + /// Restore the original HIP device, resetting this guard to uninitialized + /// state. + void reset() { + guard_.reset(); + } + + private: + c10::impl::InlineOptionalDeviceGuard guard_; +}; + +/// A variant of StreamGuard that is specialized for HIP. See ZoomGuard +/// for when you can use this. 
+struct ZoomStreamGuard { + /// No default constructor, see Note [Omitted default constructor from RAII] + explicit ZoomStreamGuard() = delete; + + /// Set the current HIP device to the device associated with the passed + /// stream, and set the current HIP stream on that device to the passed + /// stream. Errors if the Stream is not a HIP stream. + explicit ZoomStreamGuard(Stream stream) : guard_(stream) {} + + /// Copy is disallowed + ZoomStreamGuard(const ZoomStreamGuard&) = delete; + ZoomStreamGuard& operator=(const ZoomStreamGuard&) = delete; + + /// Move is disallowed, as ZoomStreamGuard does not have an uninitialized + /// state, which is required for moves on types with nontrivial destructors. + ZoomStreamGuard(ZoomStreamGuard&& other) = delete; + ZoomStreamGuard& operator=(ZoomStreamGuard&& other) = delete; + + /// Resets the currently set stream to the original stream and + /// the currently set device to the original device. Then, + /// set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream. + /// Errors if the stream passed is not a HIP stream. + /// + /// NOTE: this implementation may skip some stream/device setting if + /// it can prove that it is unnecessary. + /// + /// WARNING: reset_stream does NOT preserve previously set streams on + /// different devices. If you need to set streams on multiple devices + /// on HIP, use ZoomMultiStreamGuard instead. + void reset_stream(Stream stream) { + guard_.reset_stream(stream); + } + + /// Returns the HIP stream that was set at the time the guard was + /// constructed. + ZoomStream original_stream() const { + return ZoomStream(ZoomStream::UNCHECKED, guard_.original_stream()); + } + + /// Returns the most recent HIP stream that was set using this device guard, + /// either from construction, or via set_stream. + ZoomStream current_stream() const { + return ZoomStream(ZoomStream::UNCHECKED, guard_.current_stream()); + } + + /// Returns the most recent HIP device that was set using this device guard, + /// either from construction, or via set_device/reset_device/set_index. + Device current_device() const { + return guard_.current_device(); + } + + /// Returns the HIP device that was set at the most recent reset_stream(), + /// or otherwise the device at construction time. + Device original_device() const { + return guard_.original_device(); + } + + private: + c10::impl::InlineStreamGuard guard_; +}; + +/// A variant of OptionalStreamGuard that is specialized for HIP. See +/// ZoomGuard for when you can use this. +struct OptionalZoomStreamGuard { + /// Create an uninitialized guard. + explicit OptionalZoomStreamGuard() : guard_() {} + + /// Set the current HIP device to the device associated with the passed + /// stream, and set the current HIP stream on that device to the passed + /// stream. Errors if the Stream is not a HIP stream. + explicit OptionalZoomStreamGuard(Stream stream) : guard_(stream) {} + + /// Set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream, + /// if the passed stream is not nullopt. 
+ explicit OptionalZoomStreamGuard(optional stream_opt) + : guard_(stream_opt) {} + + /// Copy is disallowed + OptionalZoomStreamGuard(const OptionalZoomStreamGuard&) = delete; + OptionalZoomStreamGuard& operator=(const OptionalZoomStreamGuard&) = delete; + + // See Note [Move construction for RAII guards is tricky] + OptionalZoomStreamGuard(OptionalZoomStreamGuard&& other) = delete; + + // See Note [Move assignment for RAII guards is tricky] + OptionalZoomStreamGuard& operator=(OptionalZoomStreamGuard&& other) = delete; + + /// Resets the currently set HIP stream to the original stream and + /// the currently set device to the original device. Then, + /// set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream. + /// Initializes the guard if it was not previously initialized. + void reset_stream(Stream stream) { + guard_.reset_stream(stream); + } + + /// Returns the HIP stream that was set at the time the guard was most + /// recently initialized, or nullopt if the guard is uninitialized. + optional original_stream() const { + auto r = guard_.original_stream(); + if (r.has_value()) { + return make_optional(ZoomStream(ZoomStream::UNCHECKED, r.value())); + } else { + return nullopt; + } + } + + /// Returns the most recent HIP stream that was set using this stream guard, + /// either from construction, or via reset_stream, if the guard is + /// initialized, or nullopt if the guard is uninitialized. + optional current_stream() const { + auto r = guard_.current_stream(); + if (r.has_value()) { + return make_optional(ZoomStream(ZoomStream::UNCHECKED, r.value())); + } else { + return nullopt; + } + } + + /// Restore the original HIP device and stream, resetting this guard to + /// uninitialized state. + void reset() { + guard_.reset(); + } + + private: + c10::impl::InlineOptionalStreamGuard guard_; +}; + +/// A variant of MultiStreamGuard that is specialized for HIP. +struct ZoomMultiStreamGuard { + explicit ZoomMultiStreamGuard(ArrayRef streams) + : guard_(unwrapStreams(streams)) {} + + /// Copy is disallowed + ZoomMultiStreamGuard(const ZoomMultiStreamGuard&) = delete; + ZoomMultiStreamGuard& operator=(const ZoomMultiStreamGuard&) = delete; + + // See Note [Move construction for RAII guards is tricky] + ZoomMultiStreamGuard(ZoomMultiStreamGuard&& other) = delete; + + // See Note [Move assignment for RAII guards is tricky] + ZoomMultiStreamGuard& operator=(ZoomMultiStreamGuard&& other) = delete; + + private: + c10::impl::InlineMultiStreamGuard guard_; + + static std::vector unwrapStreams(ArrayRef zoomStreams) { + std::vector streams; + streams.reserve(zoomStreams.size()); + for (const ZoomStream& zoomStream : zoomStreams) { + streams.push_back(zoomStream); + } + return streams; + } +}; + +} // namespace c10::zoom \ No newline at end of file diff --git a/c10/zoom/ZoomMacros.h b/c10/zoom/ZoomMacros.h new file mode 100644 index 00000000000000..21492ccd4847dc --- /dev/null +++ b/c10/zoom/ZoomMacros.h @@ -0,0 +1,41 @@ +#pragma once + +// See c10/macros/Export.h for a detailed explanation of what the function +// of these macros are. We need one set of macros for every separate library +// we build. 
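The export/import macros defined just below follow the usual pattern described in c10/macros/Export.h. As a hedged illustration (zoom_symbol_example is hypothetical and not part of the patch), a public symbol of libc10_zoom would be declared and defined roughly like this:

// In a public c10/zoom header:
#include <c10/zoom/ZoomMacros.h>

C10_ZOOM_API void zoom_symbol_example();

// In a .cpp compiled into libc10_zoom (built with -DC10_ZOOM_BUILD_MAIN_LIB,
// so C10_ZOOM_API expands to the export attribute):
void zoom_symbol_example() {}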
+
+#ifdef _WIN32
+#if defined(C10_HIP_BUILD_SHARED_LIBS)
+#define C10_ZOOM_EXPORT __declspec(dllexport)
+#define C10_ZOOM_IMPORT __declspec(dllimport)
+#else
+#define C10_ZOOM_EXPORT
+#define C10_ZOOM_IMPORT
+#endif
+#else // _WIN32
+#if defined(__GNUC__)
+#define C10_ZOOM_EXPORT __attribute__((__visibility__("default")))
+#else // defined(__GNUC__)
+#define C10_ZOOM_EXPORT
+#endif // defined(__GNUC__)
+#define C10_ZOOM_IMPORT C10_ZOOM_EXPORT
+#endif // _WIN32
+
+// This one is being used by libc10_zoom.so
+#ifdef C10_ZOOM_BUILD_MAIN_LIB
+#define C10_ZOOM_API C10_ZOOM_EXPORT
+#else
+#define C10_ZOOM_API C10_ZOOM_IMPORT
+#endif
+
+/**
+ * The maximum number of GPUs that we recognize. Increasing this beyond the
+ * initial limit of 16 broke Caffe2 testing, hence the ifdef guards.
+ * This value cannot be more than 128 because our DeviceIndex is a uint8_t.
+ */
+#ifdef FBCODE_CAFFE2
+// fbcode depends on this value being 16
+#define C10_COMPILE_TIME_MAX_GPUS 16
+#else
+#define C10_COMPILE_TIME_MAX_GPUS 120
+#endif
\ No newline at end of file
diff --git a/c10/zoom/ZoomMallocAsyncAllocator.cpp b/c10/zoom/ZoomMallocAsyncAllocator.cpp
new file mode 100644
index 00000000000000..938be05125e8f5
--- /dev/null
+++ b/c10/zoom/ZoomMallocAsyncAllocator.cpp
@@ -0,0 +1,899 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+namespace c10::zoom::ZoomCachingAllocator::ZoomMallocAsync {
+
+// CUDA device allocator that uses hipMallocAsync to implement
+// the same interface as ZoomCachingAllocator.cpp.
+
+// Designed to be safe for CUDA graph capture.
+// Interactions with CUDA graph capture are mediated by
+// notifyCaptureBegin
+// notifyCaptureAboutToEnd
+// notifyCaptureEnded
+// notifyCaptureDestroy
+
+// Implementation details, not declared in ZoomCachingAllocator.h
+namespace {
+
+// General helpers
+
+struct UsageStream {
+  hipStream_t stream;
+  c10::DeviceIndex device;
+  UsageStream() = default;
+  UsageStream(hipStream_t s, c10::DeviceIndex d) : stream(s), device(d) {}
+  UsageStream(const UsageStream& us) = default;
+  UsageStream(const UsageStream&& us) noexcept
+      : stream(us.stream), device(us.device) {}
+  UsageStream& operator=(UsageStream other) {
+    stream = other.stream;
+    device = other.device;
+    return *this;
+  }
+};
+
+bool operator==(const UsageStream& lhs, const UsageStream& rhs) {
+  return (lhs.stream == rhs.stream) && (lhs.device == rhs.device);
+}
+
+struct UsageStreamHash {
+  size_t operator()(const UsageStream& us) const noexcept {
+    return std::hash{}(us.stream) + size_t(us.device);
+  }
+};
+
+struct PtrUsage {
+  // recorded_streams holds side usage streams added by record_stream calls.
+  // In other words, it does NOT include the original creation stream.
+  ska::flat_hash_set recorded_streams;
+  UsageStream creation_stream{};
+  uint64_t size;
+  bool captured;
+  PtrUsage(uint64_t s, bool c) : size(s), captured(c) {}
+};
+
+int device_count = 0;
+// these don't need to be c10::once_flags as in CUDAGeneratorImpl.cpp
+// because they'll only be flipped by functions that have locked the mutex.
+std::vector devs_initialized_flags;
+std::vector dummy_unifying_free_streams;
+
+// Possible micro-optimization:
+// Some accesses to ptr_info are read-only.
+// We could let those be concurrent with a shared_mutex and
+// have concurrent calls take a shared_lock.
+// Keeping it simple with an ordinary mutex for now.
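The closing comment above mentions a possible reader/writer-lock micro-optimization for read-only ptr_info lookups. A minimal sketch of that alternative, for illustration only; the patch itself keeps a single std::mutex, and `sizes` below is just a stand-in for ptr_info:

#include <cstddef>
#include <shared_mutex>
#include <unordered_map>

std::shared_mutex info_mutex;             // would replace general_mutex
std::unordered_map<void*, size_t> sizes;  // stand-in for ptr_info

size_t lookup_size(void* p) {             // read-only path: shared lock
  std::shared_lock<std::shared_mutex> lk(info_mutex);
  auto it = sizes.find(p);
  return it == sizes.end() ? 0 : it->second;
}

void record_size(void* p, size_t s) {     // mutating path: exclusive lock
  std::unique_lock<std::shared_mutex> lk(info_mutex);
  sizes[p] = s;
}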
+std::mutex general_mutex; + +/** + * Note [Avoid freeing uncaptured ptrs during CUDA graph capture] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * During CUDA graph capture, it's illegal to call hipFreeAsync + * on a pointer that came from a non-captured hipMallocAsync. + * Unfortunately, Python being what it is, it's impossible to be + * sure no uncaptured tensor will ever have its destructor called + * in a capturing region. + * We avoid errors by + * 1. remembering if allocated pointers were captured or uncaptured + * 2. during capture, if we detect an attempt to free an uncaptured + * allocation on a capturing stream, don't free it immediately, + * just remember it and defer its hipFreeAsync call to after + * the end of capture (specifically, to notifyCaptureEnded). + */ + +using PtrInfo = ska::flat_hash_map; +PtrInfo ptr_info; +std::vector ungraphed_ptrs_defer_free_until_no_capture; + +// These two help setMemoryFraction limit the amount of memory +// used by PyTorch in particular (as opposed to other libraries +// in the same process that might be sharing the same hipMemPool_t). +std::vector pytorch_used_bytes; +std::vector pytorch_memory_limits; + +// Graph-specific helpers + +/** + * Note [Avoid dangling free streams during CUDA graph capture] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * During capture, all stream dependencies must branch out from + * the stream on which capture began and rejoin this initial stream + * before capture ends. + * The user rigs desired forking and joining with event waits. + * But it's hard to be sure when tensor destructors get called relative + * to the final joins. + * For example, suppose a user + * forks work stream B from initial capture stream A + * creates a tensor T in B + * joins by syncing A with B + * ends capture. + * All well and good, right? Maybe not: maybe T went out of scope + * and its destructor got called AFTER the rejoin, leaving the graph with + * "unjoined work": a dangling hipFreeAsync node in stream B. + * Ensuring that all tensor destructors for all side stream tensors + * are called before side streams rejoin the main stream is + * difficult. The user might have to add a bunch of explicit + * "del"s at the right spots in code that was fine for ordinary + * eager execution. + * Fortunately, we can spare the user this burden: + * during capture, we remember _all_ free streams, + * and manually rejoin them with the capture stream during + * notifyCaptureAboutToEnd. + * This approach is heavy-handed, but hopefully capture only needs to + * happen once, so we don't mind being heavy-handed. + * + * TODO: If, someday, we augment the graph bindings to support recapture + * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#whole-graph-update + * (eg, as a way to accommodate dynamic params) we should think more + * carefully about the CPU overhead of remembering and rejoining + * all free streams during capture. Maybe it's not a big deal. 
+ */ +std::unordered_set capture_free_streams; +bool capture_underway = false; + +// Implementation functions + +// Assumes the caller holds general_mutex +inline void lazy_init_device(c10::DeviceIndex device) { + if (!devs_initialized_flags[device]) { + ZoomGuard g(device); + + // See "Retaining memory in the pool" here: + // https://developer.nvidia.com/blog/using-cuda-stream-ordered-memory-allocator-part-1/ + hipMemPool_t mempool = nullptr; + C10_ZOOM_CHECK(hipDeviceGetDefaultMemPool(&mempool, device)); + uint64_t threshold = UINT64_MAX; + C10_ZOOM_CHECK(hipMemPoolSetAttribute( + mempool, hipMemPoolAttrReleaseThreshold, &threshold)); + + // I think all these are on by default, but I want to enable them + // explicitly to ensure awareness. + int enable = 1; + C10_ZOOM_CHECK(hipMemPoolSetAttribute( + mempool, hipMemPoolReuseFollowEventDependencies, &enable)); + C10_ZOOM_CHECK(hipMemPoolSetAttribute( + mempool, hipMemPoolReuseAllowOpportunistic, &enable)); + C10_ZOOM_CHECK(hipMemPoolSetAttribute( + mempool, hipMemPoolReuseAllowInternalDependencies, &enable)); + + // Grabs a stream from the current device to use as the "unifier" free + // stream for allocations that end up used on multiple streams. + const auto dufs = getStreamFromPool(); + dummy_unifying_free_streams[device] = + UsageStream(dufs.stream(), dufs.device_index()); + + pytorch_used_bytes[device] = 0; + pytorch_memory_limits[device] = UINT64_MAX; + + devs_initialized_flags[device] = true; + } +} + +inline void sync_raw(hipStream_t dependency, hipStream_t dependent) { + // ZoomCachingAllocator.cpp uses raw hip events, as do we. + hipEvent_t event = nullptr; + C10_ZOOM_CHECK(hipEventCreateWithFlags(&event, hipEventDisableTiming)); + C10_ZOOM_CHECK(hipEventRecord(event, dependency)); + C10_ZOOM_CHECK(hipStreamWaitEvent(dependent, event, 0)); + C10_ZOOM_CHECK(hipEventDestroy(event)); +} + +// Assumes the caller holds general_mutex +inline void free_impl(PtrInfo::iterator& it) { + // Possible micro-optimization: If we did a value-copy here, we could move + // ptr_info.erase(it) up here and drop the lock immediately. + const auto& recorded_streams = it->second.recorded_streams; + const auto& creation_stream = it->second.creation_stream; + + // If the usage stream is a null (default) stream, + // hipFreeAsync infers the device from the ambient context, + // so we need to set the right ambient context. + ZoomGuard g(creation_stream.device); + + if (recorded_streams.empty()) { + // ptr was only used on one stream, which must have been + // the original allocation stream. + // Frees ptr in the original allocation stream. + + C10_ZOOM_CHECK(hipFreeAsync(it->first, creation_stream.stream)); + + if (C10_UNLIKELY(capture_underway)) { + // See Note [Avoid dangling free streams during CUDA graph capture] + capture_free_streams.insert(creation_stream); + } + } else { + // ptr was used on many streams. We don't know which was the most recent. + // There could even have been multiple most recent usage streams acting + // on different regions of the memory. + // But hipFreeAsync only accepts a single most recent usage stream. + // We can still safely free ptr with a trick: + // Use a dummy "unifying stream", sync the unifying stream with all of + // ptr's usage streams, and pass the dummy stream to hipFreeAsync. + + // Retrieves the dummy "unifier" stream from the device + // on which the pointer was originally allocated. 
+ auto dummy_unifying_free_stream = + dummy_unifying_free_streams[creation_stream.device]; + TORCH_INTERNAL_ASSERT( + dummy_unifying_free_stream.device == creation_stream.device); + + // we're already on creation_stream.device, no need to re-guard + sync_raw(creation_stream.stream, dummy_unifying_free_stream.stream); + + // The number of usage streams is typically small (low single digits) + for (const auto& recorded_stream : recorded_streams) { + // Logic here accommodates the chance some of the usage streams were on + // other devices, which is possible if some usage kernels accessed the + // memory via p2p. + + // hipEventRecord requires that the input event and stream are on the + // same device. + ZoomGuard g_usage(recorded_stream.device); + + sync_raw(recorded_stream.stream, dummy_unifying_free_stream.stream); + } + + // Frees ptr in the dummy "unifier" stream. + C10_ZOOM_CHECK(hipFreeAsync(it->first, dummy_unifying_free_stream.stream)); + // At this point, unless dummy_unifying_free_stream happens to alias some + // future user stream, the allocation is only available for "opportunistic" + // reuse, ie, if the CPU sees dummy_unifying_free_stream has reached the + // point that all events recorded on all usage streams have resolved from + // the CPU's perspective. In theory, we could remove the need for the driver + // to do this tracking by e.g. replacing + // hipStreamWaitEvent(dummy_unifying_free_stream.stream, event); + // with + // hipStreamWaitEvent(creation_stream.stream, event); + // then hipFreeAsyncing straight back into creation_stream.stream, + // but this forces a potentially false dependency of creation_stream.stream + // on all the recorded_streams. + + if (C10_UNLIKELY(capture_underway)) { + // See Note [Avoid dangling free streams during CUDA graph capture] + capture_free_streams.emplace( + dummy_unifying_free_stream.stream, dummy_unifying_free_stream.device); + } + } + + pytorch_used_bytes[creation_stream.device] -= it->second.size; + + ptr_info.erase(it); +} + +void freeAsync(void* ptr) { + std::lock_guard lk(general_mutex); + + auto err = hipGetLastError(); + C10_ZOOM_CHECK(err); + auto it = ptr_info.find(ptr); + TORCH_INTERNAL_ASSERT(it != ptr_info.end(), "ptr not found in ptr_info"); + + if (C10_UNLIKELY(capture_underway)) { + if (!it->second.captured) { + TORCH_WARN_ONCE( + "freeAsync() was called on an uncaptured allocation during graph capture " + "(address = ", + ptr, + "). This may be benign, for example, a Python tensor in the capture " + "might happen to shadow (use the same name as) an unrelated temporary " + "tensor from somewhere before capture, pushing the earlier tensor " + "out of scope. " + "However, if the tensor we're freeing here IS used by the capture, " + "freeing it is an error, and may cause illegal memory accesses or " + "memory corruption during graph replay."); + // See Note [Avoid freeing uncaptured ptrs during CUDA graph capture] + // Remembers the raw pointer, not the iterator. + // This forces notifyCaptureEnded to do another lookup, + // but avoids the risk the iterator might be invalidated + // between now and then. 
+ ungraphed_ptrs_defer_free_until_no_capture.push_back(ptr); + return; + } + } else if (C10_UNLIKELY(it->second.captured)) { + TORCH_WARN( + "Attempting uncaptured free of a captured allocation with address ", + ptr, + "\nThis is technically allowed, but may indicate you are losing " + "the last user-visible tensor through which the allocation can " + "be accessed, so you'll have no way to view the data after " + "future replays of the owning graph."); + } + + free_impl(it); +} + +// Symmetric with NativeCachingAllocator::malloc for now, +// although I don't think we absolutely need the symmetry. +void mallocAsync( + void** devPtr, + c10::DeviceIndex device, + size_t size, + hipStream_t stream) { + TORCH_INTERNAL_ASSERT( + 0 <= device && device < device_count, + "Invalid device index ", + device, + ": did you call init?"); + + // If stream is a null (default) stream, + // hipMallocAsync infers the device from the ambient context, + // so we need to set the right ambient context. + ZoomGuard g(device); + + std::lock_guard lk(general_mutex); + + if (!capture_underway && + !ungraphed_ptrs_defer_free_until_no_capture.empty()) { + // See Note [Avoid freeing uncaptured ptrs during CUDA graph capture] + for (const auto ptr : ungraphed_ptrs_defer_free_until_no_capture) { + auto it = ptr_info.find(ptr); + TORCH_INTERNAL_ASSERT(it != ptr_info.end(), "ptr not found in ptr_info"); + free_impl(it); + } + + ungraphed_ptrs_defer_free_until_no_capture.clear(); + } + + lazy_init_device(device); + + // Defensively checks for preexisting CUDA error state. + auto err = hipGetLastError(); + C10_ZOOM_CHECK(err); + + // TODO: Could we avoid calling hipMallocAsync while holding general_mutex, + // perhaps by letting lazy_init_device use separate once_flags or an internal + // static initializer? + if (pytorch_used_bytes[device] + size > pytorch_memory_limits[device]) { + err = hipErrorMemoryAllocation; + } else { + err = hipMallocAsync(devPtr, size, stream); + } + + if (err == hipErrorMemoryAllocation) { + // Clears CUDA's internal error state so the user, if desired, can catch the + // OOM exception, free some stuff on the script side, and retry the + // allocation. This aligns with the behavior of alloc_block in + // ZoomCachingAllocator.cpp. + (void)hipGetLastError(); // clear CUDA error + size_t device_free = 0; + size_t device_total = 0; + C10_ZOOM_CHECK(hipMemGetInfo(&device_free, &device_total)); + TORCH_CHECK_WITH( + OutOfMemoryError, + false, + "Allocation on device ", + device, + " would exceed allowed memory. (out of memory)", + "\nCurrently allocated : ", + format_size(pytorch_used_bytes[device]), + "\nRequested : ", + format_size(size), + "\nDevice limit : ", + format_size(device_total), + "\nFree (according to CUDA): ", + format_size(device_free), + "\nPyTorch limit (set by user-supplied memory fraction)" + "\n : ", + format_size(pytorch_memory_limits[device])); + } else { + C10_ZOOM_CHECK(err); + } + + auto inserted = ptr_info.emplace(*devPtr, PtrUsage(size, capture_underway)); + TORCH_INTERNAL_ASSERT( + inserted.second, + "address returned by hipMallocAsync already exists " + "in ptr_info"); + + inserted.first->second.creation_stream = {stream, device}; + + pytorch_used_bytes[device] += size; +} + +} // anonymous namespace + +void local_raw_delete(void* ptr); + +// Same pattern as ZoomCachingAllocator.cpp. 
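Before the allocator struct below, a minimal standalone sketch of the HIP stream-ordered allocation pattern that mallocAsync/freeAsync above build on. It assumes a ROCm version that provides hipMallocAsync/hipFreeAsync and reduces error handling to asserts for brevity:

#include <hip/hip_runtime.h>
#include <cassert>

void stream_ordered_roundtrip() {
  hipStream_t stream = nullptr;
  assert(hipStreamCreate(&stream) == hipSuccess);

  void* p = nullptr;
  // The allocation becomes usable once prior work on `stream` completes.
  assert(hipMallocAsync(&p, 1 << 20, stream) == hipSuccess);
  // ... enqueue kernels that use `p` on `stream` here ...
  // The free is also ordered on `stream`; the memory returns to the pool
  // once the preceding work on `stream` has finished.
  assert(hipFreeAsync(p, stream) == hipSuccess);

  assert(hipStreamSynchronize(stream) == hipSuccess);
  assert(hipStreamDestroy(stream) == hipSuccess);
}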
+struct ZoomMallocAsyncAllocator : public ZoomAllocator { + DataPtr allocate(size_t size) override { + constexpr size_t one_exa_bytes = 1152921504606846976ULL; + TORCH_CHECK_WITH( + OutOfMemoryError, + size < one_exa_bytes, + "HIP out of memory. Tried to allocate more than 1EB memory."); + c10::DeviceIndex device = 0; + C10_ZOOM_CHECK(c10::zoom::GetDevice(&device)); + void* r = nullptr; + if (size != 0) { + mallocAsync(&r, device, size, zoom::getCurrentZoomStream(device)); + } + return {r, r, &local_raw_delete, Device(DeviceType::PrivateUse1, device)}; + } + DeleterFnPtr raw_deleter() const override { + return &local_raw_delete; + } + + // This function should not issue any context-creating calls, + // just set up for later calls to init per-device pools based + // on the current device each later call sees. + void init(int dev_count) override { + static bool called = [](int dev_count) { + ; + // Are there external guarantees init will be called before + // any of the allocator's other functions? + // std::lock_guard lk(general_mutex); + device_count = dev_count; + devs_initialized_flags.resize(dev_count, false); + dummy_unifying_free_streams.resize(dev_count); + pytorch_used_bytes.resize(dev_count); + pytorch_memory_limits.resize(dev_count); + return true; + }(dev_count); + (void)called; + } + + bool initialized() override { + return !devs_initialized_flags.empty(); + } + + static inline void assertValidDevice(c10::DeviceIndex device) { + TORCH_CHECK( + 0 <= device && device < device_count, "Invalid device argument."); + } + + void setMemoryFraction(double fraction, c10::DeviceIndex device) override { + TORCH_INTERNAL_ASSERT( + 0 <= fraction && fraction <= 1, + "invalid fraction:", + fraction, + ". Please set within (0, 1)."); + + std::lock_guard lk(general_mutex); + assertValidDevice(device); + ZoomGuard g(device); + // Should setMemoryFraction be allowed to trigger a full device context and + // pool-creating lazy_init_device, or should we simply assert this device is + // already initialized, ie + // TORCH_CHECK(devs_initialized_flags[device], ...)? + lazy_init_device(device); + + size_t device_free = 0; + size_t device_total = 0; + C10_ZOOM_CHECK(hipMemGetInfo(&device_free, &device_total)); + pytorch_memory_limits[device] = + static_cast(fraction * static_cast(device_total)); + + // Alternative: Instead of a manual hard limit, we could use + // hipMemPoolSetAttribute(mempool, hipMemPoolAttrReleaseThreshold, + // &threshold); This is a soft hint: The driver allows the pool's reserved + // memory to spike above threshold in regions of high hipMallocAsync + // demand, but opportunistically trims reserved memory back to threshold + // when the memory in use is < threshold. I don't like this because it + // introduces performance nondeterminism. + } + + void emptyCache() override { + std::lock_guard lk(general_mutex); + + for (int dev = 0; dev < device_count; dev++) { + if (devs_initialized_flags[dev]) { + ZoomGuard g(static_cast(dev)); + + hipMemPool_t mempool = nullptr; + hipDeviceGetDefaultMemPool(&mempool, dev); + hipDeviceSynchronize(); + hipMemPoolTrimTo(mempool, 0); + } + } + } + + void cacheInfo(c10::DeviceIndex device, size_t* maxWorkspaceGuess) override { + // The only consumer of cacheInfo is getMaxWorkspaceSize in Conv_v7.cpp. + // Afaict, the role of cacheInfo is to give getMaxWorkspaceSize a reasonable + // maximum workspace size to use for an upcoming cudnnFind call. 
+ // + // The native allocator's cacheInfo chooses to return the size of its + // largest unused block (which is the largest allocation the native + // allocator can service immediately and asynchronously without a + // hipMalloc. + // + // Here, we use a different heuristic: figure out the max usable workspace + // size with a bit of educated trial and error. It's ok to be + // perf-inefficient because cacheInfo is a prelude to cudnnFind. + // + // The algo cache then stores the best-performing algo with workspace <= + // maxWorkspaceGuess. Later calls with the same param set hit in cache and + // try to allocate the same workspace. If, in one of those future calls, + // workspace allocation fails (ie because less ambient memory is available), + // the bindings rerun cudnnFind, including calling cacheInfo again + // beforehand to estimate a new (smaller) largest-available workspace. Over + // a few such calls, the cache should settle to the algo with a workspace + // size that's small enough to succeed every time (for that param set). + // + // So the strategy here is to return a rough, largeish guess and let the + // bindings retry to trim as needed over time. + // + // The only caveat is, even if a workspace is allocated without OOM errors + // now and in future calls, it's hard to be sure those later error-free + // hipMallocAsyncs are fast and come straight from the pool (ie, + // hipMallocAsync didn't need to reserve more memory from the system). + // Hopefully, after repeated workspace requests, the pool's reserved memory + // also stabilizes to a point where they all come straight from the pool. + std::lock_guard lk(general_mutex); + assertValidDevice(device); + ZoomGuard g(device); + lazy_init_device(device); + + size_t free_upper_bound = 0; + size_t device_total = 0; + C10_ZOOM_CHECK(hipMemGetInfo(&free_upper_bound, &device_total)); + TORCH_INTERNAL_ASSERT( + free_upper_bound + pytorch_used_bytes[device] <= device_total); + size_t guess = std::min( + free_upper_bound, + pytorch_memory_limits[device] - pytorch_used_bytes[device]); + auto stream = c10::zoom::getCurrentZoomStream(); + void* dummy = nullptr; + + // Defensively checks for preexisting CUDA error state. + auto err = hipGetLastError(); + C10_ZOOM_CHECK(err); + + while (true) { + // Duplicates some logic from mallocAsync to work with the error state + // directly instead of repeatedly catching an exception thrown by + // mallocAsync. + if (pytorch_used_bytes[device] + guess > pytorch_memory_limits[device]) { + err = hipErrorMemoryAllocation; + } else { + err = hipMallocAsync(&dummy, guess, stream); + } + + if (err == hipSuccess) { + hipFreeAsync(dummy, stream); + *maxWorkspaceGuess = guess; + return; + } else if (err == hipErrorMemoryAllocation) { + (void)hipGetLastError(); // clear CUDA error + guess >>= 1; // quick and dirty: try half the size next iteration + } else { + C10_ZOOM_CHECK(err); + } + } + } + + void* getBaseAllocation(void* ptr, size_t* size) override { + std::lock_guard lk(general_mutex); + + auto it = ptr_info.find(ptr); + TORCH_INTERNAL_ASSERT(it != ptr_info.end(), "ptr not found in ptr_info"); + + if (size) { + *size = it->second.size; + } + + return ptr; + } + + void recordStream(const DataPtr& ptr, zoom::ZoomStream stream) override { + std::lock_guard lk(general_mutex); + auto ptr_val = ptr.get(); + // Empty tensor's storage().data() might be a null ptr. As there is no + // blocks associated with those tensors, it is fine to do nothing here. 
+ if (!ptr_val) { + return; + } + + // The pointer should exist in the map already. + auto it = ptr_info.find(ptr_val); + TORCH_INTERNAL_ASSERT(it != ptr_info.end(), "ptr not found in ptr_info"); + + UsageStream to_record{stream.stream(), stream.device_index()}; + if (to_record == it->second.creation_stream) { + TORCH_WARN_ONCE( + "Called record_stream on tensor whose original creation stream " + "matches the recorded stream. This is unnecessary and has no effect."); + } else { + it->second.recorded_streams.insert(to_record); + } + } + + std::shared_ptr getIpcDevPtr(std::string handle) override { + TORCH_CHECK( + false, + "hipMallocAsync does not yet support getIpcDevPtr. " + "If you need it, please file an issue describing your use case."); + } + + void recordHistory( + bool enabled, + CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + RecordContext when) override { + TORCH_CHECK( + false, + "hipMallocAsync does not yet support recordHistory. " + "If you need it, please file an issue describing your use case."); + } + + void attachOutOfMemoryObserver(OutOfMemoryObserver observer) override { + TORCH_CHECK( + false, + "hipMallocAsync does not yet support attachOutOfMemoryObserver. " + "If you need it, please file an issue describing your use case."); + } + + void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) override { + TORCH_CHECK( + false, + "hipMallocAsync does not yet support attachAllocatorTraceTracker. " + "If you need it, please file an issue describing your use case."); + } + + std::shared_ptr getCheckpointState( + c10::DeviceIndex device, + MempoolId_t id) override { + TORCH_CHECK( + false, + "hipMallocAsync does not yet support getCheckpointState. " + "If you need it, please file an issue describing your use case."); + } + + CheckpointDelta setCheckpointPoolState( + c10::DeviceIndex device, + std::shared_ptr pps) override { + TORCH_CHECK( + false, + "hipMallocAsync does not yet support setCheckpointPoolState. " + "If you need it, please file an issue describing your use case."); + } + + // Collects stats for device. + // If device hasn't been used yet, returns 0s without creating a context. + DeviceStats getDeviceStats(c10::DeviceIndex device) override { + assertValidDevice(device); + + // Memory currently reserved by the mempool + uint64_t reserved_mem_current = 0; + // High-water mark of memory reserved by the mempool since last reset + uint64_t reserved_mem_peak = 0; + // Memory currently in use by the mempool + uint64_t used_mem_current = 0; + // High-water mark of memory + uint64_t used_mem_peak = 0; + + std::lock_guard lk(general_mutex); + + if (devs_initialized_flags[device]) { + ZoomGuard g(device); + + hipMemPool_t mempool = nullptr; + C10_ZOOM_CHECK(hipDeviceGetDefaultMemPool(&mempool, device)); + C10_ZOOM_CHECK(hipMemPoolGetAttribute( + mempool, hipMemPoolAttrReservedMemCurrent, &reserved_mem_current)); + + C10_ZOOM_CHECK(hipMemPoolGetAttribute( + mempool, hipMemPoolAttrReservedMemHigh, &reserved_mem_peak)); + + C10_ZOOM_CHECK(hipMemPoolGetAttribute( + mempool, hipMemPoolAttrUsedMemCurrent, &used_mem_current)); + + C10_ZOOM_CHECK(hipMemPoolGetAttribute( + mempool, hipMemPoolAttrUsedMemHigh, &used_mem_peak)); + } + + // Many stat types are specific to the native allocator. We leave these + // untouched. Their "struct Stat"s will contain zeroed values. + DeviceStats stats; + + // In the native allocator: + // allocated_bytes is the total bytes of blocks that have been malloc()ed + // and not yet free()d. 
+ // active_bytes is the total bytes of blocks that have been malloc()ed but + // not yet released back into a free pool. In other words, it includes all + // allocated_bytes, as well as the bytes of "limbo state" blocks had have + // already been free()ed but not yet free_block()ed back into a pool due to + // outstanding stream_uses. + // + // Here, in the hipMallocAsync allocator: + // We simply ask the driver's opinion about active memory. + // We don't bother distinguishing between allocated_bytes and active_bytes. + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current = + static_cast(used_mem_current); + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].peak = + static_cast(used_mem_peak); + stats.active_bytes[static_cast(StatType::AGGREGATE)].current = + static_cast(used_mem_current); + stats.active_bytes[static_cast(StatType::AGGREGATE)].peak = + static_cast(used_mem_peak); + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current = + static_cast(reserved_mem_current); + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].peak = + static_cast(reserved_mem_peak); + + return stats; + } + + void resetAccumulatedStats(c10::DeviceIndex device) override { + assertValidDevice(device); + TORCH_WARN_ONCE( + "For backend:hipMallocAsync, resetAccumulatedStats has no effect."); + } + + void resetPeakStats(c10::DeviceIndex device) override { + assertValidDevice(device); + + ZoomGuard g(device); + hipMemPool_t mempool = nullptr; + C10_ZOOM_CHECK(hipDeviceGetDefaultMemPool(&mempool, device)); + // Using zero as the reset value is the method recommended by Cuda driver + // team. Vivek Kini says: + // "Resetting to zero (which is the only valid value when setting + // ReservedMemHigh) resets it to ReservedMemCurrent inside the driver + // (same goes for UsedMemHigh/UsedMemCurrent)" + uint64_t zero = 0; + C10_ZOOM_CHECK(hipMemPoolSetAttribute( + mempool, hipMemPoolAttrReservedMemHigh, &zero)); + C10_ZOOM_CHECK( + hipMemPoolSetAttribute(mempool, hipMemPoolAttrUsedMemHigh, &zero)); + } + + SnapshotInfo snapshot() override { + TORCH_CHECK( + false, + "Calling snapshot with backend:hipMallocAsync is not meaningful. " + "(For backend:native, snapshot returns a detailed summary of all " + "blocks tracked by the allocator, but the hipMallocAsync backend " + "does not track individual blocks.)"); + // Alternative: TORCH_WARN + return {}; + } + + // CUDAGraph interactions + void beginAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + std::function) override { + std::lock_guard lk(general_mutex); + + TORCH_INTERNAL_ASSERT(capture_free_streams.empty()); + TORCH_CHECK( + !capture_underway, + "Only one capture at a time is allowed in a process.") + capture_underway = true; + } + + void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id) + override { + assertValidDevice(device); + + std::lock_guard lk(general_mutex); + + TORCH_CHECK( + capture_underway, + "hipMallocAsync::notifyCaptureAboutToEnd called, " + "but hipMallocAsync::capture_underway is false."); + + auto capture_stream = zoom::getCurrentZoomStream(device); + + // See Note [Avoid dangling free streams during CUDA graph capture] + for (const auto& free_stream : capture_free_streams) { + // hipEventRecord requires that the input event and stream are on the + // same device. + ZoomGuard g(free_stream.device); + + // ZoomCachingAllocator.cpp uses raw hip events, as do we. 
+ hipEvent_t event = nullptr; + C10_ZOOM_CHECK(hipEventCreateWithFlags(&event, hipEventDisableTiming)); + C10_ZOOM_CHECK(hipEventRecord(event, free_stream.stream)); + C10_ZOOM_CHECK(hipStreamWaitEvent(capture_stream.stream(), event, 0)); + C10_ZOOM_CHECK(hipEventDestroy(event)); + } + + capture_free_streams.clear(); + TORCH_CHECK( + capture_underway, + "hipMallocAsync::notifyCaptureEnded called, " + "but hipMallocAsync::capture_underway is false."); + capture_underway = false; + } + + void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) override { + // Q: Do we need to do anything special here, like clear long-lived + // pointers created during the original capture (for example, + // tensors intended as the graph's I/O surface) that might still + // be resident in ptr_info? + // A: I don't think so. + // Those allocations survived capture because the user held + // explicit tensor references to them, + // Those tensors' destructors will call freeAsync() on each pointer + // when the user is done with them. + // The freeAsync()s will probably incur + // TORCH_WARN("Attempting uncaptured free of a captured allocation..." + // but stale ptrs will not permanently leak into ptr_info. + } + + void* raw_alloc(size_t nbytes) override { + if (nbytes == 0) { + return nullptr; + } + c10::DeviceIndex device = 0; + C10_ZOOM_CHECK(c10::zoom::GetDevice(&device)); + void* r = nullptr; + mallocAsync(&r, device, nbytes, zoom::getCurrentZoomStream(device)); + return r; + } + + void* raw_alloc_with_stream(size_t nbytes, hipStream_t stream) override { + if (nbytes == 0) { + return nullptr; + } + c10::DeviceIndex device = 0; + C10_ZOOM_CHECK(c10::zoom::GetDevice(&device)); + void* r = nullptr; + mallocAsync(&r, device, nbytes, stream); + return r; + } + void raw_delete(void* ptr) override { + freeAsync(ptr); + } + void enablePeerAccess(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) + override { + // Double-checks allocator backend hasn't changed, which would definitely be + // an error. hipMallocAsync pools are unaffected by + // hipDeviceEnablePeerAccess. We need pool-specific enablement. 
See + // https://developer.nvidia.com/blog/using-cuda-stream-ordered-memory-allocator-part-2/ + c10::zoom::ZoomGuard device_guard(dev); + hipMemPool_t mempool = nullptr; + C10_ZOOM_CHECK(hipDeviceGetDefaultMemPool(&mempool, dev_to_access)); + hipMemAccessDesc desc = {}; + desc.location.type = hipMemLocationTypeDevice; + // NOLINTNEXTLINE(bugprone-signed-char-misuse) + desc.location.id = dev; + desc.flags = hipMemAccessFlagsProtReadWrite; + C10_ZOOM_CHECK(hipMemPoolSetAccess(mempool, &desc, 1 /* numDescs */)); + } + hipError_t memcpyAsync( + void* dst, + int dstDevice, + const void* src, + int srcDevice, + size_t count, + hipStream_t stream, + bool p2p_enabled) override { + if (p2p_enabled || dstDevice == srcDevice) { + return hipMemcpyAsync(dst, src, count, hipMemcpyDeviceToDevice, stream); + } else { + return hipMemcpyPeerAsync(dst, dstDevice, src, srcDevice, count, stream); + } + } + std::string name() override { + return "hipMallocAsync"; + } + void copy_data(void* dest, const void* src, std::size_t count) const final { + C10_ZOOM_CHECK( + hipMemcpy(dest, src, count, hipMemcpyKind::hipMemcpyDeviceToDevice)); + } +}; + +ZoomMallocAsyncAllocator device_allocator; + +void local_raw_delete(void* ptr) { + freeAsync(ptr); +} +ZoomAllocator* allocator() { + return &device_allocator; +} + + +} // namespace c10::zoom::ZoomCachingAllocator::ZoomMallocAsync \ No newline at end of file diff --git a/c10/zoom/ZoomMiscFunctions.cpp b/c10/zoom/ZoomMiscFunctions.cpp new file mode 100644 index 00000000000000..cba225e314a0f7 --- /dev/null +++ b/c10/zoom/ZoomMiscFunctions.cpp @@ -0,0 +1,23 @@ +#include +#include + +namespace c10::zoom { + +const char* get_hip_check_suffix() noexcept { + static char* device_blocking_flag = getenv("HIP_LAUNCH_BLOCKING"); + static bool blocking_enabled = + (device_blocking_flag && atoi(device_blocking_flag)); + if (blocking_enabled) { + return ""; + } else { + return "\nHIP kernel errors might be asynchronously reported at some" + " other API call, so the stacktrace below might be incorrect." 
+ "\nFor debugging consider passing HIP_LAUNCH_BLOCKING=1"; + } +} +std::mutex* getFreeMutex() { + static std::mutex hip_free_mutex; + return &hip_free_mutex; +} + +} // namespace c10::zoom \ No newline at end of file diff --git a/c10/zoom/ZoomMiscFunctions.h b/c10/zoom/ZoomMiscFunctions.h new file mode 100644 index 00000000000000..8031194734d5e1 --- /dev/null +++ b/c10/zoom/ZoomMiscFunctions.h @@ -0,0 +1,8 @@ +#pragma once + +#include + +namespace c10::zoom { +const char* get_hip_check_suffix() noexcept; +std::mutex* getFreeMutex(); +} // namespace c10::zoom \ No newline at end of file diff --git a/c10/zoom/ZoomStream.cpp b/c10/zoom/ZoomStream.cpp new file mode 100644 index 00000000000000..4dac263d78db45 --- /dev/null +++ b/c10/zoom/ZoomStream.cpp @@ -0,0 +1,375 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define C10_ZOOM_COMPILE_TIME_MAX_GPUS 16 + +namespace c10::zoom { + +namespace { + +// Global stream state and constants +static c10::once_flag init_flag; +static DeviceIndex num_gpus = -1; +static constexpr int kStreamsPerPoolBits = 5; +static constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits; +static constexpr unsigned int kDefaultFlags = hipStreamNonBlocking; +static constexpr int kStreamTypeBits = 4; + +static int max_stream_priorities; + +// Non-default streams +// Note: the number of CUDA devices is determined at run time, +// and the low and high priority pools are lazily initialized +// when the first stream is requested for a device. +// The device flags track the initialization of each device, while +// the low and high priority counters track, for each device, the next stream +// in the pool to be returned when a stream is requested (round-robin fashion +// , see the note in ZoomStream.h). +// The streams are "leaked": they are created but never destroyed because the +// destruction of global variables could happen after the CUDA runtime has +// already been destroyed and thus invoking ZoomStreamDestroy could lead to a +// crash. It's likely an issue in CUDA, but to be safe - let's just "forget" +// the destruction. + +static std::array< + std::array, C10_ZOOM_COMPILE_TIME_MAX_GPUS>, + c10::zoom::max_compile_time_stream_priorities> + priority_counters; + +static std::array< + std::array< + std::array, + C10_ZOOM_COMPILE_TIME_MAX_GPUS>, + c10::zoom::max_compile_time_stream_priorities> + streams; + +static c10::once_flag + stream_flags[c10::zoom::max_compile_time_stream_priorities] + [C10_ZOOM_COMPILE_TIME_MAX_GPUS][kStreamsPerPool]; + + +// Note [HIP Lazy Streams] +// ~~~~~~~~~~~~~~~~~~~~~~~ +// For ROCm/HIP, each stream is lazily initialized rather than creating all +// streams when the first stream is requested. HIP streams are not as +// lightweight as CUDA streams; the pooling strategy can affect performance. +// Rather than changing the pooling implementation, ROCm/HIP will lazy init +// each stream when it is first requested. + +// Note [StreamId assignment] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~ +// How do we assign stream IDs? 
+// +// -- 54 bits -- -- 5 bits ----- -- 4 bits -- --1 bit -- +// zeros stream id index StreamIdType Ext/native stream +// ignored for ext ignored for ext +// for external stream, StreamID is a hipStream_t pointer +// this means that last bit will always be 0 +// so when constructing StreamId for a native stream we set last bit to 1 +// to distinguish between native and external streams +// +// +// We are obligated to treat the stream ID 0 as the default stream, per the +// invariant specified in c10::Stream, so this is one exception to +// "last bit = 1 for native streams". However, all other numbers are entirely +// an internal implementation detail, we reserve the right to renumber streams +// however we like. +// +// Note that it is really important that the MSB is zero; StreamId is a +// *signed* integer, and unsigned to signed conversion outside of the +// bounds of signed integer representation is undefined behavior. You +// could work around this with something like +// https://stackoverflow.com/questions/13150449/efficient-unsigned-to-signed-cast-avoiding-implementation-defined-behavior +// but it seems a bit overkill for this. +// +// Also, external managed stream pointers (hipStream_t) can be directly stored +// in the Id field so in this case, we need to check the stream alignment. + +class StreamIdType { + // StreamIdType encodes whether this stream is DEFAULT, EXTernal or + // for all other native streams, the stream priority (higher value is higher + // priority) + private: + uint8_t stream_type; + + public: + static const uint8_t DEFAULT = 0x0; + static const uint8_t EXT = 0xF; + + public: + StreamIdType(const uint8_t _stream_type) : stream_type(_stream_type) {} + + bool isExt() const { + return EXT == stream_type; + } + + bool isDefault() const { + return DEFAULT == stream_type; + } + + uint8_t getStreamType() const { + return stream_type; + } +}; + +std::ostream& operator<<(std::ostream& stream, StreamIdType s) { + if (s.isDefault()) { + stream << "DEFAULT"; + } else if (s.isExt()) { + stream << "EXT"; + } else { + stream << "PRIORITY " << int(s.getStreamType()); + } + return stream; +} + +// StreamId is 64-bit, so we can just rely on regular promotion rules. +// We rely on streamIdIndex and streamIdType being non-negative; +// see Note [Hazard when concatenating signed integers] + +static inline StreamIdType streamIdType(StreamId s) { + // Externally allocated streams have their id being the ZoomStream_ptr + // so the last bit will be 0 + if ((!(s & 1)) && s) { + return StreamIdType(StreamIdType::EXT); + } + // last bit is external/internal stream, the mask should start from second + // rightmost bit + int mask_for_type = (1 << kStreamTypeBits) - 1; + auto val = (s >> 1) & mask_for_type; + TORCH_INTERNAL_ASSERT(val || !(s & 1), "invalid StreamId", s); + return StreamIdType(val); +} + +static inline size_t streamIdIndex(StreamId s) { + return static_cast( + (s >> (kStreamTypeBits + 1)) & ((1 << kStreamsPerPoolBits) - 1)); +} + +StreamId makeStreamId(StreamIdType st, size_t si) { + if (st.isDefault()) { + return static_cast(0); + } + return (static_cast(si) << (kStreamTypeBits + 1)) | + static_cast(st.getStreamType() << 1) | 1; +} + +// Thread-local current streams +// NOLINTNEXTLINE(*-arrays) +static thread_local std::unique_ptr current_streams = nullptr; + +// Populates global values. +// Warning: this function must only be called once! 
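A self-contained worked example of the StreamId bit layout described in the note above, placed here ahead of the stream-state initialization that follows. It re-implements the packing locally with the same constants so it can be compiled on its own; the specific priority/index values are arbitrary:

#include <cassert>
#include <cstdint>

int main() {
  using StreamId = int64_t;
  constexpr int kStreamTypeBits = 4;
  constexpr int kStreamsPerPoolBits = 5;

  // Same packing as makeStreamId above: bit 0 marks a native stream,
  // bits 1..4 hold the StreamIdType, bits 5..9 hold the pool index.
  auto make = [](uint8_t type, uint64_t idx) -> StreamId {
    return (static_cast<StreamId>(idx) << (kStreamTypeBits + 1)) |
        static_cast<StreamId>(type << 1) | 1;
  };

  StreamId s = make(/*priority type=*/1, /*pool index=*/3);  // == 99
  assert(((s >> 1) & ((1 << kStreamTypeBits) - 1)) == 1);    // type decodes to 1
  assert(((s >> (kStreamTypeBits + 1)) &
          ((1 << kStreamsPerPoolBits) - 1)) == 3);           // index decodes to 3
  assert((s & 1) == 1);                                      // native-stream marker
  return 0;
}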
+static void initGlobalStreamState() {
+  num_gpus = device_count();
+  // Check if the number of GPUs matches the expected compile-time max number
+  // of GPUs.
+  TORCH_CHECK(
+      num_gpus <= C10_ZOOM_COMPILE_TIME_MAX_GPUS,
+      "Number of ROCm devices on the machine is larger than the compiled "
+      "max number of gpus expected (",
+      C10_ZOOM_COMPILE_TIME_MAX_GPUS,
+      "). Increase that and recompile.");
+  // Note [HIP stream priorities]
+  // HIP stream priorities are 1=low, 0=default, -1=high, which differs from
+  // CUDA, which is 0=default, -1=high, -2=higher, etc.
+  // Clamp leastPriority to 0 for HIP.
+  int leastPriority = 0, greatestPriority = -1;
+  C10_ZOOM_CHECK(
+      hipDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
+  // HIP reports leastPriority == 1; clamp it to 0 as described in
+  // Note [HIP stream priorities] above.
+  leastPriority = 0;
+
+  // greatestPriority is negative
+  auto range = leastPriority - greatestPriority + 1;
+  max_stream_priorities = range >= c10::zoom::max_compile_time_stream_priorities
+      ? c10::zoom::max_compile_time_stream_priorities
+      : range;
+}
+
+// Init a single HIP stream
+// See Note [HIP Lazy Streams]
+static void initSingleStream(int p, DeviceIndex device_index, int i) {
+  auto& stream = streams[p][device_index][i];
+  auto pri = -p; // lower number is higher priority
+
+  C10_ZOOM_CHECK(hipStreamCreateWithPriority(&stream, kDefaultFlags, pri));
+  const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+  if (C10_UNLIKELY(interp)) {
+    (*interp)->trace_gpu_stream_creation(
+        c10::DeviceType::PrivateUse1, reinterpret_cast<uintptr_t>(stream));
+    priority_counters[p][device_index] = 0;
+  }
+}
+
+// Creates the low and high priority stream pools for the specified device
+// Warning: only call once per device!
+static void initDeviceStreamState(DeviceIndex device_index) {
+  // Switches to the requested device so streams are properly associated
+  // with it.
+ ZoomGuard device_guard{device_index}; + for (const auto i : c10::irange(kStreamsPerPool)) { + for (const auto p : c10::irange(max_stream_priorities)) { + initSingleStream(p, device_index, i); + } + } +} + +// Init front-end to ensure initialization only occurs once +static void initZoomStreamsOnce() { + // Inits default streams (once, globally) + c10::call_once(init_flag, initGlobalStreamState); + + if (current_streams) { + return; + } + + // Inits current streams (thread local) to default streams + // NOLINTNEXTLINE(*-arrays) + current_streams = std::make_unique(num_gpus); + for (const auto i : c10::irange(num_gpus)) { + current_streams[i] = makeStreamId(StreamIdType::DEFAULT, 0); + } +} + +// Helper to verify the GPU index is valid +static inline void check_gpu(DeviceIndex device_index) { + TORCH_INTERNAL_ASSERT(device_index >= 0 && device_index < num_gpus); +} + +// Helper to determine the index of the stream to return +// Note: Streams are returned round-robin (see note in ZoomStream.h) +static uint32_t get_idx(std::atomic& counter) { + auto raw_idx = counter++; + return raw_idx % kStreamsPerPool; +} + +ZoomStream ZoomStreamForId(DeviceIndex device_index, StreamId stream_id) { + return ZoomStream( + ZoomStream::UNCHECKED, + Stream( + Stream::UNSAFE, + c10::Device(DeviceType::PrivateUse1, device_index), + stream_id)); +} + +} // anonymous namespace + +// See Note [StreamId assignment] +hipStream_t ZoomStream::stream() const { + c10::DeviceIndex device_index = stream_.device_index(); + StreamId stream_id = stream_.id(); + StreamIdType st = streamIdType(stream_id); + size_t si = streamIdIndex(stream_id); + if (st.isDefault()) { + TORCH_INTERNAL_ASSERT( + si == 0, + "Unrecognized stream ", + stream_, + " (I think this should be the default stream, but I got a non-zero index ", + si, + ").", + " Did you manufacture the StreamId yourself? Don't do that; use the", + " official API like c10::zoom::getStreamFromPool() to get a new stream."); + return nullptr; + } else if (st.isExt()) { + // NOLINTNEXTLINE(performance-no-int-to-ptr) + return reinterpret_cast(stream_id); + } else { + auto streamType = st.getStreamType(); + TORCH_INTERNAL_ASSERT( + streamType >= 1 && streamType <= max_stream_priorities, + "Unrecognized stream ", + stream_, + " (I didn't recognize the stream type, ", + st, + " with the value ", + streamType, + ")"); + + // See Note [HIP Lazy Streams] + c10::call_once( + stream_flags[st.getStreamType() - 1][device_index][si], + initSingleStream, + st.getStreamType() - 1, + device_index, + si); + + return streams[st.getStreamType() - 1][device_index][si]; + } +} + +// Returns a stream from the requested pool +// Note: when called the first time on a device, this will create the +// stream pools for that device. 
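A brief usage sketch of the pool accessors, shown here before getStreamFromPool is defined below. Editorial illustration only; it assumes this patch's c10/zoom/ZoomStream.h is available and uses device index 0 as an arbitrary example:

#include <c10/zoom/ZoomStream.h>

void run_on_side_stream() {
  // Round-robin stream from the high-priority pool of device 0.
  c10::zoom::ZoomStream s =
      c10::zoom::getStreamFromPool(/*isHighPriority=*/true, /*device=*/0);

  // Make it the current stream so subsequent work is issued on `s`.
  c10::zoom::setCurrentZoomStream(s);

  // ... launch kernels / copies here ...

  // Block the host until everything queued on `s` has finished.
  s.synchronize();
}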
+ZoomStream getStreamFromPool(const int priority, DeviceIndex device_index) { + initZoomStreamsOnce(); + if (device_index == -1) { + device_index = current_device(); + c10::zoom::SetTargetDevice(); + } + TORCH_CHECK( + priority <= 0, + "Expected hip stream priority to be less than or equal to 0, got ", + priority); + check_gpu(device_index); + + auto pri_idx = -priority; + pri_idx = + std::min(pri_idx, max_stream_priorities - 1); // pri_idx is zero-based + const auto idx = get_idx(priority_counters[pri_idx][device_index]); + StreamIdType id_type = StreamIdType(pri_idx + 1); + return ZoomStreamForId(device_index, makeStreamId(id_type, idx)); +} + +ZoomStream getStreamFromPool(const bool isHighPriority, DeviceIndex device) { + initZoomStreamsOnce(); + int priority = isHighPriority ? -max_stream_priorities + 1 : 0; + return getStreamFromPool(priority, device); +} + +ZoomStream getStreamFromExternal( + hipStream_t ext_stream, + DeviceIndex device_index) { + // The stream pointer will be the actual id + return ZoomStreamForId(device_index, reinterpret_cast(ext_stream)); +} + +ZoomStream getDefaultZoomStream(DeviceIndex device_index) { + initZoomStreamsOnce(); + if (device_index == -1) { + device_index = current_device(); + c10::zoom::SetTargetDevice(); + } + check_gpu(device_index); + return ZoomStreamForId(device_index, makeStreamId(StreamIdType::DEFAULT, 0)); +} + +ZoomStream getCurrentZoomStream(DeviceIndex device_index) { + initZoomStreamsOnce(); + if (device_index == -1) { + device_index = current_device(); + c10::zoom::SetTargetDevice(); + } + check_gpu(device_index); + return ZoomStreamForId(device_index, current_streams[device_index]); +} + +void setCurrentZoomStream(ZoomStream stream) { + initZoomStreamsOnce(); + current_streams[stream.device_index()] = stream.id(); +} + +std::ostream& operator<<(std::ostream& stream, const ZoomStream& s) { + return stream << s.unwrap(); +} + +} // namespace c10::zoom \ No newline at end of file diff --git a/c10/zoom/ZoomStream.h b/c10/zoom/ZoomStream.h new file mode 100644 index 00000000000000..04041318cf1781 --- /dev/null +++ b/c10/zoom/ZoomStream.h @@ -0,0 +1,221 @@ +#pragma once +#include +#include +#include +#include +#include + +namespace c10::zoom { + +static constexpr int max_compile_time_stream_priorities = 4; + +// Value object representing a CUDA stream. This is just a wrapper +// around c10::Stream, but it comes with a little extra CUDA-specific +// functionality (conversion to hipStream_t), and a guarantee that +// the wrapped c10::Stream really is a CUDA stream. +class ZoomStream { + public: + enum Unchecked { UNCHECKED }; + + /// Construct a ZoomStream from a Stream. This construction is checked, + /// and will raise an error if the Stream is not, in fact, a CUDA stream. + explicit ZoomStream(Stream stream) : stream_(stream) { + TORCH_CHECK(stream_.device_type() == DeviceType::PrivateUse1); + } + + /// Construct a ZoomStream from a Stream with no error checking. + /// This constructor uses the "named" constructor idiom, and can + /// be invoked as: ZoomStream(ZoomStream::UNCHECKED, stream) + explicit ZoomStream(Unchecked, Stream stream) : stream_(stream) {} + + bool operator==(const ZoomStream& other) const noexcept { + return unwrap() == other.unwrap(); + } + + bool operator!=(const ZoomStream& other) const noexcept { + return unwrap() != other.unwrap(); + } + + /// Implicit conversion to hipStream_t. 
+ operator hipStream_t() const { + return stream(); + } + + /// Implicit conversion to Stream (a.k.a., forget that the stream is a + /// CUDA stream). + operator Stream() const { + return unwrap(); + } + + /// Used to avoid baking in device type explicitly to Python-side API. + DeviceType device_type() const { + return DeviceType::PrivateUse1; + } + + /// Get the CUDA device index that this stream is associated with. + DeviceIndex device_index() const { + return stream_.device_index(); + } + + /// Get the full Device that this stream is associated with. The Device + /// is guaranteed to be a CUDA device. + Device device() const { + return Device(DeviceType::PrivateUse1, device_index()); + } + + /// Return the stream ID corresponding to this particular stream. + StreamId id() const { + return stream_.id(); + } + + bool query() const { + DeviceGuard guard{stream_.device()}; + hipError_t err = C10_ZOOM_ERROR_HANDLED(hipStreamQuery(stream())); + + if (err == hipSuccess) { + return true; + } else if (err != hipErrorNotReady) { + C10_ZOOM_CHECK(err); + } else { + // ignore and clear the error if not ready + (void)hipGetLastError(); + } + + return false; + } + + void synchronize() const { + DeviceGuard guard{stream_.device()}; + c10::zoom::stream_synchronize(stream()); + } + + int priority() const { + DeviceGuard guard{stream_.device()}; + int priority = 0; + C10_ZOOM_CHECK(hipStreamGetPriority(stream(), &priority)); + return priority; + } + + /// Explicit conversion to hipStream_t. + hipStream_t stream() const; + + /// Explicit conversion to Stream. + Stream unwrap() const { + return stream_; + } + + /// Reversibly pack a ZoomStream into a struct representation. + /// Previously the stream's data was packed into a single int64_t, + /// as it was assumed the fields would not require more than + /// 64 bits of storage in total. + /// See https://github.com/pytorch/pytorch/issues/75854 + /// for more information regarding newer platforms that may violate + /// this assumption. + /// + /// The ZoomStream can be unpacked using unpack(). + struct c10::StreamData3 pack3() const { + return stream_.pack3(); + } + + // Unpack a ZoomStream from the 3 fields generated by pack(). + static ZoomStream unpack3( + StreamId stream_id, + DeviceIndex device_index, + DeviceType device_type) { + return ZoomStream(Stream::unpack3(stream_id, device_index, device_type)); + } + + static std::tuple priority_range() { + // Note: this returns the range of priority **supported by PyTorch**, not + // the range of priority **supported by CUDA**. The former is a subset of + // the latter. + int least_priority = 0, greatest_priority = 0; + C10_ZOOM_CHECK( + hipDeviceGetStreamPriorityRange(&least_priority, &greatest_priority)); + + // See Note [HIP stream priorities] + TORCH_INTERNAL_ASSERT( + least_priority == 1, "Unexpected HIP stream priority range"); + least_priority = 0; + + TORCH_INTERNAL_ASSERT( + greatest_priority <= -1, "Unexpected HIP stream priority range"); + greatest_priority = std::max( + -c10::zoom::max_compile_time_stream_priorities + 1, greatest_priority); + return std::make_tuple(least_priority, greatest_priority); + } + + // Deleted for now; use CUDAEvent::block instead + // void synchronize_with(const CUDAEvent& event) const; + + private: + Stream stream_; +}; + +/** + * Get a new stream from the CUDA stream pool. You can think of this + * as "creating" a new stream, but no such creation actually happens; + * instead, streams are preallocated from the pool and returned in a + * round-robin fashion. 
+ * + * You can request a stream from the high priority pool by setting + * isHighPriority to true, or a stream for a specific device by setting device + * (defaulting to the current CUDA stream.) + */ +ZoomStream +getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1); +// no default priority to disambiguate overloads +ZoomStream +getStreamFromPool(const int priority, DeviceIndex device = -1); + +/** + * Get a ZoomStream from a externally allocated one. + * + * This is mainly for interoperability with different libraries where we + * want to operate on a non-torch allocated stream for data exchange or similar + * purposes + */ +ZoomStream +getStreamFromExternal(hipStream_t ext_stream, DeviceIndex device_index); + +/** + * Get the default CUDA stream, for the passed CUDA device, or for the + * current device if no device index is passed. The default stream is + * where most computation occurs when you aren't explicitly using + * streams. + */ +ZoomStream getDefaultZoomStream(DeviceIndex device_index = -1); + +/** + * Get the current CUDA stream, for the passed CUDA device, or for the + * current device if no device index is passed. The current CUDA stream + * will usually be the default CUDA stream for the device, but it may + * be different if someone called 'setCurrentZoomStream' or used 'StreamGuard' + * or 'ZoomStreamGuard'. + */ +ZoomStream getCurrentZoomStream(DeviceIndex device_index = -1); + +/** + * Set the current stream on the device of the passed in stream to be + * the passed in stream. Yes, you read that right: this function + * has *nothing* to do with the current device: it toggles the current + * stream of the device of the passed stream. + * + * Confused? Avoid using this function; prefer using 'ZoomStreamGuard' instead + * (which will switch both your current device and current stream in the way you + * expect, and reset it back to its original state afterwards). 
+ */ +void setCurrentZoomStream(ZoomStream stream); + +std::ostream& operator<<(std::ostream& stream, const ZoomStream& s); + +} // namespace c10::zoom + +namespace std { +template <> +struct hash { + size_t operator()(c10::zoom::ZoomStream s) const noexcept { + return std::hash{}(s.unwrap()); + } +}; +} // namespace std \ No newline at end of file diff --git a/c10/zoom/impl/ZoomGuardImpl.cpp b/c10/zoom/impl/ZoomGuardImpl.cpp new file mode 100644 index 00000000000000..0327253b26d1f0 --- /dev/null +++ b/c10/zoom/impl/ZoomGuardImpl.cpp @@ -0,0 +1,7 @@ +#include + +namespace c10::zoom::impl { + +C10_REGISTER_GUARD_IMPL(PrivateUse1, ZoomGuardImpl); + +} // namespace c10::zoom::impl \ No newline at end of file diff --git a/c10/zoom/impl/ZoomGuardImpl.h b/c10/zoom/impl/ZoomGuardImpl.h new file mode 100644 index 00000000000000..49f0813cf24884 --- /dev/null +++ b/c10/zoom/impl/ZoomGuardImpl.h @@ -0,0 +1,249 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace c10::zoom::impl { + +struct ZoomGuardImpl final : public c10::impl::DeviceGuardImplInterface { + static constexpr DeviceType static_type = DeviceType::PrivateUse1; + + ZoomGuardImpl() = default; + explicit ZoomGuardImpl(DeviceType t) { + TORCH_INTERNAL_ASSERT(t == DeviceType::PrivateUse1); + } + DeviceType type() const override { + return DeviceType::PrivateUse1; + } + Device exchangeDevice(Device d) const override { + TORCH_INTERNAL_ASSERT(d.is_privateuseone()); + auto old_device_index = c10::zoom::ExchangeDevice(d.index()); + return Device(DeviceType::PrivateUse1, old_device_index); + } + Device getDevice() const override { + DeviceIndex device = 0; + C10_ZOOM_CHECK(c10::zoom::GetDevice(&device)); + return Device(DeviceType::PrivateUse1, device); + } + std::optional uncheckedGetDevice() const noexcept { + DeviceIndex device{-1}; + const auto err = C10_ZOOM_ERROR_HANDLED(c10::zoom::GetDevice(&device)); + C10_ZOOM_CHECK_WARN(err); + if (err != hipSuccess) { + return c10::nullopt; + } + return Device(DeviceType::PrivateUse1, device); + } + void setDevice(Device d) const override { + TORCH_INTERNAL_ASSERT(d.is_privateuseone()); + C10_ZOOM_CHECK(c10::zoom::SetDevice(d.index())); + } + void uncheckedSetDevice(Device d) const noexcept override { + C10_ZOOM_CHECK_WARN(c10::zoom::MaybeSetDevice(d.index())); + } + Stream getStream(Device d) const noexcept override { + return getCurrentZoomStream(d.index()).unwrap(); + } + Stream getDefaultStream(Device d) const override { + return getDefaultZoomStream(d.index()); + } + Stream getNewStream(Device d, int priority = 0) const override { + return getStreamFromPool(priority, d.index()); + } + Stream getStreamFromGlobalPool(Device d, bool isHighPriority = false) + const override { + return getStreamFromPool(isHighPriority, d.index()); + } + // NB: These do NOT set the current device + Stream exchangeStream(Stream s) const noexcept override { + ZoomStream cs(s); + auto old_stream = getCurrentZoomStream(s.device().index()); + setCurrentZoomStream(cs); + return old_stream.unwrap(); + } + DeviceIndex deviceCount() const noexcept override { + return device_count(); + } + + // Event-related functions + void createEvent(hipEvent_t* zoom_event, const EventFlag flag) const { + // Maps PyTorch's Event::Flag to HIP flag + auto hip_flag = hipEventDefault; + switch (flag) { + case EventFlag::PYTORCH_DEFAULT: + hip_flag = hipEventDisableTiming; + break; + case EventFlag::BACKEND_DEFAULT: + 
hip_flag = hipEventDefault; + break; + default: + TORCH_CHECK(false, "HIP event received unknown flag"); + } + + C10_ZOOM_CHECK(hipEventCreateWithFlags(zoom_event, hip_flag)); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_creation( + c10::DeviceType::PrivateUse1, reinterpret_cast(zoom_event)); + } + } + + void destroyEvent(void* event, const DeviceIndex device_index) + const noexcept override { + if (!event) + return; + auto zoom_event = static_cast(event); + DeviceIndex orig_device{-1}; + C10_ZOOM_CHECK_WARN(c10::zoom::GetDevice(&orig_device)); + C10_ZOOM_CHECK_WARN(c10::zoom::SetDevice(device_index)); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_deletion( + c10::DeviceType::PrivateUse1, reinterpret_cast(zoom_event)); + } + C10_ZOOM_CHECK_WARN(hipEventDestroy(zoom_event)); + C10_ZOOM_CHECK_WARN(c10::zoom::SetDevice(orig_device)); + } + + void record( + void** event, + const Stream& stream, + const DeviceIndex device_index, + const EventFlag flag) const override { + TORCH_CHECK( + device_index == -1 || device_index == stream.device_index(), + "Event device index ", + device_index, + " does not match recording stream's device index ", + stream.device_index(), + "."); + + hipEvent_t zoom_event = static_cast(*event); + ZoomStream zoom_stream{stream}; + + // Moves to stream's device to record + const auto orig_device = getDevice(); + setDevice(stream.device()); + + // Creates the event (lazily) + if (!zoom_event) + createEvent(&zoom_event, flag); + C10_ZOOM_CHECK(hipEventRecord(zoom_event, zoom_stream)); + // Makes the void* point to the (possibly just allocated) HIP event + *event = zoom_event; + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_record( + c10::DeviceType::PrivateUse1, + reinterpret_cast(zoom_event), + reinterpret_cast(zoom_stream.stream())); + } + + // Resets device + setDevice(orig_device); + } + + void block(void* event, const Stream& stream) const override { + if (!event) + return; + hipEvent_t zoom_event = static_cast(event); + ZoomStream zoom_stream{stream}; + const auto orig_device = getDevice(); + setDevice(stream.device()); + C10_ZOOM_CHECK(hipStreamWaitEvent( + zoom_stream, + zoom_event, + /*flags (must be zero)=*/0)); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_wait( + c10::DeviceType::PrivateUse1, + reinterpret_cast(zoom_event), + reinterpret_cast(zoom_stream.stream())); + } + setDevice(orig_device); + } + + // May be called from any device + bool queryEvent(void* event) const override { + if (!event) + return true; + hipEvent_t zoom_event = static_cast(event); + // Note: hipEventQuery can be safely called from any device + const hipError_t err = C10_ZOOM_ERROR_HANDLED(hipEventQuery(zoom_event)); + if (err != hipErrorNotReady) { + C10_ZOOM_CHECK(err); + } else { + // ignore and clear the error if not ready + (void)hipGetLastError(); + } + return (err == hipSuccess); + } + + // Stream-related functions + bool queryStream(const Stream& stream) const override { + ZoomStream zoom_stream{stream}; + return zoom_stream.query(); + } + + void synchronizeStream(const Stream& stream) const override { + ZoomStream zoom_stream{stream}; + zoom_stream.synchronize(); + } + + void synchronizeEvent(void* event) const override { + if 
(!event) + return; + hipEvent_t zoom_event = static_cast(event); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_synchronization( + c10::DeviceType::PrivateUse1, reinterpret_cast(zoom_event)); + } + // Note: hipEventSynchronize can be safely called from any device + C10_ZOOM_CHECK(hipEventSynchronize(zoom_event)); + } + + void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream) + const override { + ZoomStream zoom_stream{stream}; + ZoomCachingAllocator::recordStream(data_ptr, zoom_stream); + } + + double elapsedTime(void* event1, void* event2, const DeviceIndex device_index) + const override { + TORCH_CHECK( + event1 && event2, + "Both events must be recorded before calculating elapsed time."); + // Even though zoomEventElapsedTime can be safely called from any device, if + // the current device is not initialized, it will create a new zoom context, + // which will consume a lot of memory. + DeviceIndex orig_device{-1}; + C10_ZOOM_CHECK(c10::zoom::GetDevice(&orig_device)); + C10_ZOOM_CHECK(c10::zoom::SetDevice(device_index)); + hipEvent_t zoom_event1 = static_cast(event1); + hipEvent_t zoom_event2 = static_cast(event2); + float time_ms = 0; + // raise hipErrorNotReady if either event is recorded but not yet completed + C10_ZOOM_CHECK(hipEventElapsedTime(&time_ms, zoom_event1, zoom_event2)); + C10_ZOOM_CHECK(c10::zoom::SetDevice(orig_device)); + return static_cast(time_ms); + } +}; + +} // namespace c10::zoom::impl \ No newline at end of file diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 369bb9b106a0db..1a43c7d53aa9fb 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -71,6 +71,7 @@ if(INTERN_BUILD_ATEN_OPS) list(APPEND Caffe2_GPU_CU_SRCS ${ATen_CUDA_CU_SRCS}) list(APPEND Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY}) list(APPEND Caffe2_HIP_SRCS ${ATen_HIP_SRCS}) + list(APPEND Caffe2_ZOOM_SRCS ${ATen_ZOOM_SRCS}) list(APPEND Caffe2_MPS_SRCS ${ATen_MPS_SRCS}) list(APPEND Caffe2_XPU_SRCS ${ATen_XPU_SRCS}) list(APPEND Caffe2_HIP_SRCS ${ATen_HIP_SRCS_W_SORT_BY_KEY}) @@ -84,13 +85,16 @@ if(INTERN_BUILD_ATEN_OPS) list(APPEND Caffe2_CPU_INCLUDE ${ATen_CPU_INCLUDE}) list(APPEND Caffe2_GPU_INCLUDE ${ATen_CUDA_INCLUDE}) list(APPEND Caffe2_HIP_INCLUDE ${ATen_HIP_INCLUDE}) + list(APPEND Caffe2_ZOOM_INCLUDE ${ATen_ZOOM_INCLUDE}) list(APPEND Caffe2_XPU_INCLUDE ${ATen_XPU_INCLUDE}) list(APPEND Caffe2_VULKAN_INCLUDE ${ATen_VULKAN_INCLUDE}) list(APPEND Caffe2_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS}) list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS}) list(APPEND Caffe2_HIP_DEPENDENCY_LIBS ${ATen_HIP_DEPENDENCY_LIBS}) + list(APPEND Caffe2_ZOOM_DEPENDENCY_LIBS ${ATen_ZOOM_DEPENDENCY_LIBS}) list(APPEND Caffe2_DEPENDENCY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE}) set(Caffe2_CUDA_DEPENDENCY_LIBS ${Caffe2_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) + set(Caffe2_ZOOM_DEPENDENCY_LIBS ${Caffe2_ZOOM_DEPENDENCY_LIBS} PARENT_SCOPE) endif() # ---[ Caffe2 build @@ -128,6 +132,7 @@ if(CAFFE2_ALLOWLISTED_FILES) caffe2_do_allowlist(Caffe2_GPU_CU_SRCS CAFFE2_ALLOWLISTED_FILES) caffe2_do_allowlist(Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY CAFFE2_ALLOWLISTED_FILES) caffe2_do_allowlist(Caffe2_HIP_SRCS CAFFE2_ALLOWLISTED_FILES) + caffe2_do_allowlist(Caffe2_ZOOM_SRCS CAFFE2_ALLOWLISTED_FILES) endif() if(PRINT_CMAKE_DEBUG_INFO) @@ -181,6 +186,11 @@ if(PRINT_CMAKE_DEBUG_INFO) message(STATUS " " ${tmp}) endforeach() + message(STATUS "ZOOM sources: ") + foreach(tmp 
${Caffe2_ZOOM_SRCS}) + message(STATUS " " ${tmp}) + endforeach() + message(STATUS "MPS sources: ") foreach(tmp ${Caffe2_MPS_SRCS}) message(STATUS " " ${tmp}) @@ -594,6 +604,10 @@ if(USE_CUDA OR USE_ROCM) append_filelist("libtorch_cuda_core_sources" Caffe2_GPU_HIP_JIT_FUSERS_SRCS) endif() +# if (USE_ZOOM) +# append_filelist("libtorch_zoom_core_sources" Caffe2_GPU_HIP_JIT_FUSERS_SRCS) +# endif() + if(USE_CUDA) list(APPEND Caffe2_GPU_CU_SRCS ${Caffe2_GPU_HIP_JIT_FUSERS_SRCS}) add_library(caffe2_nvrtc SHARED ${ATen_NVRTC_STUB_SRCS}) @@ -675,6 +689,26 @@ if(USE_ROCM) install(TARGETS caffe2_nvrtc DESTINATION "${TORCH_INSTALL_LIB_DIR}") endif() +# if(USE_ZOOM) +# list(APPEND Caffe2_ZOOM_SRCS ${Caffe2_GPU_HIP_JIT_FUSERS_SRCS}) +# if(USE_NCCL) +# list(APPEND Caffe2_ZOOM_SRCS +# ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp) +# endif() +# if(USE_DISTRIBUTED) +# append_filelist("libtorch_zoom_distributed_base_sources" Caffe2_ZOOM_SRCS) +# if(NOT WIN32) +# append_filelist("libtorch_zoom_distributed_extra_sources" Caffe2_ZOOM_SRCS) +# endif() +# endif() + # See NOTE [ ATen NVRTC Stub and HIP ] +# hip_add_library(caffe2_hiprtc SHARED ${ATen_HIPRTC_STUB_SRCS}) +# target_link_libraries(caffe2_hiprtc ${PYTORCH_HIP_LIBRARIES} ${ROCM_HIPRTC_LIB}) +# target_include_directories(caffe2_hiprtc PRIVATE ${CMAKE_BINARY_DIR} ${ROCM_SOURCE_DIR}/include) +# target_compile_definitions(caffe2_hiprtc PRIVATE USE_ROCM __HIP_PLATFORM_AMD__) +# install(TARGETS caffe2_hiprtc DESTINATION "${TORCH_INSTALL_LIB_DIR}") +# endif() + if(NOT NO_API AND NOT BUILD_LITE_INTERPRETER) list(APPEND TORCH_SRCS ${TORCH_SRC_DIR}/csrc/api/src/cuda.cpp @@ -920,6 +954,11 @@ if(USE_ROCM) set_source_files_properties(${__caffe2_hip_srcs_cpp} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) endif() +if(USE_ZOOM) + filter_list(__caffe2_zoom_hip_srcs_cpp Caffe2_ZOOM_SRCS "\\.(cu|hip)$") + set_source_files_properties(${_caffe2_zoom_hip_srcs_cpp} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) +endif() + # Compile exposed libraries. if(USE_ROCM) set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE) @@ -941,6 +980,15 @@ if(USE_ROCM) target_precompile_headers(torch_hip PRIVATE "$<$:ATen/core/ATen_pch.h>") endif() +elseif(USE_ZOOM) + ADD_DEFINITIONS(-DUSE_ZOOM) + set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE) + # list(APPEND Caffe2_ZOOM_SRCS ${GENERATED_CXX_TORCH_CUDA}) + + # TODO(Arham): disentangle this and build caffe2_hiprtc instead + hip_add_library(torch_zoom ${Caffe2_ZOOM_SRCS} ${ATen_HIPRTC_STUB_SRCS}) + set(CUDA_LINK_LIBRARIES_KEYWORD) + torch_compile_options(torch_zoom) # see cmake/public/utils.cmake elseif(USE_CUDA) set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE) list(APPEND Caffe2_GPU_SRCS ${GENERATED_CXX_TORCH_CUDA}) @@ -1348,6 +1396,39 @@ if(USE_ROCM) endif() endif() +if(USE_ZOOM) + target_compile_definitions(torch_zoom PRIVATE + USE_ZOOM + __HIP_PLATFORM_AMD__ + ) + # NB: Massive hack. torch/csrc/jit/codegen/fuser/codegen.cpp includes + # torch/csrc/jit/codegen/fuser/cuda/resource_strings.h which changes the + # strings depending on if you're __HIP_PLATFORM_AMD__ or not. + # But that file is in torch_cpu! So, against all odds, this macro + # has to be set on torch_cpu too. 
I also added it to torch for + # better luck + target_compile_definitions(torch_cpu PRIVATE + USE_ZOOM + __HIP_PLATFORM_AMD__ + ) + target_compile_definitions(torch PRIVATE + USE_ZOOM + __HIP_PLATFORM_AMD__ + ) + + if(NOT ROCM_SOURCE_DIR) + set(ROCM_SOURCE_DIR "$ENV{ROCM_SOURCE_DIR}") + endif() + if($ROCM_SOURCE_DIR STREQUAL "") + set(ROCM_SOURCE_DIR "/opt/rocm") + endif() + message(INFO "caffe2 ROCM_SOURCE_DIR = ${ROCM_SOURCE_DIR}") + target_include_directories(torch_zoom PRIVATE + ${ROCM_SOURCE_DIR}/include + ${ROCM_SOURCE_DIR}/hcc/include + ) +endif() + if(BUILD_LITE_INTERPRETER) target_compile_definitions(torch_cpu PRIVATE BUILD_LITE_INTERPRETER) # Enable template selective build only when SELECTED_OP_LIST is provided. @@ -1453,6 +1534,8 @@ if(USE_CUDA) target_compile_definitions(torch_cuda PRIVATE TORCH_CUDA_BUILD_MAIN_LIB) elseif(USE_ROCM) target_compile_definitions(torch_hip PRIVATE TORCH_HIP_BUILD_MAIN_LIB) +elseif(USE_ZOOM) + target_compile_definitions(torch_zoom PRIVATE TORCH_HIP_BUILD_MAIN_LIB) endif() if(USE_XPU) @@ -1546,6 +1629,8 @@ if(USE_CUDA) caffe2_interface_library(torch_cuda torch_cuda_library) elseif(USE_ROCM) caffe2_interface_library(torch_hip torch_hip_library) +elseif(USE_ZOOM) + caffe2_interface_library(torch_zoom torch_zoom_library) elseif(USE_XPU) caffe2_interface_library(torch_xpu torch_xpu_library) endif() @@ -1558,6 +1643,8 @@ if(USE_CUDA) install(TARGETS torch_cuda torch_cuda_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}") elseif(USE_ROCM) install(TARGETS torch_hip torch_hip_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}") +elseif(USE_ZOOM) + install(TARGETS torch_zoom torch_zoom_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}") elseif(USE_XPU) install(TARGETS torch_xpu torch_xpu_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}") endif() @@ -1570,6 +1657,8 @@ if(USE_CUDA) target_link_libraries(torch PUBLIC torch_cuda_library) elseif(USE_ROCM) target_link_libraries(torch PUBLIC torch_hip_library) +elseif(USE_ZOOM) + target_link_libraries(torch PUBLIC torch_zoom_library) endif() if(USE_XPU) @@ -1715,6 +1804,43 @@ if(USE_ROCM) target_include_directories(torch_hip INTERFACE $) endif() +# ---[ Caffe2 ZOOM HIP sources. +if(USE_ZOOM) + # Call again since Caffe2_ZOOM_INCLUDE is extended with ATen include dirs. + # Get Compile Definitions from the directory (FindHIP.cmake bug) + get_directory_property(MY_DEFINITIONS COMPILE_DEFINITIONS) + if(MY_DEFINITIONS) + foreach(_item ${MY_DEFINITIONS}) + list(APPEND HIP_CLANG_FLAGS "-D${_item}") + endforeach() + endif() + + # Call again since Caffe2_ZOOM_INCLUDE is extended with ATen include dirs. + hip_include_directories(${Caffe2_ZOOM_INCLUDE}) + # Since PyTorch files contain HIP headers, these flags are required for the necessary definitions to be added. + target_compile_options(torch_zoom PUBLIC ${HIP_CXX_FLAGS}) # experiment + target_link_libraries(torch_zoom PUBLIC c10_zoom) + # target_link_libraries(torch_zoom PUBLIC c10) + + # this is where lib amdhip64 is actually linked (e.g. HIP symbols) + # should be included in c10_zoom + # target_link_libraries(torch_zoom PUBLIC ${PYTORCH_HIP_LIBRARIES}) + if(NOT INTERN_BUILD_MOBILE) + # TODO: Cut this over to ATEN_HIP_FILES_GEN_LIB. At the moment, we + # only generate CUDA files + # NB: This dependency must be PRIVATE, because we don't install + # ATEN_CUDA_FILES_GEN_LIB (it's a synthetic target just to get the + # correct dependency from generated files.) 
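+    # For Zoom the generated-files link below is left commented out for now;
+    # torch_zoom already picks up the ATen sources through Caffe2_ZOOM_SRCS
+    # (see the hip_add_library(torch_zoom ...) call above).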
+ #target_link_libraries(torch_zoom PRIVATE ATEN_ZOOM_FILES_GEN_LIB) + endif() + target_link_libraries(torch_zoom PUBLIC torch_cpu_library ${Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS}) + target_link_libraries(torch_zoom PRIVATE ${Caffe2_ZOOM_DEPENDENCY_LIBS}) + + # Since PyTorch files contain HIP headers, this is also needed to capture the includes. + target_include_directories(torch_zoom PRIVATE ${Caffe2_ZOOM_INCLUDE}) + target_include_directories(torch_zoom INTERFACE $) +endif() + if(BUILD_STATIC_RUNTIME_BENCHMARK) add_subdirectory(${TORCH_ROOT}/benchmarks/static_runtime ${PROJECT_BINARY_DIR}/bin) add_executable(static_runtime_bench "${STATIC_RUNTIME_BENCHMARK_SRCS}") diff --git a/cmake/Caffe2Config.cmake.in b/cmake/Caffe2Config.cmake.in index c23b3990aff8a9..67771a17548e80 100644 --- a/cmake/Caffe2Config.cmake.in +++ b/cmake/Caffe2Config.cmake.in @@ -74,6 +74,10 @@ if (@USE_ROCM@) include("${CMAKE_CURRENT_LIST_DIR}/public/LoadHIP.cmake") endif() +if (@USE_ZOOM@) + include("${CMAKE_CURRENT_LIST_DIR}/public/LoadHIP.cmake") +endif() + if(@USE_CUDA@) # The file public/cuda.cmake exclusively uses CAFFE2_USE_*. # If Caffe2 was compiled with the libraries below, they must diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index b478f3cc2e1b08..f022db009f4673 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -239,6 +239,9 @@ if(INTERN_BUILD_ATEN_OPS) add_library(ATEN_CUDA_FILES_GEN_LIB INTERFACE) add_dependencies(ATEN_CPU_FILES_GEN_LIB ATEN_CPU_FILES_GEN_TARGET) add_dependencies(ATEN_CUDA_FILES_GEN_LIB ATEN_CUDA_FILES_GEN_TARGET) + + message(cuda_gen_headers="${cuda_generated_headers}") + message(cuda_gen_sources="${cuda_generated_sources}") if(USE_PER_OPERATOR_HEADERS) target_compile_definitions(ATEN_CPU_FILES_GEN_LIB INTERFACE AT_PER_OPERATOR_HEADERS) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index a7e38ee73bcce5..e29c89479f9dad 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1155,7 +1155,7 @@ if(USE_CUDNN) endif() # ---[ HIP -if(USE_ROCM) +if(USE_ROCM OR USE_ZOOM) # This prevents linking in the libtinfo from /opt/conda/lib which conflicts with ROCm libtinfo. # Currently only active for Ubuntu 20.04 and greater versions. 
if(UNIX AND EXISTS "/etc/os-release") @@ -1184,7 +1184,12 @@ if(USE_ROCM) include(${CMAKE_CURRENT_LIST_DIR}/public/LoadHIP.cmake) if(PYTORCH_FOUND_HIP) message(INFO "Compiling with HIP for AMD.") - caffe2_update_option(USE_ROCM ON) + if(USE_ROCM) + caffe2_update_option(USE_ROCM ON) + endif() + if(USE_ZOOM) + caffe2_update_option(USE_ZOOM ON) + endif() if(USE_NCCL AND NOT USE_SYSTEM_NCCL) message(INFO "Forcing USE_SYSTEM_NCCL to ON since it's required by using RCCL") @@ -1251,7 +1256,10 @@ if(USE_ROCM) message(STATUS "Disabling Kernel Assert for ROCm") endif() - include(${CMAKE_CURRENT_LIST_DIR}/External/aotriton.cmake) + if(USE_ROCM) + include(${CMAKE_CURRENT_LIST_DIR}/External/aotriton.cmake) + endif() + if(USE_CUDA) caffe2_update_option(USE_MEM_EFF_ATTENTION OFF) endif() diff --git a/cmake/External/aotriton.cmake b/cmake/External/aotriton.cmake index de64370b37a26f..8b67accd6254ac 100644 --- a/cmake/External/aotriton.cmake +++ b/cmake/External/aotriton.cmake @@ -6,7 +6,10 @@ if(NOT __AOTRITON_INCLUDED) set(__AOTRITON_INSTALL_DIR "${PROJECT_SOURCE_DIR}/torch") ExternalProject_Add(aotriton_external GIT_REPOSITORY https://github.com/ROCm/aotriton.git - GIT_TAG 24a3fe9cb57e5cda3c923df29743f9767194cc27 + # Note (Arham): I changed this commit because the one in nod-ai was old and had some errors, + # in upstream pytorch this commit tag is determined by some CI actions that would be useful to copy + # in order to keep this working + GIT_TAG 04b5df8c8123f90cba3ede7e971e6fbc6040d506 SOURCE_DIR ${__AOTRITON_SOURCE_DIR} BINARY_DIR ${__AOTRITON_BUILD_DIR} PREFIX ${__AOTRITON_INSTALL_DIR} diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 09af98d0bc0666..31f6cbf8838852 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -180,6 +180,8 @@ function(caffe2_print_configuration_summary) message(STATUS " Private Dependencies : ${Caffe2_DEPENDENCY_LIBS}") message(STATUS " Public CUDA Deps. : ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS}") message(STATUS " Private CUDA Deps. : ${Caffe2_CUDA_DEPENDENCY_LIBS}") + message(STATUS " Public ZOOM Deps. : ${Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS}") + message(STATUS " Private ZOOM Deps. : ${Caffe2_ZOOM_DEPENDENCY_LIBS}") # coreml message(STATUS " USE_COREML_DELEGATE : ${USE_COREML_DELEGATE}") message(STATUS " BUILD_LAZY_TS_BACKEND : ${BUILD_LAZY_TS_BACKEND}") diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake index fa39156031ff36..107a6fbc15dac5 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake @@ -1,17 +1,20 @@ set(PYTORCH_FOUND_HIP FALSE) if(NOT DEFINED ENV{ROCM_PATH}) + message (WARNING "ROCM_PATH undefined, using ROCM_PATH=/opt/rocm") set(ROCM_PATH /opt/rocm) else() set(ROCM_PATH $ENV{ROCM_PATH}) endif() if(NOT DEFINED ENV{ROCM_INCLUDE_DIRS}) + message (WARNING "ROCM_INCLUDE_DIRS undefined, using ROCM_INCLUDE_DIRS=$ROCM_PATH/include") set(ROCM_INCLUDE_DIRS ${ROCM_PATH}/include) else() set(ROCM_INCLUDE_DIRS $ENV{ROCM_INCLUDE_DIRS}) endif() if(NOT EXISTS ${ROCM_PATH}) + message(WARNING "$ROCM_PATH does not exist, failed to load HIP") return() endif() @@ -39,6 +42,7 @@ endmacro() # Find the HIP Package find_package_and_print_version(HIP 1.0) +message("HIP FOUND? 
-> " ${HIP_FOUND}) if(HIP_FOUND) set(PYTORCH_FOUND_HIP TRUE) diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 8f879a8ecc783e..60b6038f7bb9be 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -146,6 +146,17 @@ if(USE_ROCM) list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${ROCM_ROCTX_LIB}) endif() +if(USE_ZOOM) + append_filelist("libtorch_python_zoom_sources" TORCH_PYTHON_SRCS) + # list(APPEND TORCH_PYTHON_SRCS ${GENERATED_THNN_CXX_CUDA}) + + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS + USE_ZOOM + __HIP_PLATFORM_AMD__ + ) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${ROCM_ROCTX_LIB}) +endif() + if(USE_XPU) include(${TORCH_ROOT}/cmake/public/xpu.cmake) append_filelist("libtorch_python_xpu_sources" TORCH_PYTHON_SRCS) @@ -342,6 +353,11 @@ if(USE_ROCM) set_source_files_properties(${TORCH_SRC_DIR}/csrc/cuda/Module.cpp PROPERTIES COMPILE_FLAGS "-DCUDA_ARCH_FLAGS=\"${PYTORCH_ROCM_ARCH_readable}\"") endif() +if(USE_ZOOM) + string(REPLACE ";" " " PYTORCH_ROCM_ARCH_readable "${PYTORCH_ROCM_ARCH}") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/zoom/Module.cpp PROPERTIES COMPILE_FLAGS "-DROCM_ARCH_FLAGS=\"${PYTORCH_ROCM_ARCH_readable}\"") +endif() + target_compile_definitions(torch_python PRIVATE "-DTHP_BUILD_MAIN_LIB") target_link_libraries(torch_python PRIVATE torch_library ${TORCH_PYTHON_LINK_LIBRARIES}) diff --git a/torch/__init__.py b/torch/__init__.py index 1dc0e9c8287fd4..263d9a28bc7802 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -1580,6 +1580,7 @@ def _assert(condition, message): # the public API. The "regular" import lines are there solely for the runtime # side effect of adding to the imported module's members for other users. from torch import cuda as cuda +from torch import zoom as zoom from torch import cpu as cpu from torch import mps as mps from torch import xpu as xpu diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py index 040fbc825becdf..459da2348e2cac 100644 --- a/torch/_decomp/decompositions.py +++ b/torch/_decomp/decompositions.py @@ -1459,7 +1459,7 @@ def _addmm_activation( ): out = addmm(self, mat1, mat2, beta, alpha) if use_gelu: - if self.is_cuda: + if self.is_cuda or self.is_zoom: return aten.gelu(out, approximate="tanh") else: return aten.gelu(out) @@ -2608,7 +2608,7 @@ def _index_copy( def log_sigmoid_forward(self: Tensor) -> Tuple[Tensor, Tensor]: min = torch.minimum(self.new_zeros(()), self) z = torch.exp(-torch.abs(self)) - if self.is_cuda: + if (self.is_cuda or self.is_zoom): buffer = self.new_zeros((0,)) else: buffer = z @@ -2853,7 +2853,7 @@ def _upsample_nearest( # following "heuristic: only use channels_last path when it's faster than the contiguous path" n_channels = input.shape[1] - if input.device.type == "cuda" and n_channels < 4: + if (input.device.type == "cuda" or input.device.type == "zoom") and n_channels < 4: memory_format = torch.contiguous_format result = result.contiguous(memory_format=memory_format) @@ -3686,7 +3686,7 @@ def get_values(inp_size, out_size, scales, nsqueeze): memory_format = utils.suggest_memory_format(input) # following "heuristic: only use channels_last path when it's faster than the contiguous path" - if input.device.type == "cuda" and n_channels < 16: + if (input.device.type == "cuda" or input.device.type == "zoom") and n_channels < 16: memory_format = torch.contiguous_format assert isinstance(result, torch.Tensor) diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index 9ff9131435f4cf..308fc9abe994e1 100644 --- a/torch/csrc/Module.cpp +++ 
b/torch/csrc/Module.cpp @@ -109,6 +109,10 @@ #endif #endif +#ifdef USE_ZOOM +#include +#endif + #ifdef USE_DISTRIBUTED #ifdef USE_C10D #include @@ -1528,6 +1532,13 @@ void initModule(PyObject* module); } // namespace torch::cuda #endif +#ifdef USE_ZOOM +PyMethodDef* THCPModule_methods(); +namespace torch::zoom { +void initModule(PyObject* module); +} // namespace torch::zoom +#endif + #ifdef USE_XPU PyMethodDef* THXPModule_methods(); void THXPStream_init(PyObject* module); @@ -1596,6 +1607,9 @@ PyObject* initModule() { #ifdef USE_CUDA THPUtils_addPyMethodDefs(methods, THCPModule_methods()); #endif +#ifdef USE_ZOOM + THPUtils_addPyMethodDefs(methods, THCPModule_methods()); +#endif #ifdef USE_XPU THPUtils_addPyMethodDefs(methods, THXPModule_methods()); #endif @@ -1659,6 +1673,9 @@ PyObject* initModule() { #ifdef USE_CUDA torch::cuda::initModule(module); #endif +#ifdef USE_ZOOM + torch::zoom::initModule(module); +#endif #ifdef USE_XPU torch::xpu::initModule(module); #endif @@ -1677,6 +1694,16 @@ PyObject* initModule() { THCPGraph_init(module); #endif +#ifdef USE_ZOOM + // This will only initialise base classes and attach them to library namespace + // They won't be ready for real usage until importing cuda module, that will + // complete the process (but it defines Python classes before calling back + // into C, so these lines have to execute first).. + THCPStream_init(module); + THCPEvent_init(module); + THCPGraph_init(module); +#endif + #ifdef USE_XPU THXPStream_init(module); THXPEvent_init(module); @@ -1697,7 +1724,7 @@ PyObject* initModule() { return ret == 0; }; -#if defined(USE_CUDNN) || defined(USE_ROCM) +#if defined(USE_CUDNN) || (defined(USE_ROCM) && !defined(USE_ZOOM)) PyObject* has_cudnn = Py_True; #else PyObject* has_cudnn = Py_False; @@ -2067,6 +2094,12 @@ Call this whenever a new thread is created in order to propagate values from PyObject* has_cuda = Py_False; #endif +#ifdef USE_ZOOM + PyObject* has_zoom = Py_True; +#else + PyObject* has_zoom = Py_False; +#endif + #ifdef USE_MPS PyObject* has_mps = Py_True; #else @@ -2080,6 +2113,7 @@ Call this whenever a new thread is created in order to propagate values from #endif ASSERT_TRUE(set_module_attr("_has_cuda", has_cuda)); + ASSERT_TRUE(set_module_attr("_has_zoom", has_zoom)); ASSERT_TRUE( set_module_attr("_has_magma", at::hasMAGMA() ? Py_True : Py_False)); ASSERT_TRUE(set_module_attr("_has_mps", has_mps)); diff --git a/torch/csrc/autograd/python_variable_indexing.cpp b/torch/csrc/autograd/python_variable_indexing.cpp index fdcafd6cd70910..09bb02cabeaf48 100644 --- a/torch/csrc/autograd/python_variable_indexing.cpp +++ b/torch/csrc/autograd/python_variable_indexing.cpp @@ -454,10 +454,11 @@ int THPVariable_setitem(PyObject* self, PyObject* index, PyObject* py_value) { at::Device self_device = self_.device(); Variable value; // TODO: This qint special case looks very suspicious... 
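+  // Zoom (PrivateUse1) reuses the CUDA branch below: the Python value is
+  // turned into a CPU tensor first (note the at::Device(kCPU) argument) and
+  // the copy onto the device happens later in the indexing path, instead of
+  // constructing the value tensor directly on the accelerator.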
+ // TODO(Arham): exchange keys if (isQIntType(self_.scalar_type())) { value = valueToTensor(device(kCPU).dtype(kFloat), py_value, at::Device(kCPU)); - } else if (self_device.is_cuda()) { + } else if (self_device.is_cuda() || self_device.is_privateuseone()) { value = valueToTensor(self_.options(), py_value, at::Device(kCPU)); } else { value = valueToTensor(self_.options(), py_value, self_device); diff --git a/torch/csrc/tensor/python_tensor.cpp b/torch/csrc/tensor/python_tensor.cpp index 8d18180ed91955..b1fb4f80ae59c6 100644 --- a/torch/csrc/tensor/python_tensor.cpp +++ b/torch/csrc/tensor/python_tensor.cpp @@ -35,6 +35,7 @@ struct PyTensorType { THPDtype* dtype; THPLayout* layout; bool is_cuda; + bool is_zoom; bool is_xpu; // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) char name[64]; @@ -130,6 +131,15 @@ static PyObject* Tensor_is_cuda(PyTensorType* self, void* unused) { } } +static PyObject* Tensor_is_zoom(PyTensorType* self, void* unused) { + if (self->is_zoom) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } +} + + static PyObject* Tensor_is_xpu(PyTensorType* self, void* unused) { if (self->is_xpu) { Py_RETURN_TRUE; @@ -166,6 +176,7 @@ static struct PyGetSetDef metaclass_properties[] = { {"dtype", (getter)Tensor_dtype, nullptr, nullptr, nullptr}, {"layout", (getter)Tensor_layout, nullptr, nullptr, nullptr}, {"is_cuda", (getter)Tensor_is_cuda, nullptr, nullptr, nullptr}, + {"is_zoom", (getter)Tensor_is_zoom, nullptr, nullptr, nullptr}, {"is_xpu", (getter)Tensor_is_xpu, nullptr, nullptr, nullptr}, {"is_sparse", (getter)Tensor_is_sparse, nullptr, nullptr, nullptr}, {"is_sparse_csr", (getter)Tensor_is_sparse_csr, nullptr, nullptr, nullptr}, @@ -247,6 +258,9 @@ static void set_type( type_obj.dtype = (THPDtype*)Py_NewRef(torch::getTHPDtype(scalarType)); type_obj.is_cuda = (backend == at::Backend::CUDA || backend == at::Backend::SparseCUDA); + // TODO(Arham): exchange keys + type_obj.is_zoom = + (backend == at::Backend::PrivateUse1 || backend == at::Backend::SparsePrivateUse1); type_obj.is_xpu = (backend == at::Backend::XPU || backend == at::Backend::SparseXPU); } diff --git a/torch/csrc/zoom/Event.cpp b/torch/csrc/zoom/Event.cpp new file mode 100644 index 00000000000000..f07f6e2954c0e3 --- /dev/null +++ b/torch/csrc/zoom/Event.cpp @@ -0,0 +1,250 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +PyObject* THCPEventClass = nullptr; + +static PyObject* THCPEvent_pynew( + PyTypeObject* type, + PyObject* args, + PyObject* kwargs) { + HANDLE_TH_ERRORS + unsigned char enable_timing = 0; + unsigned char blocking = 0; + unsigned char interprocess = 0; + + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) + constexpr const char* kwlist[] = { + "enable_timing", "blocking", "interprocess", nullptr}; + if (!PyArg_ParseTupleAndKeywords( + args, + kwargs, + "|bbb", + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + const_cast(kwlist), + &enable_timing, + &blocking, + &interprocess)) { + return nullptr; + } + + THPObjectPtr ptr(type->tp_alloc(type, 0)); + if (!ptr) { + return nullptr; + } + + THCPEvent* self = (THCPEvent*)ptr.get(); + unsigned int flags = (blocking ? hipEventBlockingSync : hipEventDefault) | + (enable_timing ? hipEventDefault : hipEventDisableTiming) | + (interprocess ? 
hipEventInterprocess : hipEventDefault); + + new (&self->zoom_event) at::zoom::ZoomEvent(flags); + + return (PyObject*)ptr.release(); + END_HANDLE_TH_ERRORS +} + +static PyObject* THCPEvent_from_ipc_handle( + PyObject* _type, + PyObject* args, + PyObject* kwargs) { + HANDLE_TH_ERRORS + auto type = (PyTypeObject*)_type; + + static torch::PythonArgParser parser({ + "from_ipc_handle(Device device, std::string ipc_handle)", + }); + torch::ParsedArgs<2> parsed_args; + auto r = parser.parse(args, kwargs, parsed_args); + + at::Device device = r.device(0); + std::string handle_string = r.string(1); + + TORCH_CHECK( + handle_string.size() == sizeof(hipIpcEventHandle_t), + "hipIpcEventHandle_t expects byte-like object of size ", + sizeof(hipIpcEventHandle_t), + ", but got ", + handle_string.size()); + TORCH_CHECK( + device.type() == at::kPrivateUse1, + "Event can only be created on " + "Zoom devices, but got device type ", + device.type()) + + THPObjectPtr ptr(type->tp_alloc(type, 0)); + if (!ptr) { + return nullptr; + } + THCPEvent* self = (THCPEvent*)ptr.get(); + + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + hipIpcEventHandle_t handle; + std::memcpy(&handle, handle_string.c_str(), handle_string.size()); + new (&self->zoom_event) at::zoom::ZoomEvent(device.index(), &handle); + + return (PyObject*)ptr.release(); + END_HANDLE_TH_ERRORS +} + +static void THCPEvent_dealloc(THCPEvent* self) { + { + pybind11::gil_scoped_release no_gil{}; + self->zoom_event.~ZoomEvent(); + } + Py_TYPE(self)->tp_free((PyObject*)self); +} + +static PyObject* THCPEvent_get_zoom_event(THCPEvent* self, void* unused) { + HANDLE_TH_ERRORS + return PyLong_FromVoidPtr(self->zoom_event.event()); + END_HANDLE_TH_ERRORS +} + +static PyObject* THCPEvent_get_device(THCPEvent* self, void* unused) { + HANDLE_TH_ERRORS + at::optional device = self->zoom_event.device(); + if (!device) { + Py_RETURN_NONE; + } + return THPDevice_New(device.value()); + END_HANDLE_TH_ERRORS +} + +static PyObject* THCPEvent_record(PyObject* _self, PyObject* _stream) { + HANDLE_TH_ERRORS + auto self = (THCPEvent*)_self; + auto stream = (THCPStream*)_stream; + self->zoom_event.record(stream->zoom_stream); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +static PyObject* THCPEvent_wait(PyObject* _self, PyObject* _stream) { + HANDLE_TH_ERRORS { + auto self = (THCPEvent*)_self; + auto stream = (THCPStream*)_stream; + pybind11::gil_scoped_release no_gil{}; + self->zoom_event.block(stream->zoom_stream); + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +static PyObject* THCPEvent_query(PyObject* _self, PyObject* noargs) { + HANDLE_TH_ERRORS + auto self = (THCPEvent*)_self; + return PyBool_FromLong(self->zoom_event.query()); + END_HANDLE_TH_ERRORS +} + +static PyObject* THCPEvent_elapsed_time(PyObject* _self, PyObject* _other) { + HANDLE_TH_ERRORS + auto self = (THCPEvent*)_self; + auto other = (THCPEvent*)_other; + return PyFloat_FromDouble(self->zoom_event.elapsed_time(other->zoom_event)); + END_HANDLE_TH_ERRORS +} + +static PyObject* THCPEvent_synchronize(PyObject* _self, PyObject* noargs) { + HANDLE_TH_ERRORS { + auto self = (THCPEvent*)_self; + pybind11::gil_scoped_release no_gil{}; + self->zoom_event.synchronize(); + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +static PyObject* THCPEvent_ipc_handle(PyObject* _self, PyObject* noargs) { + HANDLE_TH_ERRORS + auto self = (THCPEvent*)_self; + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + hipIpcEventHandle_t handle; + self->zoom_event.ipc_handle(&handle); + return PyBytes_FromStringAndSize((const 
char*)&handle, sizeof(handle)); + END_HANDLE_TH_ERRORS +} + +// NOLINTNEXTLINE(*c-arrays*, *global-variables) +static struct PyGetSetDef THCPEvent_properties[] = { + {"device", (getter)THCPEvent_get_device, nullptr, nullptr, nullptr}, + {"zoom_event", (getter)THCPEvent_get_zoom_event, nullptr, nullptr, nullptr}, + {nullptr}}; + +// NOLINTNEXTLINE(*c-arrays*, *global-variables) +static PyMethodDef THCPEvent_methods[] = { + {(char*)"from_ipc_handle", + castPyCFunctionWithKeywords(THCPEvent_from_ipc_handle), + METH_CLASS | METH_VARARGS | METH_KEYWORDS, + nullptr}, + {(char*)"record", THCPEvent_record, METH_O, nullptr}, + {(char*)"wait", THCPEvent_wait, METH_O, nullptr}, + {(char*)"query", THCPEvent_query, METH_NOARGS, nullptr}, + {(char*)"elapsed_time", THCPEvent_elapsed_time, METH_O, nullptr}, + {(char*)"synchronize", THCPEvent_synchronize, METH_NOARGS, nullptr}, + {(char*)"ipc_handle", THCPEvent_ipc_handle, METH_NOARGS, nullptr}, + {nullptr}}; + +PyTypeObject THCPEventType = { + PyVarObject_HEAD_INIT(nullptr, 0) "torch._C._ZoomEventBase", /* tp_name */ + sizeof(THCPEvent), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)THCPEvent_dealloc, /* tp_dealloc */ + 0, /* tp_vectorcall_offset */ + nullptr, /* tp_getattr */ + nullptr, /* tp_setattr */ + nullptr, /* tp_reserved */ + nullptr, /* tp_repr */ + nullptr, /* tp_as_number */ + nullptr, /* tp_as_sequence */ + nullptr, /* tp_as_mapping */ + nullptr, /* tp_hash */ + nullptr, /* tp_call */ + nullptr, /* tp_str */ + nullptr, /* tp_getattro */ + nullptr, /* tp_setattro */ + nullptr, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + nullptr, /* tp_doc */ + nullptr, /* tp_traverse */ + nullptr, /* tp_clear */ + nullptr, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + nullptr, /* tp_iter */ + nullptr, /* tp_iternext */ + THCPEvent_methods, /* tp_methods */ + nullptr, /* tp_members */ + THCPEvent_properties, /* tp_getset */ + nullptr, /* tp_base */ + nullptr, /* tp_dict */ + nullptr, /* tp_descr_get */ + nullptr, /* tp_descr_set */ + 0, /* tp_dictoffset */ + nullptr, /* tp_init */ + nullptr, /* tp_alloc */ + THCPEvent_pynew, /* tp_new */ +}; + +void THCPEvent_init(PyObject* module) { + THCPEventClass = (PyObject*)&THCPEventType; + if (PyType_Ready(&THCPEventType) < 0) { + throw python_error(); + } + Py_INCREF(&THCPEventType); + if (PyModule_AddObject(module, "_ZoomEventBase", (PyObject*)&THCPEventType) < + 0) { + throw python_error(); + } +} diff --git a/torch/csrc/zoom/Event.h b/torch/csrc/zoom/Event.h new file mode 100644 index 00000000000000..6f10c28f86f84d --- /dev/null +++ b/torch/csrc/zoom/Event.h @@ -0,0 +1,18 @@ +#ifndef THCP_EVENT_INC +#define THCP_EVENT_INC + +#include +#include + +struct THCPEvent { + PyObject_HEAD at::zoom::ZoomEvent zoom_event; +}; +extern PyObject* THCPEventClass; + +void THCPEvent_init(PyObject* module); + +inline bool THCPEvent_Check(PyObject* obj) { + return THCPEventClass && PyObject_IsInstance(obj, THCPEventClass); +} + +#endif // THCP_EVENT_INC diff --git a/torch/csrc/zoom/Graph.cpp b/torch/csrc/zoom/Graph.cpp new file mode 100644 index 00000000000000..4d95f871f5af11 --- /dev/null +++ b/torch/csrc/zoom/Graph.cpp @@ -0,0 +1,91 @@ +#include + +#include + +#include +#include + +#include +#include + +// Cargo culted partially from csrc/distributed/c10d/init.cpp +// and partially from csrc/zoom/Stream.cpp. +// THCPStream_init is also declared at global scope. 
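+//
+// Rough shape of the capture/replay flow these bindings expose, on the C++
+// side (a sketch: only HIPGraph, capture_begin, capture_end, and replay are
+// taken from this file; the stream setup around it is assumed):
+//
+//   at::zoom::HIPGraph graph;
+//   graph.capture_begin({0, 0}, hipStreamCaptureModeGlobal);
+//   // ... enqueue work on the capturing stream ...
+//   graph.capture_end();
+//   graph.replay();   // re-launches the captured work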
+ +// Because THCPGraph_init is forward declared in the only consumer +// (csrc/Module.cpp) I don't think we need a Graph.h. + +template +using shared_ptr_class_ = py::class_>; + +void THCPGraph_init(PyObject* module) { + // Pybind11 patch notes say "py::module_" is more up-to-date syntax, + // but CI linter and some builds prefer "module". + auto torch_C_m = py::handle(module).cast(); + + torch_C_m.def("_graph_pool_handle", &::at::zoom::graph_pool_handle); + + shared_ptr_class_<::at::zoom::HIPGraph>(torch_C_m, "_HIPGraph") + .def(py::init<>()) + .def( + "capture_begin", + [](::at::zoom::HIPGraph& self, + std::optional pool_opt, + std::string capture_error_mode) { + hipStreamCaptureMode capture_mode; + c10::zoom::MempoolId_t pool = pool_opt.has_value() + ? pool_opt.value() + : c10::zoom::MempoolId_t{0, 0}; + if (capture_error_mode == "global") { + capture_mode = hipStreamCaptureModeGlobal; + } else if (capture_error_mode == "thread_local") { + capture_mode = hipStreamCaptureModeThreadLocal; + } else if (capture_error_mode == "relaxed") { + capture_mode = hipStreamCaptureModeRelaxed; + } else { + TORCH_CHECK( + false, + "Unknown capture error mode. Expected `global`, `thread_local`, or `relaxed`, got ", + capture_error_mode); + } + return self.capture_begin(pool, capture_mode); + }, + py::arg("pool"), + py::arg("capture_error_mode"), + py::call_guard()) + .def( + "capture_end", + torch::wrap_pybind_function_no_gil(&at::zoom::HIPGraph::capture_end)) + .def( + "register_generator_state", + [](::at::zoom::HIPGraph& self, py::handle raw_generator) { + auto generator = THPGenerator_Unwrap(raw_generator.ptr()); + // We've unwrapped Python object to C++ object, + // so we could release GIL before calling into C++ + py::gil_scoped_release release; + return self.register_generator_state(generator); + }, + py::arg("generator")) + .def( + "replay", + torch::wrap_pybind_function_no_gil(&at::zoom::HIPGraph::replay)) + .def( + "reset", + torch::wrap_pybind_function_no_gil(&at::zoom::HIPGraph::reset)) + .def( + "pool", + torch::wrap_pybind_function_no_gil(&at::zoom::HIPGraph::pool)) + .def( + "debug_dump", + torch::wrap_pybind_function_no_gil( + &::at::zoom::HIPGraph::debug_dump)) + .def( + "enable_debug_mode", + torch::wrap_pybind_function_no_gil( + &::at::zoom::HIPGraph::enable_debug_mode)) + .def( + "debug_dump", + torch::wrap_pybind_function_no_gil( + &::at::zoom::HIPGraph::debug_dump), + py::arg("debug_path")); +} diff --git a/torch/csrc/zoom/Module.cpp b/torch/csrc/zoom/Module.cpp new file mode 100644 index 00000000000000..7a0470fad0613e --- /dev/null +++ b/torch/csrc/zoom/Module.cpp @@ -0,0 +1,1533 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +// #include +#include +// #include +#include +#include +#include +#include +#include +#include + +#include +#include + +// #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef WIN32 +#include +#endif + +using namespace torch; + +static bool in_bad_fork = false; // True for children forked after zoom init + +#ifndef WIN32 +// Called in the forked child if zoom has already been initialized +static void forked_child() { + in_bad_fork = true; + torch::utils::set_requires_device_init(at::kPrivateUse1, true); +} +#endif + +// Should be called before the first zoom call. 
+// Note: This is distinct from initExtension because a stub zoom implementation +// has some working functions (e.g. device_count) but cannot fully initialize. +static void poison_fork() { +#ifndef WIN32 + static c10::once_flag flag; + c10::call_once(flag, [] { pthread_atfork(nullptr, nullptr, forked_child); }); +#endif +} + +//////////////////////////////////////////////////////////////////////////////// +// Zoom management methods +//////////////////////////////////////////////////////////////////////////////// + +PyObject* THCPModule_setDevice_wrap(PyObject* self, PyObject* arg) { + HANDLE_TH_ERRORS + TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to setDevice"); + auto device = THPUtils_unpackLong(arg); + + torch::utils::device_lazy_init(at::kPrivateUse1); + c10::zoom::set_device(static_cast(device)); + + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_exchangeDevice(PyObject* self, PyObject* arg) { + HANDLE_TH_ERRORS + TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to exchangeDevice"); + auto device_index = THPUtils_unpackDeviceIndex(arg); + if (device_index < 0) { + return THPUtils_packInt32(-1); + } + + torch::utils::device_lazy_init(at::kPrivateUse1); + auto current_device = c10::zoom::ExchangeDevice(device_index); + + return THPUtils_packDeviceIndex(current_device); + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_maybeExchangeDevice(PyObject* self, PyObject* arg) { + HANDLE_TH_ERRORS + TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to exchangeDevice"); + auto device_index = THPUtils_unpackDeviceIndex(arg); + if (device_index < 0) { + return THPUtils_packInt32(-1); + } + + torch::utils::device_lazy_init(at::kPrivateUse1); + auto current_device = c10::zoom::MaybeExchangeDevice(device_index); + + return THPUtils_packDeviceIndex(current_device); + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_getDevice_wrap(PyObject* self, PyObject* noargs) { + HANDLE_TH_ERRORS + torch::utils::device_lazy_init(at::kPrivateUse1); + // NOLINTNEXTLINE(bugprone-signed-char-misuse) + auto device = static_cast(c10::zoom::current_device()); + return THPUtils_packInt32(device); + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_canDeviceAccessPeer_wrap(PyObject* self, PyObject* args) { + HANDLE_TH_ERRORS + PyObject* arg1 = nullptr; + PyObject* arg2 = nullptr; + if (!PyArg_ParseTuple(args, "OO", &arg1, &arg2)) { + THPUtils_invalidArguments( + args, + nullptr, + "can_device_peer_access", + 1, + "(int device, int peer_device);"); + return nullptr; + } + TORCH_CHECK( + THPUtils_checkLong(arg1), "invalid argument to canDeviceAccessPeer"); + TORCH_CHECK( + THPUtils_checkLong(arg2), "invalid argument to canDeviceAccessPeer"); + int64_t device = THPUtils_unpackLong(arg1); + int64_t peer_device = THPUtils_unpackLong(arg2); + + torch::utils::device_lazy_init(at::kPrivateUse1); + auto can_access = at::zoom::canDeviceAccessPeer(device, peer_device); + return PyBool_FromLong(can_access); + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_getDeviceCount_wrap(PyObject* self, PyObject* noargs) { + HANDLE_TH_ERRORS + poison_fork(); + return THPUtils_packUInt64(c10::zoom::device_count()); + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_getArchFlags(PyObject* self, PyObject* noargs) { + HANDLE_TH_ERRORS + poison_fork(); +#ifdef ROCM_ARCH_FLAGS + static const char* flags = C10_STRINGIZE(ROCM_ARCH_FLAGS); + return THPUtils_packString(flags); +#else + Py_RETURN_NONE; +#endif + END_HANDLE_TH_ERRORS +} + +static PyObject* THCPModule_isInBadFork(PyObject* self, PyObject* noargs) { + 
HANDLE_TH_ERRORS + return PyBool_FromLong(in_bad_fork); + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_getCurrentStream_wrap( + PyObject* /* unused */, + PyObject* device_index) { + HANDLE_TH_ERRORS + TORCH_CHECK( + THPUtils_checkLong(device_index), "invalid argument to getCurrentStream"); + auto c10_device_index = THPUtils_unpackDeviceIndex(device_index); + auto stream = c10::zoom::getCurrentZoomStream(c10_device_index); + PyObject* output_tuple = PyTuple_New(3); + PyTuple_SetItem( + output_tuple, 0, THPUtils_packInt64(static_cast(stream.id()))); + PyTuple_SetItem( + output_tuple, 1, THPUtils_packDeviceIndex(stream.device_index())); + PyTuple_SetItem( + output_tuple, + 2, + THPUtils_packInt64(static_cast(stream.device_type()))); + return output_tuple; + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_getCurrentStream_raw( + PyObject* /* unused */, + PyObject* device_index) { + HANDLE_TH_ERRORS + TORCH_CHECK( + THPUtils_checkLong(device_index), "invalid argument to getCurrentStream"); + auto c10_device_index = THPUtils_unpackDeviceIndex(device_index); + return PyLong_FromVoidPtr( + c10::zoom::getCurrentZoomStream(c10_device_index).stream()); + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_getDefaultStream_wrap( + PyObject* /* unused */, + PyObject* device_index) { + HANDLE_TH_ERRORS + TORCH_CHECK( + THPUtils_checkLong(device_index), "invalid argument to getDefaultStream"); + auto c10_device_index = THPUtils_unpackDeviceIndex(device_index); + auto stream = c10::zoom::getDefaultZoomStream(c10_device_index); + PyObject* output_tuple = PyTuple_New(3); + PyTuple_SetItem( + output_tuple, 0, THPUtils_packInt64(static_cast(stream.id()))); + PyTuple_SetItem( + output_tuple, 1, THPUtils_packDeviceIndex(stream.device_index())); + PyTuple_SetItem( + output_tuple, + 2, + THPUtils_packInt64(static_cast(stream.device_type()))); + return output_tuple; + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_setStream_wrap( + PyObject* self, + PyObject* args, + PyObject* kwargs) { + HANDLE_TH_ERRORS + int64_t stream_id = 0; + int64_t device_index = 0; + int64_t device_type = 0; + + // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) + constexpr const char* kwlist[] = { + "stream_id", "device_index", "device_type", nullptr}; + if (!PyArg_ParseTupleAndKeywords( + args, + kwargs, + "|LLL", + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + const_cast(kwlist), + &stream_id, + &device_index, + &device_type)) { + } + + auto stream = c10::zoom::ZoomStream::unpack3( + stream_id, + static_cast(device_index), + static_cast(device_type)); + + auto device = c10::zoom::current_device(); + if (device != stream.device_index()) { + c10::zoom::set_device(stream.device_index()); + } + c10::zoom::setCurrentZoomStream(stream); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_getCompiledVersion(PyObject* self, PyObject* noargs) { + return THPUtils_packInt64((int64_t)ROCM_VERSION); +} + +PyObject* THCPModule_zoomHostAllocator(PyObject* _unused, PyObject* noargs) { + HANDLE_TH_ERRORS + c10::Allocator* allocator = at::zoom::getCachingHostAllocator(); + return PyLong_FromVoidPtr(allocator); + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_zoomCachingAllocator_raw_alloc( + PyObject* _unused, + PyObject* args) { + HANDLE_TH_ERRORS + PyObject* size_o = nullptr; + PyObject* stream_o = nullptr; + if (!PyArg_ParseTuple(args, "OO", &size_o, &stream_o)) { + THPUtils_invalidArguments( + args, + nullptr, + "caching_allocator_alloc", + 1, + "(ssize_t size, intptr_t stream);"); + return 
nullptr; + } + auto size = PyLong_AsSsize_t(size_o); + hipStream_t stream = static_cast(PyLong_AsVoidPtr(stream_o)); + void* mem = nullptr; + { + pybind11::gil_scoped_release no_gil; + mem = c10::zoom::ZoomCachingAllocator::raw_alloc_with_stream(size, stream); + } + return PyLong_FromVoidPtr(mem); + END_HANDLE_TH_ERRORS +} + +// Unpack a PyObject to at::Scalar, throw an exception if it fails +at::Scalar as_scalar(PyObject* arg) { + // Zero-dim tensors are converted to Scalars as-is. Note this doesn't + // currently handle most NumPy scalar types except np.float64. + if (THPVariable_Check(arg)) { + return THPVariable_Unpack(arg).item(); + } + + if (THPUtils_checkLong(arg)) { + return at::Scalar(static_cast(THPUtils_unpackLong(arg))); + } + + if (PyBool_Check(arg)) { + return at::Scalar(THPUtils_unpackBool(arg)); + } + + if (PyComplex_Check(arg)) { + return at::Scalar(THPUtils_unpackComplexDouble(arg)); + } + return at::Scalar(THPUtils_unpackDouble(arg)); +} + +// Entrypoint for the callable created by torch.zoom.jiterator +// See jiterator.py for more details +// PyObject* THCPModule_zoomJiteratorCompileAndLaunchKernel( +// PyObject* _unused, +// PyObject* args) { +// HANDLE_TH_ERRORS + +// PyObject* code_string_o = nullptr; +// PyObject* kernel_name_o = nullptr; +// PyObject* return_by_ref_o = nullptr; +// PyObject* num_outputs_o = nullptr; +// PyObject* tensors_o = nullptr; +// PyObject* kwargs_o = nullptr; +// if (!PyArg_ParseTuple( +// args, +// "OOOOO|O", +// &code_string_o, +// &kernel_name_o, +// &return_by_ref_o, +// &num_outputs_o, +// &tensors_o, +// &kwargs_o)) { +// return nullptr; +// } + +// const std::string code_string = THPUtils_unpackString(code_string_o); +// const std::string kernel_name = THPUtils_unpackString(kernel_name_o); +// const bool return_by_ref = THPUtils_unpackBool(return_by_ref_o); +// const int num_outputs = static_cast(THPUtils_unpackLong(num_outputs_o)); + +// TORCH_CHECK( +// PyTuple_Check(tensors_o), +// "tensors argument is expected to " +// "be a tuple, but got ", +// THPUtils_typename(tensors_o)); +// Py_ssize_t num_tensors = PyTuple_GET_SIZE(tensors_o); + +// c10::SmallVector tensors; +// for (const auto i : c10::irange(num_tensors)) { +// PyObject* _tensor = PyTuple_GET_ITEM(tensors_o, i); +// TORCH_CHECK( +// THPVariable_Check(_tensor), +// i, +// " of input tensors tuple is not a Tensor"); + +// tensors.emplace_back(THPVariable_Unpack(_tensor)); +// } + +// c10::SmallVector extra_args; +// PyObject* key = nullptr; +// PyObject* value = nullptr; +// Py_ssize_t pos = 0; +// while (PyDict_Next(kwargs_o, &pos, &key, &value)) { +// extra_args.emplace_back(as_scalar(value)); +// } + +// c10::SmallVector outputs = at::zoom::CompileAndLaunchKernel( +// code_string, +// kernel_name, +// num_outputs, +// tensors, +// extra_args, +// return_by_ref); + +// if (num_outputs == 1) { +// return THPVariable_Wrap(outputs[0]); +// } else { +// PyObject* output_tuple = PyTuple_New(num_outputs); +// for (int i = 0; i < num_outputs; ++i) { +// PyTuple_SetItem(output_tuple, i, THPVariable_Wrap(outputs[i])); +// } +// return output_tuple; +// } + +// END_HANDLE_TH_ERRORS +// } + +PyObject* THCPModule_zoomCachingAllocator_raw_delete( + PyObject* _unused, + PyObject* obj) { + HANDLE_TH_ERRORS + void* mem_ptr = PyLong_AsVoidPtr(obj); + { + pybind11::gil_scoped_release no_gil; + c10::zoom::ZoomCachingAllocator::raw_delete(mem_ptr); + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_zoomCachingAllocator_set_allocator_settings( + PyObject* _unused, + 
PyObject* env) {
+  HANDLE_TH_ERRORS
+  c10::zoom::ZoomCachingAllocator::setAllocatorSettings(
+      THPUtils_unpackString(env));
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* THCPModule_getAllocatorBackend(PyObject* _unused, PyObject* noargs) {
+  HANDLE_TH_ERRORS
+  return THPUtils_packString(c10::zoom::ZoomCachingAllocator::name());
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* THCPModule_zoomSynchronize(PyObject* _unused, PyObject* noargs) {
+  HANDLE_TH_ERRORS {
+    pybind11::gil_scoped_release no_gil;
+    c10::zoom::device_synchronize();
+  }
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+// PyObject* THCPModule_zoomIPCCollect(PyObject* _unused, PyObject* noargs) {
+//   HANDLE_TH_ERRORS
+//   torch::zoomIPCCollect();
+//   Py_RETURN_NONE;
+//   END_HANDLE_TH_ERRORS
+// }
+
+// PyObject* THCPModule_zoomSleep(PyObject* _unused, PyObject* cycles) {
+//   HANDLE_TH_ERRORS
+//   TORCH_CHECK(
+//       THPUtils_checkLong(cycles), "torch.zoom._sleep(): expected 'int'");
+//   int64_t unpacked_cycles = THPUtils_unpackLong(cycles);
+//   {
+//     pybind11::gil_scoped_release no_gil;
+//     at::zoom::sleep(unpacked_cycles);
+//   }
+//   Py_RETURN_NONE;
+//   END_HANDLE_TH_ERRORS
+// }
+
+// We need to ensure that a thread will NEVER lose the GIL as long as it holds
+// the zoom mutex. Otherwise another thread might be scheduled and try to e.g.
+// allocate a new tensor, which will cause a deadlock. It's enough to have a
+// single global, because it can only be set once (zoomMutex is not recursive)
+// by the thread that owns the mutex (obviously there can be only one such
+// thread).
+static PyGILState_STATE zoomMutexGILState;
+
+PyObject* THCPModule_zoomLockMutex(PyObject* module, PyObject* noargs) {
+  auto mutex = c10::zoom::getFreeMutex();
+  // This has to be a busy loop because we **absolutely need to** hold the GIL,
+  // otherwise it's a recipe for a deadlock (if we let other Python threads
+  // run while we have the zoomMutex, but not the GIL, they might try to e.g.
+  // free a Zoom tensor and acquire the zoomMutex without giving up the GIL,
+  // because it happens deep within THC).
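+  // The loop below polls the mutex: each failed try_lock drops the GIL for a
+  // short sleep so other Python threads can make progress, then retries with
+  // the GIL re-acquired. Once the mutex is held, PyGILState_Ensure records the
+  // GIL state that THCPModule_zoomUnlockMutex later releases.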
+ while (true) { + if (mutex->try_lock()) + break; + { + pybind11::gil_scoped_release no_gil; + std::this_thread::sleep_for(std::chrono::microseconds(10)); + } + } + + zoomMutexGILState = PyGILState_Ensure(); + Py_RETURN_NONE; +} + +PyObject* THCPModule_zoomUnlockMutex(PyObject* module, PyObject* noargs) { + auto mutex = c10::zoom::getFreeMutex(); + PyGILState_Release(zoomMutexGILState); + mutex->unlock(); + Py_RETURN_NONE; +} + +PyObject* THCPModule_hasPrimaryContext(PyObject* _unused, PyObject* arg) { + HANDLE_TH_ERRORS + TORCH_CHECK( + THPUtils_checkLong(arg), "invalid argument to has_primary_context"); + auto device_index = THPUtils_unpackDeviceIndex(arg); + if (c10::zoom::hasPrimaryContext(device_index)) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_setMemoryFraction(PyObject* _unused, PyObject* args) { + HANDLE_TH_ERRORS + PyObject* fraction_o = nullptr; + PyObject* device_o = nullptr; + if (!PyArg_ParseTuple(args, "OO", &fraction_o, &device_o)) { + THPUtils_invalidArguments( + args, + nullptr, + "set_memory_fraction", + 1, + "(double fraction, int device);"); + return nullptr; + } + double fraction = PyFloat_AsDouble(fraction_o); + auto device_index = THPUtils_unpackDeviceIndex(device_o); + + c10::zoom::ZoomCachingAllocator::setMemoryFraction(fraction, device_index); + END_HANDLE_TH_ERRORS + Py_RETURN_NONE; +} + +PyObject* THCPModule_emptyCache(PyObject* _unused, PyObject* noargs) { + HANDLE_TH_ERRORS + c10::zoom::ZoomCachingAllocator::emptyCache(); + END_HANDLE_TH_ERRORS + Py_RETURN_NONE; +} + +PyObject* THCPModule_memoryStats(PyObject* _unused, PyObject* arg) { + HANDLE_TH_ERRORS + TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to memory_allocated"); + const auto device_index = THPUtils_unpackDeviceIndex(arg); + + using c10::zoom::ZoomCachingAllocator::DeviceStats; + using c10::zoom::ZoomCachingAllocator::Stat; + using c10::zoom::ZoomCachingAllocator::StatArray; + using c10::zoom::ZoomCachingAllocator::StatType; + + const auto statToDict = [](const Stat& stat) { + py::dict dict; + + dict["current"] = stat.current; + dict["peak"] = stat.peak; + dict["allocated"] = stat.allocated; + dict["freed"] = stat.freed; + return dict; + }; + + const auto statArrayToDict = [=](const StatArray& statArray) { + const std::array(StatType::NUM_TYPES)> + statTypeNames = {"all", "small_pool", "large_pool"}; + py::dict dict; + for (const auto i : c10::irange(statTypeNames.size())) { + dict[statTypeNames[i]] = statToDict(statArray[i]); + } + return dict; + }; + + const DeviceStats stats = + c10::zoom::ZoomCachingAllocator::getDeviceStats(device_index); + + py::dict result; + result["num_alloc_retries"] = stats.num_alloc_retries; + result["num_ooms"] = stats.num_ooms; + result["max_split_size"] = stats.max_split_size; + result["num_sync_all_streams"] = stats.num_sync_all_streams; + result["num_device_alloc"] = stats.num_device_alloc; + result["num_device_free"] = stats.num_device_free; + result["allocation"] = statArrayToDict(stats.allocation); + result["segment"] = statArrayToDict(stats.segment); + result["active"] = statArrayToDict(stats.active); + result["inactive_split"] = statArrayToDict(stats.inactive_split); + result["allocated_bytes"] = statArrayToDict(stats.allocated_bytes); + result["reserved_bytes"] = statArrayToDict(stats.reserved_bytes); + result["active_bytes"] = statArrayToDict(stats.active_bytes); + result["inactive_split_bytes"] = statArrayToDict(stats.inactive_split_bytes); + result["requested_bytes"] = 
statArrayToDict(stats.requested_bytes); + result["oversize_allocations"] = statToDict(stats.oversize_allocations); + result["oversize_segments"] = statToDict(stats.oversize_segments); + + return result.release().ptr(); + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_resetAccumulatedMemoryStats( + PyObject* _unused, + PyObject* arg) { + HANDLE_TH_ERRORS + TORCH_CHECK( + THPUtils_checkLong(arg), + "invalid argument to reset_accumulated_memory_stats"); + const auto device_index = THPUtils_unpackDeviceIndex(arg); + c10::zoom::ZoomCachingAllocator::resetAccumulatedStats(device_index); + END_HANDLE_TH_ERRORS + Py_RETURN_NONE; +} + +PyObject* THCPModule_resetPeakMemoryStats(PyObject* _unused, PyObject* arg) { + HANDLE_TH_ERRORS + TORCH_CHECK( + THPUtils_checkLong(arg), "invalid argument to reset_peak_memory_stats"); + const auto device_index = THPUtils_unpackDeviceIndex(arg); + c10::zoom::ZoomCachingAllocator::resetPeakStats(device_index); + END_HANDLE_TH_ERRORS + Py_RETURN_NONE; +} + +CapturedTraceback* getFromContext( + const std::shared_ptr& x) { + if (CapturedTraceback* sc = dynamic_cast(x.get())) { + return sc; + } + TORCH_CHECK( + false, + "attempting to gather stack context from the wrong StackContext type."); +} + +PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) { + HANDLE_TH_ERRORS + + using c10::zoom::ZoomCachingAllocator::BlockInfo; + using c10::zoom::ZoomCachingAllocator::SegmentInfo; + + py::str device_s = "device"; + py::str address_s = "address"; + py::str total_size_s = "total_size"; + py::str allocated_size_s = "allocated_size"; + py::str active_size_s = "active_size"; + py::str requested_size_s = "requested_size"; + py::str stream_s = "stream"; + py::str segment_type_s = "segment_type"; + py::str segment_pool_id = "segment_pool_id"; + py::str large_s = "large"; + py::str small_s = "small"; + py::str size_s = "size"; + py::str state_s = "state"; + py::str active_allocated_s = "active_allocated"; + py::str active_pending_free_s = "active_pending_free"; + py::str inactive_s = "inactive"; + py::str addr_s = "addr"; + py::str cpp_frames_s = "cpp_frames"; + py::str blocks_s = "blocks"; + py::str is_expandable_s = "is_expandable"; + py::str frames_s = "frames"; + py::str time_us_s = "time_us"; + + py::list empty_frames; + std::vector to_gather_frames; + std::vector to_gather_dest; + + auto add_frame_key = [&](const py::dict& d, + const std::shared_ptr& ctx) { + if (ctx) { + auto sc = getFromContext(ctx); + to_gather_frames.emplace_back(sc); + to_gather_dest.emplace_back(d); + } else { + d[frames_s] = empty_frames; + } + }; + + const auto segmentInfoToDict = [&](const SegmentInfo& segmentInfo) { + py::dict segmentDict; + segmentDict[device_s] = segmentInfo.device; + segmentDict[address_s] = segmentInfo.address; + segmentDict[total_size_s] = segmentInfo.total_size; + segmentDict[allocated_size_s] = segmentInfo.allocated_size; + segmentDict[active_size_s] = segmentInfo.active_size; + segmentDict[requested_size_s] = segmentInfo.requested_size; + // we want the python objects to pickle easily so use an int to + // represent the stream rather than a torch.zoom.stream object + segmentDict[stream_s] = int64_t(segmentInfo.stream); + segmentDict[segment_type_s] = (segmentInfo.is_large ? 
large_s : small_s); + segmentDict[segment_pool_id] = segmentInfo.owner_private_pool_id; + segmentDict[is_expandable_s] = segmentInfo.is_expandable; + add_frame_key(segmentDict, segmentInfo.context_when_allocated); + + auto address = segmentInfo.address; + py::list blocks; + for (const auto& blockInfo : segmentInfo.blocks) { + py::dict blockDict; + blockDict[address_s] = address; + blockDict[size_s] = blockInfo.size; + blockDict[requested_size_s] = blockInfo.requested_size; + blockDict[state_s] = + (blockInfo.allocated + ? active_allocated_s + : (blockInfo.active ? active_pending_free_s : inactive_s)); + add_frame_key(blockDict, blockInfo.context_when_allocated); + blocks.append(blockDict); + address += blockInfo.size; + } + segmentDict[blocks_s] = blocks; + + return segmentDict; + }; + + auto snapshot = c10::zoom::ZoomCachingAllocator::snapshot(); + + py::list segments; + + for (const auto& segmentInfo : snapshot.segments) { + segments.append(segmentInfoToDict(segmentInfo)); + } + + py::list traces; + py::str action_s = "action"; + py::str alloc_s = "alloc"; + py::str free_requested_s = "free_requested"; + py::str free_completed_s = "free_completed"; + py::str segment_alloc_s = "segment_alloc"; + py::str segment_free_s = "segment_free"; + py::str segment_map_s = "segment_map"; + py::str segment_unmap_s = "segment_unmap"; + + py::str snapshot_s = "snapshot"; + py::str oom_s = "oom"; + py::str device_free_s = "device_free"; + + using namespace c10::zoom::ZoomCachingAllocator; + + auto action_to_str = [&](TraceEntry::Action action) { + switch (action) { + case TraceEntry::ALLOC: + return alloc_s; + case TraceEntry::FREE_REQUESTED: + return free_requested_s; + case TraceEntry::FREE_COMPLETED: + return free_completed_s; + case TraceEntry::SEGMENT_ALLOC: + return segment_alloc_s; + case TraceEntry::SEGMENT_FREE: + return segment_free_s; + case TraceEntry::OOM: + return oom_s; + case TraceEntry::SNAPSHOT: + return snapshot_s; + case TraceEntry::SEGMENT_UNMAP: + return segment_unmap_s; + case TraceEntry::SEGMENT_MAP: + return segment_map_s; + } + throw std::runtime_error("unreachable"); + }; + + for (const auto& traceInfo : snapshot.device_traces) { + py::list trace; + for (const auto& te : traceInfo) { + py::dict trace_entry; + if (te.context_) { + // without further compression frames can get really large on dump + auto sc = getFromContext(te.context_); + to_gather_frames.emplace_back(sc); + to_gather_dest.emplace_back(trace_entry); + } + trace_entry[action_s] = action_to_str(te.action_); + trace_entry[TraceEntry::OOM == te.action_ ? 
device_free_s : addr_s] = + te.addr_; + trace_entry[size_s] = te.size_; + trace_entry[stream_s] = int64_t(te.stream_); + trace_entry[time_us_s] = te.time_.t_; + trace.append(trace_entry); + } + traces.append(trace); + } + + py::dict allocator_settings; + py::str last_allocator_settings_s = "PYTORCH_ZOOM_ALLOC_CONF"; + py::str max_split_size_s = "max_split_size"; + py::str garbage_collection_threshold_s = "garbage_collection_threshold"; + py::str expandable_segments_s = "expandable_segments"; + py::str pinned_num_register_threads_s = "pinned_num_register_threads"; + py::str release_lock_on_malloc_s = "release_lock_on_hipMalloc"; + py::str pinned_use_host_register_s = "pinned_use_zoom_host_register"; + py::str roundup_power2_divisions_s = "roundup_power2_divisions"; + + allocator_settings[last_allocator_settings_s] = + snapshot.config_metadata.last_allocator_settings; + allocator_settings[max_split_size_s] = + int64_t(snapshot.config_metadata.max_split_size); + allocator_settings[garbage_collection_threshold_s] = + snapshot.config_metadata.garbage_collection_threshold; + allocator_settings[expandable_segments_s] = + snapshot.config_metadata.expandable_segments; + allocator_settings[pinned_num_register_threads_s] = + int64_t(snapshot.config_metadata.pinned_num_register_threads); + allocator_settings[release_lock_on_malloc_s] = + snapshot.config_metadata.release_lock_on_malloc; + allocator_settings[pinned_use_host_register_s] = + snapshot.config_metadata.pinned_use_host_register; + unsigned int roundup_key = 1; + py::dict roundup_settings; + for (const auto& v : snapshot.config_metadata.roundup_power2_divisions) { + py::str roundup_key_s = std::to_string(roundup_key); + roundup_settings[roundup_key_s] = int64_t(v); + roundup_key *= 2; + } + allocator_settings[roundup_power2_divisions_s] = roundup_settings; + + py::dict result; + result["segments"] = segments; + result["device_traces"] = traces; + result["allocator_settings"] = allocator_settings; + + auto frames = py_symbolize(to_gather_frames); + for (auto i : c10::irange(frames.size())) { + to_gather_dest.at(i)[frames_s] = frames.at(i); + } + + return result.release().ptr(); + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_attachOutOfMemoryObserver( + PyObject* _unused, + PyObject* observer) { + HANDLE_TH_ERRORS + Py_XINCREF(observer); + auto obs = [observer]( + int64_t device, + int64_t alloc, + int64_t device_allocated, + int64_t device_free) { + py::gil_scoped_acquire g; + PyObject* result = PyObject_CallFunction( + observer, "LLLL", device, alloc, device_allocated, device_free); + if (!result) { + throw py::error_already_set(); + } + Py_XDECREF(result); + }; + at::globalContext().lazyInitPrivateUse1(); + c10::zoom::ZoomCachingAllocator::attachOutOfMemoryObserver(std::move(obs)); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_zoomSetSyncDebugMode(PyObject* _unused, PyObject* arg) { + HANDLE_TH_ERRORS + TORCH_WARN_ONCE( + "Synchronization debug mode is a prototype feature and does not yet detect all " + "synchronizing operations"); + TORCH_CHECK( + THPUtils_checkLong(arg), "invalid argument to set_sync_debug_mode"); + int64_t debug_mode = THPUtils_unpackLong(arg); + TORCH_CHECK( + debug_mode >= 0 && debug_mode <= 2, + "invalid value of debug_mode, expected one of 0,1,2"); + c10::zoom::SyncDebugMode l = c10::zoom::SyncDebugMode::L_DISABLED; + switch (debug_mode) { + case 0: + l = c10::zoom::SyncDebugMode::L_DISABLED; + break; + case 1: + l = c10::zoom::SyncDebugMode::L_WARN; + break; + case 2: + l = 
c10::zoom::SyncDebugMode::L_ERROR; + break; + default: + break; // can't happen + } + c10::zoom::warning_state().set_sync_debug_mode(l); + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_zoomGetSyncDebugMode(PyObject* self, PyObject* noargs) { + HANDLE_TH_ERRORS + auto debug_mode = c10::zoom::warning_state().get_sync_debug_mode(); + switch (debug_mode) { + case c10::zoom::SyncDebugMode::L_DISABLED: + return THPUtils_packInt32(0); + case c10::zoom::SyncDebugMode::L_WARN: + return THPUtils_packInt32(1); + case c10::zoom::SyncDebugMode::L_ERROR: + return THPUtils_packInt32(2); + default: + return THPUtils_packInt32(-1); // can't happen + } + END_HANDLE_TH_ERRORS +} + +//////////////////////////////////////////////////////////////////////////////// +// Zoom module initialization +//////////////////////////////////////////////////////////////////////////////// + +static void registerZoomDeviceProperties(PyObject* module) { + // Add the _ZoomDeviceProperties class (wrapping hipDeviceProp_t) to torch._C + auto m = py::handle(module).cast<py::module>(); + py::class_<hipDeviceProp_t>(m, "_ZoomDeviceProperties") + .def_readonly("name", &hipDeviceProp_t::name) + .def_readonly("major", &hipDeviceProp_t::major) + .def_readonly("minor", &hipDeviceProp_t::minor) + .def_readonly("is_multi_gpu_board", &hipDeviceProp_t::isMultiGpuBoard) + .def_readonly("is_integrated", &hipDeviceProp_t::integrated) + .def_readonly( + "multi_processor_count", &hipDeviceProp_t::multiProcessorCount) + .def_readonly("total_memory", &hipDeviceProp_t::totalGlobalMem) + .def_readonly( + "max_threads_per_multi_processor", + &hipDeviceProp_t::maxThreadsPerMultiProcessor) + .def_readonly( + "gcnArchName", + &hipDeviceProp_t::gcnArchName + ) + .def("__repr__", [](const hipDeviceProp_t& prop) { + std::ostringstream stream; + stream << "_ZoomDeviceProperties(name='" << prop.name + << "', major=" << prop.major << ", minor=" << prop.minor + << ", gcnArchName='" << prop.gcnArchName << "'" + << ", total_memory=" << prop.totalGlobalMem / (1024ull * 1024) + << "MB, multi_processor_count=" << prop.multiProcessorCount + << ")"; + return stream.str(); + }); + + // m.def( + // "_zoom_record_memory_history_legacy", + // static_cast( + // torch::zoom::_record_memory_history)); + + // m.def( + // "_zoom_record_memory_history", + // static_cast, + // std::optional, + // const std::string&, + // size_t)>(torch::zoom::_record_memory_history)); + + m.def("_zoom_isHistoryEnabled", []() { + return c10::zoom::ZoomCachingAllocator::isHistoryEnabled(); + }); + + // m.def("_zoom_get_conv_benchmark_empty_cache", []() { + // return at::native::_cudnn_get_conv_benchmark_empty_cache(); + // }); + + // m.def("_cudnn_set_conv_benchmark_empty_cache", [](bool enable) { + // return at::native::_cudnn_set_conv_benchmark_empty_cache(enable); + // }); +} + +// We choose to ignore certain blocks that are currently allocated +// when we set the pool to its checkpoint. For those blocks, we need +// to swap out the deleter function of their corresponding blocks +// so that a deallocation is not triggered when they die.
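THCPModule_memoryStats above returns a nested dict: per-field stats ("allocated_bytes", "reserved_bytes", ...) keyed first by pool ("all", "small_pool", "large_pool") and then by counter ("current", "peak", "allocated", "freed"). A rough consumption sketch, assuming the binding ends up on torch._C like its CUDA counterpart:

import torch

stats = torch._C._zoom_memoryStats(0)  # stats for device index 0
current_allocated = stats["allocated_bytes"]["all"]["current"]
peak_reserved = stats["reserved_bytes"]["large_pool"]["peak"]
print(current_allocated, peak_reserved, stats["num_ooms"])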
+void removeStorageDeleterFns( + const std::vector& stale_live_storages, + std::unordered_set definitely_stale_pointers) { + for (c10::StorageImpl* stale_storage : stale_live_storages) { + auto ptr = stale_storage->data_ptr().get(); + auto allocated_pointer = definitely_stale_pointers.find(ptr); + TORCH_CHECK(allocated_pointer != definitely_stale_pointers.end()); + auto t = c10::zoom::ZoomCachingAllocator::get(); + bool succeeded = stale_storage->mutable_data_ptr().compare_exchange_deleter( + t->raw_deleter(), &c10::detail::deleteNothing); + + TORCH_CHECK( + succeeded, + "Unexpected deleter function on storage, could not swap function"); + } +} + +void addStorageDeleterFns( + std::vector& storages_to_add_deleters_to, + c10::zoom::ZoomCachingAllocator::CheckpointDelta& delta) { + std::unordered_map storages; + for (auto& storage : storages_to_add_deleters_to) { + storages[storage->data_ptr().get()] = storage; + } + + for (auto& data_ptr : delta.dataptrs_allocd) { + auto storage_pair = storages.find(data_ptr.get()); + if (storage_pair != storages.end()) { + auto ctx = storage_pair->second->data_ptr().get_context(); + TORCH_CHECK(ctx == nullptr, " Not expecting deleter function"); + storage_pair->second->set_data_ptr_noswap(std::move(data_ptr)); + } else { + data_ptr.release_context(); + } + } +} + +static void registerZoomPluggableAllocator(PyObject* module) { + auto m = py::handle(module).cast(); + + // NOLINTNEXTLINE(bugprone-unused-raii) + py::class_< + c10::zoom::ZoomCachingAllocator::ZoomAllocator, + std::shared_ptr>( + m, "_zoom_ZoomAllocator"); + m.def("_zoom_getAllocator", []() { + return py::cast(torch::zoom::ZoomPluggableAllocator::getCurrentAllocator()); + }); + + m.def( + "_zoom_changeCurrentAllocator", + [](const std::shared_ptr& + allocator) { + torch::zoom::ZoomPluggableAllocator::changeCurrentAllocator(allocator); + }); + py::class_< + torch::zoom::ZoomPluggableAllocator::ZoomPluggableAllocator, + c10::zoom::ZoomCachingAllocator::ZoomAllocator, + std::shared_ptr< + torch::zoom::ZoomPluggableAllocator::ZoomPluggableAllocator>>( + m, "_ZoomPluggableAllocator") + .def( + "set_init_fn", + [](torch::zoom::ZoomPluggableAllocator::ZoomPluggableAllocator& self, + uint64_t func_ptr) { + using FuncType = void(int); + std::function func = + // NOLINTNEXTLINE(performance-no-int-to-ptr) + reinterpret_cast(func_ptr); + self.set_init_fn(func); + }) + .def( + "set_reset_fn", + [](torch::zoom::ZoomPluggableAllocator::ZoomPluggableAllocator& self, + uint64_t func_ptr) { + using FuncType = void(); + std::function func = + // NOLINTNEXTLINE(performance-no-int-to-ptr) + reinterpret_cast(func_ptr); + self.set_reset_fn(func); + }) + .def( + "set_memory_fraction_fn", + [](torch::zoom::ZoomPluggableAllocator::ZoomPluggableAllocator& self, + uint64_t func_ptr) { + using FuncType = void(double, int); + std::function func = + // NOLINTNEXTLINE(performance-no-int-to-ptr) + reinterpret_cast(func_ptr); + self.set_memory_fraction_fn(func); + }) + .def( + "set_base_alloc_fn", + [](torch::zoom::ZoomPluggableAllocator::ZoomPluggableAllocator& self, + uint64_t func_ptr) { + using FuncType = void*(void*, size_t*); + std::function func = + // NOLINTNEXTLINE(performance-no-int-to-ptr) + reinterpret_cast(func_ptr); + self.set_base_alloc_fn(func); + }) + .def( + "set_record_stream_fn", + [](torch::zoom::ZoomPluggableAllocator::ZoomPluggableAllocator& self, + uint64_t func_ptr) { + using FuncType = void(void*, hipStream_t); + std::function func = + // NOLINTNEXTLINE(performance-no-int-to-ptr) + 
reinterpret_cast(func_ptr); + self.set_record_stream_fn(func); + }) + .def( + "set_begin_allocate_to_pool", + [](torch::zoom::ZoomPluggableAllocator::ZoomPluggableAllocator& self, + uint64_t func_ptr) { + using FuncType = void( + int, c10::zoom::MempoolId_t, std::function); + std::function func = + // NOLINTNEXTLINE(performance-no-int-to-ptr) + reinterpret_cast(func_ptr); + self.set_begin_allocate_to_pool(func); + }) + .def( + "set_end_allocate_to_pool_fn", + [](torch::zoom::ZoomPluggableAllocator::ZoomPluggableAllocator& self, + uint64_t func_ptr) { + using FuncType = void(int, c10::zoom::MempoolId_t); + std::function func = + // NOLINTNEXTLINE(performance-no-int-to-ptr) + reinterpret_cast(func_ptr); + self.set_end_allocate_to_pool_fn(func); + }) + .def( + "set_release_pool", + [](torch::zoom::ZoomPluggableAllocator::ZoomPluggableAllocator& self, + uint64_t func_ptr) { + using FuncType = void(int, c10::zoom::MempoolId_t); + std::function func = + // NOLINTNEXTLINE(performance-no-int-to-ptr) + reinterpret_cast(func_ptr); + self.set_release_pool(func); + }); + m.def("_zoom_customAllocator", [](uint64_t malloc_ptr, uint64_t free_ptr) { + using MallocFuncType = void*(size_t, int, hipStream_t); + using FreeFuncType = void(void*, size_t, int, hipStream_t); + std::function malloc_fn = + // NOLINTNEXTLINE(performance-no-int-to-ptr) + reinterpret_cast(malloc_ptr); + std::function free_fn = + // NOLINTNEXTLINE(performance-no-int-to-ptr) + reinterpret_cast(free_ptr); + return torch::zoom::ZoomPluggableAllocator::createCustomAllocator( + malloc_fn, free_fn); + }); + + // NOLINTNEXTLINE(bugprone-unused-raii) + py::class_< + c10::zoom::ZoomCachingAllocator::AllocatorState, + std::shared_ptr>( + m, "_zoom_ZoomAllocator_AllocatorState"); + + m.def( + "_zoom_getCheckpointState", + [](c10::DeviceIndex device, c10::zoom::MempoolId_t id) { + return c10::zoom::ZoomCachingAllocator::getCheckpointState(device, id); + }); + + m.def("_free_And_Remove_DeleterFn", [](size_t storage_impl_ptr) { + // NOLINTNEXTLINE(performance-no-int-to-ptr) + c10::StorageImpl* storage_impl = (c10::StorageImpl*)storage_impl_ptr; + auto alloc = c10::zoom::ZoomCachingAllocator::get(); + auto data_ptr = storage_impl->data_ptr().get(); + bool succeeded = storage_impl->mutable_data_ptr().compare_exchange_deleter( + alloc->raw_deleter(), c10::detail::deleteNothing); + TORCH_CHECK(succeeded, "Expected standard deleter"); + c10::zoom::ZoomCachingAllocator::raw_delete(data_ptr); + }); + + m.def( + "_set_storage_access_error_msg", [](const at::Tensor& t, std::string s) { + t.unsafeGetTensorImpl() + ->release_storage_and_set_meta_custom_data_ptr_error_msg_(s); + }); + + m.def("_has_Standard_Deleter", [](size_t storage_impl_ptr) { + // NOLINTNEXTLINE(performance-no-int-to-ptr) + c10::StorageImpl* storage_impl = (c10::StorageImpl*)storage_impl_ptr; + auto alloc = c10::zoom::ZoomCachingAllocator::get(); + return (storage_impl->data_ptr().get_deleter() == alloc->raw_deleter()); + }); + + m.def("_set_cached_tensors_enabled", [](bool enabled) { + at::caching::set_cached_tensors_enabled(enabled); + }); + + m.def("_add_cached_tensor", [](const at::Tensor& t) { + at::caching::add_cached_tensor(t); + }); + + m.def("_remove_cached_tensor", [](const at::Tensor& t) { + at::caching::remove_cached_tensor(t); + }); + + m.def("_is_cached_tensor", [](const at::Tensor& t) { + return at::caching::is_cached_tensor(t); + }); + + m.def("_storage_Use_Count", [](size_t storage_impl_ptr) { + // NOLINTNEXTLINE(performance-no-int-to-ptr) + c10::StorageImpl* storage_impl = 
(c10::StorageImpl*)storage_impl_ptr; + return c10::raw::weak_intrusive_ptr::use_count(storage_impl); + }); + + m.def( + "_tensors_data_ptrs_at_indices_equal", + [](py::list& tensors, py::list& data_ptrs, py::list& indices) { + for (size_t i = 0, end = indices.size(); i < end; ++i) { + auto index = indices[i].cast(); + auto t = tensors[index].cast(); + auto data_ptr = data_ptrs[index].cast(); + if (reinterpret_cast(t.data_ptr()) != data_ptr) { + return false; + } + } + return true; + }); + + m.def( + "_construct_Zoom_Tensor_From_Storage_And_Metadata", + [](py::dict& metadata, c10::Storage s) { + auto dtype_arg = metadata["dtype"].ptr(); + auto meta = scalarTypeToTypeMeta(toScalarType(dtype_arg)); + + constexpr c10::DispatchKeySet zoom_dks(c10::DispatchKey::PrivateUse1); + at::Tensor tensor = at::detail::make_tensor_base( + std::move(s), zoom_dks, meta); + + tensor.unsafeGetTensorImpl()->set_sizes_and_strides( + metadata["size"].cast>(), + metadata["stride"].cast>()); + tensor.unsafeGetTensorImpl()->set_storage_offset( + metadata["storage_offset"].cast()); + return tensor; + }); + + m.def( + "_zoom_beginAllocateCurrentStreamToPool", + [](c10::DeviceIndex device, c10::zoom::MempoolId_t mempool_id) { + auto stream = c10::zoom::getCurrentZoomStream(device); + TORCH_CHECK(stream, "Expected stream capture to be under way"); + c10::zoom::ZoomCachingAllocator::beginAllocateToPool( + device, mempool_id, [stream](hipStream_t target) { + return target == stream; + }); + }); + + m.def( + "_zoom_endAllocateCurrentStreamToPool", + [](c10::DeviceIndex device, c10::zoom::MempoolId_t mempool_id) { + c10::zoom::ZoomCachingAllocator::endAllocateToPool(device, mempool_id); + }); + + m.def( + "_zoom_releasePool", + [](c10::DeviceIndex device, c10::zoom::MempoolId_t mempool_id) { + c10::zoom::ZoomCachingAllocator::releasePool(device, mempool_id); + }); + + m.def( + "_zoom_checkPoolLiveAllocations", + [](c10::DeviceIndex device, + c10::zoom::MempoolId_t mempool_id, + const py::set& expected_live_allocations) { + std::unordered_set allocations; + allocations.reserve(expected_live_allocations.size()); + for (auto& elem : expected_live_allocations) { + // NOLINTNEXTLINE(performance-no-int-to-ptr) + allocations.insert(reinterpret_cast(py::cast(elem))); + } + return c10::zoom::ZoomCachingAllocator::checkPoolLiveAllocations( + device, mempool_id, allocations); + }); + + m.def( + "_zoom_setCheckpointPoolState", + [](c10::DeviceIndex device, + std::shared_ptr pps, + const std::vector& stale_storages_ptr, + const std::vector& storages_to_add_deleters_to_ptr = {}) { + std::unordered_set ptr_set; + // iterate on std::vector for determinism + std::vector ptrs; + for (size_t ptr_int : stale_storages_ptr) { + // NOLINTNEXTLINE(performance-no-int-to-ptr) + c10::StorageImpl* ptr = (c10::StorageImpl*)ptr_int; + if (!ptr_set.count(ptr)) { + ptrs.push_back(ptr); + ptr_set.insert(ptr); + } + } + auto delta = c10::zoom::ZoomCachingAllocator::setCheckpointPoolState( + device, std::move(pps)); + auto& freed_pointers = delta.ptrs_freed; + + std::unordered_set allocd_set; + for (auto& data_ptr : delta.dataptrs_allocd) { + allocd_set.insert(data_ptr.get()); + } + std::unordered_set freed_pointer_set; + size_t definite_freed_count = 0; + for (void* ptr : freed_pointers) { + if (!allocd_set.count(ptr)) { + definite_freed_count += 1; + } + freed_pointer_set.insert((ptr)); + } + // that block has already been freed, + // so even those this will error, so too will the allocator + // when the corresponding tensor dies because there is no + // 
live tensor corresponding to it + TORCH_CHECK( + ptr_set.size() >= definite_freed_count, + "Any stale tensors which are being manually freed" + " must be passed to set checkpoint"); + + removeStorageDeleterFns(ptrs, freed_pointer_set); + std::vector storages_to_add_deleters_to; + storages_to_add_deleters_to.reserve( + storages_to_add_deleters_to_ptr.size()); + for (size_t ptr_int : storages_to_add_deleters_to_ptr) { + // NOLINTNEXTLINE(performance-no-int-to-ptr) + storages_to_add_deleters_to.push_back((c10::StorageImpl*)ptr_int); + } + + addStorageDeleterFns(storages_to_add_deleters_to, delta); + }); +} + +static void bindGetDeviceProperties(PyObject* module) { + // Add method to torch.zoom + auto m = py::handle(module).cast(); + m.def( + "_get_device_properties", + [](c10::DeviceIndex device) -> hipDeviceProp_t* { + return at::zoom::getDeviceProperties(device); + }, + py::return_value_policy::reference); +} + +// Callback for python part. Used for additional initialization of python +// classes +static PyObject* THCPModule_initExtension(PyObject* self, PyObject* noargs) { +#if C10_ASAN_ENABLED + TORCH_WARN( + "torch.zoom: your pytorch binary has address sanitizer (asan) built in, " + "asan is currently not compatible with torch.zoom module, " + "you might get unexpected behavior (eg. out of memory, crash, etc.), " + "please rebuild pytorch without asan if you need to use this module"); +#endif + HANDLE_TH_ERRORS + TORCH_INTERNAL_ASSERT(!in_bad_fork); // Handled at python level + poison_fork(); + at::globalContext().lazyInitPrivateUse1(); + + auto m = THPObjectPtr(PyImport_ImportModule("torch.zoom")); + if (!m) + throw python_error(); + + auto set_module_attr = [&](const char* name, PyObject* v) { + // PyObject_SetAttrString doesn't steal reference. So no need to incref. + if (PyObject_SetAttrString(m, name, v) < 0) { + throw python_error(); + } + }; + + auto num_gpus = c10::zoom::device_count(); + auto default_zoom_generators = PyTuple_New(static_cast(num_gpus)); + for (const auto i : c10::irange(num_gpus)) { + auto cast_gen = (THPGenerator*)THPGenerator_initDefaultGenerator( + at::zoom::detail::getDefaultZoomGenerator(i)); + // This reference is meant to be given away, so no need to incref here. + PyTuple_SetItem(default_zoom_generators, i, (PyObject*)cast_gen); + } + set_module_attr("default_generators", default_zoom_generators); + bindGetDeviceProperties(m); + + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +PyObject* THCPModule_getCurrentBlasHandle_wrap( + PyObject* self, + PyObject* noargs) { + HANDLE_TH_ERRORS + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + hipblasHandle_t handle = at::zoom::getCurrentHIPBlasHandle(); + return PyLong_FromVoidPtr(handle); + END_HANDLE_TH_ERRORS +} + + +// PyObject* THCPModule_rocm_is_backward_pass( +// PyObject* _unused, +// PyObject* noargs) { +// HANDLE_TH_ERRORS +// #if USE_ROCM +// if (at::ROCmBackwardPassGuard::is_backward_pass()) { +// Py_RETURN_TRUE; +// } else { +// Py_RETURN_FALSE; +// } +// #else +// Py_RETURN_FALSE; +// #endif +// END_HANDLE_TH_ERRORS +// } + +static PyObject* THCPModule_isCurrentStreamCapturing_wrap( + PyObject* self, + PyObject* noargs) { + HANDLE_TH_ERRORS + // If there's no zoom context, at::zoom::currentStreamCaptureStatus returns + // CaptureStatus::None without initializing a context. 
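The _zoom_customAllocator and _zoom_changeCurrentAllocator bindings registered in registerZoomPluggableAllocator above take raw function-pointer addresses, so the natural Python entry point is ctypes. A hypothetical sketch; "alloc.so", "my_malloc" and "my_free" are placeholders for a user-built library matching the MallocFuncType/FreeFuncType signatures shown in the binding, and the torch._C location is an assumption:

import ctypes
import torch

# Placeholder library exporting:
#   void* my_malloc(size_t size, int device, hipStream_t stream);
#   void  my_free(void* ptr, size_t size, int device, hipStream_t stream);
lib = ctypes.CDLL("alloc.so")
malloc_ptr = ctypes.cast(lib.my_malloc, ctypes.c_void_p).value
free_ptr = ctypes.cast(lib.my_free, ctypes.c_void_p).value

allocator = torch._C._zoom_customAllocator(malloc_ptr, free_ptr)
# Must run before the default caching allocator is initialized;
# changeCurrentAllocator refuses to swap an already-initialized allocator.
torch._C._zoom_changeCurrentAllocator(allocator)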
+ if (at::zoom::currentStreamCaptureStatus() == at::zoom::CaptureStatus::None) { + Py_RETURN_FALSE; + } else { + Py_RETURN_TRUE; + } + END_HANDLE_TH_ERRORS +} + +// NOLINTNEXTLINE(*-c-arrays*, *-global-variables) +static struct PyMethodDef _THCPModule_methods[] = { + {"_zoom_init", THCPModule_initExtension, METH_NOARGS, nullptr}, + {"_zoom_setDevice", THCPModule_setDevice_wrap, METH_O, nullptr}, + {"_zoom_exchangeDevice", THCPModule_exchangeDevice, METH_O, nullptr}, + {"_zoom_maybeExchangeDevice", + THCPModule_maybeExchangeDevice, + METH_O, + nullptr}, + {"_zoom_getDevice", THCPModule_getDevice_wrap, METH_NOARGS, nullptr}, + {"_zoom_getDeviceCount", + THCPModule_getDeviceCount_wrap, + METH_NOARGS, + nullptr}, + {"_zoom_canDeviceAccessPeer", + THCPModule_canDeviceAccessPeer_wrap, + METH_VARARGS, + nullptr}, + {"_zoom_getArchFlags", THCPModule_getArchFlags, METH_NOARGS, nullptr}, + {"_zoom_isInBadFork", THCPModule_isInBadFork, METH_NOARGS, nullptr}, + {"_zoom_getCurrentStream", + THCPModule_getCurrentStream_wrap, + METH_O, + nullptr}, + {"_zoom_getCurrentRawStream", + THCPModule_getCurrentStream_raw, + METH_O, + nullptr}, + {"_zoom_getDefaultStream", + THCPModule_getDefaultStream_wrap, + METH_O, + nullptr}, + {"_zoom_getCurrentBlasHandle", + THCPModule_getCurrentBlasHandle_wrap, + METH_NOARGS, + nullptr}, + {"_zoom_isCurrentStreamCapturing", + THCPModule_isCurrentStreamCapturing_wrap, + METH_NOARGS, + nullptr}, + {"_zoom_setStream", + castPyCFunctionWithKeywords(THCPModule_setStream_wrap), + METH_VARARGS | METH_KEYWORDS, + nullptr}, + {"_zoom_getCompiledVersion", + THCPModule_getCompiledVersion, + METH_NOARGS, + nullptr}, + {"_zoom_hasPrimaryContext", THCPModule_hasPrimaryContext, METH_O, nullptr}, + {"_zoom_setMemoryFraction", + THCPModule_setMemoryFraction, + METH_VARARGS, + nullptr}, + {"_zoom_emptyCache", THCPModule_emptyCache, METH_NOARGS, nullptr}, + {"_zoom_memoryStats", THCPModule_memoryStats, METH_O, nullptr}, + {"_zoom_resetAccumulatedMemoryStats", + THCPModule_resetAccumulatedMemoryStats, + METH_O, + nullptr}, + {"_zoom_resetPeakMemoryStats", + THCPModule_resetPeakMemoryStats, + METH_O, + nullptr}, + {"_zoom_memorySnapshot", THCPModule_memorySnapshot, METH_NOARGS, nullptr}, + {"_zoom_attach_out_of_memory_observer", + THCPModule_attachOutOfMemoryObserver, + METH_O, + nullptr}, + {"_zoom_zoomHostAllocator", + THCPModule_zoomHostAllocator, + METH_NOARGS, + nullptr}, + {"_zoom_zoomCachingAllocator_raw_alloc", + THCPModule_zoomCachingAllocator_raw_alloc, + METH_VARARGS, + nullptr}, + {"_zoom_zoomCachingAllocator_raw_delete", + THCPModule_zoomCachingAllocator_raw_delete, + METH_O, + nullptr}, + {"_zoom_zoomCachingAllocator_set_allocator_settings", + THCPModule_zoomCachingAllocator_set_allocator_settings, + METH_O, + nullptr}, + {"_zoom_getAllocatorBackend", + THCPModule_getAllocatorBackend, + METH_NOARGS, + nullptr}, + {"_zoom_synchronize", THCPModule_zoomSynchronize, METH_NOARGS, nullptr}, + // {"_zoom_ipc_collect", THCPModule_zoomIPCCollect, METH_NOARGS, nullptr}, + // {"_zoom_sleep", THCPModule_zoomSleep, METH_O, nullptr}, + {"_zoom_lock_mutex", THCPModule_zoomLockMutex, METH_NOARGS, nullptr}, + {"_zoom_unlock_mutex", THCPModule_zoomUnlockMutex, METH_NOARGS, nullptr}, + {"_zoom_set_sync_debug_mode", + THCPModule_zoomSetSyncDebugMode, + METH_O, + nullptr}, + {"_zoom_get_sync_debug_mode", + THCPModule_zoomGetSyncDebugMode, + METH_NOARGS, + nullptr}, + // {"_zoom_jiterator_compile_and_launch_kernel", + // THCPModule_zoomJiteratorCompileAndLaunchKernel, + // METH_VARARGS, + // nullptr}, 
+ // {"_rocm_is_backward_pass", + // THCPModule_rocm_is_backward_pass, + // METH_NOARGS, + // nullptr}, + {nullptr}}; + +PyMethodDef* THCPModule_methods() { + return _THCPModule_methods; +} + +namespace torch::zoom { + +namespace shared { + +void initHiprtBindings(PyObject* module); +// void initNvtxBindings(PyObject* module); +// #if defined(USE_CUDNN) || defined(USE_ROCM) +// void initCudnnBindings(PyObject* module); +// #endif + +} // namespace shared + +void initModule(PyObject* module) { +// python::initCommMethods(module); +// // As weird as it seems, this file is also compiled for ROCm, +// // so this condition might not always be true... + shared::initHiprtBindings(module); +// shared::initNvtxBindings(module); +// #if defined(USE_CUDNN) || defined(USE_ROCM) +// shared::initCudnnBindings(module); +// #endif + registerZoomDeviceProperties(module); + registerZoomPluggableAllocator(module); +} + +} // namespace torch::zoom diff --git a/torch/csrc/zoom/Module.h b/torch/csrc/zoom/Module.h new file mode 100644 index 00000000000000..2553dad7c616a8 --- /dev/null +++ b/torch/csrc/zoom/Module.h @@ -0,0 +1,11 @@ +#ifndef THCP_ZOOM_MODULE_INC +#define THCP_ZOOM_MODULE_INC + +PyObject* THCPModule_getDevice_wrap(PyObject* self); +PyObject* THCPModule_setDevice_wrap(PyObject* self, PyObject* arg); +PyObject* THCPModule_getDeviceName_wrap(PyObject* self, PyObject* arg); +PyObject* THCPModule_getDriverVersion(PyObject* self); +PyObject* THCPModule_isDriverSufficient(PyObject* self); +PyObject* THCPModule_getCurrentBlasHandle_wrap(PyObject* self); + +#endif diff --git a/torch/csrc/zoom/Stream.cpp b/torch/csrc/zoom/Stream.cpp new file mode 100644 index 00000000000000..bd14fed218e431 --- /dev/null +++ b/torch/csrc/zoom/Stream.cpp @@ -0,0 +1,216 @@ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +PyObject* THCPStreamClass = nullptr; + +static PyObject* THCPStream_pynew( + PyTypeObject* type, + PyObject* args, + PyObject* kwargs) { + HANDLE_TH_ERRORS + + const auto current_device = c10::zoom::current_device(); + + int priority = 0; + int64_t stream_id = 0; + int64_t device_index = 0; + int64_t device_type = 0; + uint64_t stream_ptr = 0; + + // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) + constexpr const char* kwlist[] = { + "priority", + "stream_id", + "device_index", + "device_type", + "stream_ptr", + nullptr}; + if (!PyArg_ParseTupleAndKeywords( + args, + kwargs, + "|iLLLK", + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + const_cast(kwlist), + &priority, + &stream_id, + &device_index, + &device_type, + &stream_ptr)) { + return nullptr; + } + + THPObjectPtr ptr(type->tp_alloc(type, 0)); + if (!ptr) { + return nullptr; + } + + if (stream_ptr) { + TORCH_CHECK( + priority == 0, "Priority was explicitly set for a external stream") + } + c10::zoom::ZoomStream stream = (stream_id || device_index || device_type) + ? c10::zoom::ZoomStream::unpack3( + stream_id, + static_cast(device_index), + static_cast(device_type)) + : stream_ptr ? 
c10::zoom::getStreamFromExternal( + // NOLINTNEXTLINE(performance-no-int-to-ptr) + reinterpret_cast(stream_ptr), + current_device) + : c10::zoom::getStreamFromPool(priority); + + THCPStream* self = (THCPStream*)ptr.get(); + self->stream_id = static_cast(stream.id()); + self->device_index = static_cast(stream.device_index()); + self->device_type = static_cast(stream.device_type()); + new (&self->zoom_stream) c10::zoom::ZoomStream(stream); + + return (PyObject*)ptr.release(); + END_HANDLE_TH_ERRORS +} + +static void THCPStream_dealloc(THCPStream* self) { + self->zoom_stream.~ZoomStream(); + Py_TYPE(self)->tp_free((PyObject*)self); +} + +static PyObject* THCPStream_get_device(THCPStream* self, void* unused) { + HANDLE_TH_ERRORS + return THPDevice_New(self->zoom_stream.device()); + END_HANDLE_TH_ERRORS +} + +static PyObject* THCPStream_get_zoom_stream(THCPStream* self, void* unused) { + HANDLE_TH_ERRORS + return PyLong_FromVoidPtr(self->zoom_stream.stream()); + END_HANDLE_TH_ERRORS +} + +static PyObject* THCPStream_get_priority(THCPStream* self, void* unused) { + HANDLE_TH_ERRORS + return THPUtils_packInt64(self->zoom_stream.priority()); + END_HANDLE_TH_ERRORS +} + +static PyObject* THCPStream_priority_range( + PyObject* _unused, + PyObject* noargs) { + HANDLE_TH_ERRORS + auto [least_priority, greatest_priority] = + c10::zoom::ZoomStream::priority_range(); + return Py_BuildValue("(ii)", least_priority, greatest_priority); + END_HANDLE_TH_ERRORS +} + +static PyObject* THCPStream_query(PyObject* _self, PyObject* noargs) { + HANDLE_TH_ERRORS + auto self = (THCPStream*)_self; + return PyBool_FromLong(self->zoom_stream.query()); + END_HANDLE_TH_ERRORS +} + +static PyObject* THCPStream_synchronize(PyObject* _self, PyObject* noargs) { + HANDLE_TH_ERRORS { + pybind11::gil_scoped_release no_gil; + auto self = (THCPStream*)_self; + self->zoom_stream.synchronize(); + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + +static PyObject* THCPStream_eq(PyObject* _self, PyObject* _other) { + HANDLE_TH_ERRORS + auto self = (THCPStream*)_self; + auto other = (THCPStream*)_other; + return PyBool_FromLong(self->zoom_stream == other->zoom_stream); + END_HANDLE_TH_ERRORS +} + +// NOLINTNEXTLINE(*-c-arrays*, *-global-variables) +static struct PyMemberDef THCPStream_members[] = {{nullptr}}; + +// NOLINTNEXTLINE(*-c-arrays*, *-global-variables) +static struct PyGetSetDef THCPStream_properties[] = { + {"zoom_stream", + (getter)THCPStream_get_zoom_stream, + nullptr, + nullptr, + nullptr}, + {"priority", (getter)THCPStream_get_priority, nullptr, nullptr, nullptr}, + {nullptr}}; + +// NOLINTNEXTLINE(*-c-arrays*, *-global-variables) +static PyMethodDef THCPStream_methods[] = { + {"query", THCPStream_query, METH_NOARGS, nullptr}, + {"synchronize", THCPStream_synchronize, METH_NOARGS, nullptr}, + {"priority_range", + THCPStream_priority_range, + METH_STATIC | METH_NOARGS, + nullptr}, + {"__eq__", THCPStream_eq, METH_O, nullptr}, + {nullptr}}; + +PyTypeObject THCPStreamType = { + PyVarObject_HEAD_INIT(nullptr, 0) "torch._C._ZoomStreamBase", /* tp_name */ + sizeof(THCPStream), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)THCPStream_dealloc, /* tp_dealloc */ + 0, /* tp_vectorcall_offset */ + nullptr, /* tp_getattr */ + nullptr, /* tp_setattr */ + nullptr, /* tp_reserved */ + nullptr, /* tp_repr */ + nullptr, /* tp_as_number */ + nullptr, /* tp_as_sequence */ + nullptr, /* tp_as_mapping */ + nullptr, /* tp_hash */ + nullptr, /* tp_call */ + nullptr, /* tp_str */ + nullptr, /* tp_getattro */ + nullptr, /* tp_setattro */ + 
nullptr, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + nullptr, /* tp_doc */ + nullptr, /* tp_traverse */ + nullptr, /* tp_clear */ + nullptr, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + nullptr, /* tp_iter */ + nullptr, /* tp_iternext */ + THCPStream_methods, /* tp_methods */ + THCPStream_members, /* tp_members */ + THCPStream_properties, /* tp_getset */ + nullptr, /* tp_base */ + nullptr, /* tp_dict */ + nullptr, /* tp_descr_get */ + nullptr, /* tp_descr_set */ + 0, /* tp_dictoffset */ + nullptr, /* tp_init */ + nullptr, /* tp_alloc */ + THCPStream_pynew, /* tp_new */ +}; + +void THCPStream_init(PyObject* module) { + Py_INCREF(THPStreamClass); + THCPStreamType.tp_base = THPStreamClass; + THCPStreamClass = (PyObject*)&THCPStreamType; + if (PyType_Ready(&THCPStreamType) < 0) { + throw python_error(); + } + Py_INCREF(&THCPStreamType); + if (PyModule_AddObject( + module, "_ZoomStreamBase", (PyObject*)&THCPStreamType) < 0) { + throw python_error(); + } +} diff --git a/torch/csrc/zoom/Stream.h b/torch/csrc/zoom/Stream.h new file mode 100644 index 00000000000000..3799abcbe09df9 --- /dev/null +++ b/torch/csrc/zoom/Stream.h @@ -0,0 +1,20 @@ +#ifndef THCP_STREAM_INC +#define THCP_STREAM_INC + +#include +#include +#include + +// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) +struct THCPStream : THPStream { + c10::zoom::ZoomStream zoom_stream; +}; +extern PyObject* THCPStreamClass; + +void THCPStream_init(PyObject* module); + +inline bool THCPStream_Check(PyObject* obj) { + return THCPStreamClass && PyObject_IsInstance(obj, THCPStreamClass); +} + +#endif // THCP_STREAM_INC diff --git a/torch/csrc/zoom/THCP.h b/torch/csrc/zoom/THCP.h new file mode 100644 index 00000000000000..c66359b3364908 --- /dev/null +++ b/torch/csrc/zoom/THCP.h @@ -0,0 +1,10 @@ +#ifndef THCP_H +#define THCP_H + +#include +#include +#include +#include +#include + +#endif diff --git a/torch/csrc/zoom/Tensor.cpp b/torch/csrc/zoom/Tensor.cpp new file mode 100644 index 00000000000000..ea97dfea288d78 --- /dev/null +++ b/torch/csrc/zoom/Tensor.cpp @@ -0,0 +1,15 @@ +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +// Order of these includes matters, which should be fixed. +// clang-format off +#include +#include + +#include + +#include +#include +#include +// clang-format on diff --git a/torch/csrc/zoom/ZoomPluggableAllocator.cpp b/torch/csrc/zoom/ZoomPluggableAllocator.cpp new file mode 100644 index 00000000000000..c6d31e4ec1ce1d --- /dev/null +++ b/torch/csrc/zoom/ZoomPluggableAllocator.cpp @@ -0,0 +1,373 @@ +#include +#include +#include +#include +#include + +#include + +namespace torch::zoom::ZoomPluggableAllocator { + +int device_count = 0; + +void custom_raw_deleter(void* ptr); + +_AllocationMetadata::_AllocationMetadata() + : size(0), device_idx(-1), stream{} {} + +_AllocationMetadata::_AllocationMetadata( + size_t size, + c10::DeviceIndex device_idx, + hipStream_t stream) + : size(size), device_idx(device_idx), stream(stream) {} + +// This is a fast API to just register allocators +// based on function pointers (ie. 
external .so libraries) +// This avoids having to link against libtorch for C++ based custom allocators +// And also use this from python +ZoomPluggableAllocator::ZoomPluggableAllocator( + std::function alloc_fn, + std::function free_fn) + : alloc_fn_(std::move(alloc_fn)), free_fn_(std::move(free_fn)) {} + +ZoomPluggableAllocator::ZoomPluggableAllocator(ZoomPluggableAllocator& other) + : alloc_fn_(other.alloc_fn_), + free_fn_(other.free_fn_), + init_fn_(other.init_fn_), + reset_fn_(other.reset_fn_), + memory_fraction_fn_(other.memory_fraction_fn_), + base_alloc_fn_(other.base_alloc_fn_), + record_stream_fn_(other.record_stream_fn_), + begin_allocate_to_pool_fn_(other.begin_allocate_to_pool_fn_), + end_allocate_to_pool_fn_(other.end_allocate_to_pool_fn_), + relase_pool_fn_(other.relase_pool_fn_) {} + +void ZoomPluggableAllocator::set_init_fn(std::function init_fn) { + init_fn_ = std::move(init_fn); +} + +void ZoomPluggableAllocator::set_reset_fn(std::function reset_fn) { + reset_fn_ = std::move(reset_fn); +} + +void ZoomPluggableAllocator::set_memory_fraction_fn( + std::function memory_fraction_fn) { + memory_fraction_fn_ = std::move(memory_fraction_fn); +} + +void ZoomPluggableAllocator::set_base_alloc_fn( + std::function base_alloc_fn) { + base_alloc_fn_ = std::move(base_alloc_fn); +} + +void ZoomPluggableAllocator::set_record_stream_fn( + std::function record_stream_fn) { + record_stream_fn_ = std::move(record_stream_fn); +} + +void ZoomPluggableAllocator::set_begin_allocate_to_pool( + std::function< + void(int, c10::zoom::MempoolId_t, std::function)> + capture_begin_fn) { + begin_allocate_to_pool_fn_ = std::move(capture_begin_fn); +} + +void ZoomPluggableAllocator::set_end_allocate_to_pool_fn( + std::function capture_about_to_end_fn) { + end_allocate_to_pool_fn_ = std::move(capture_about_to_end_fn); +} + +void ZoomPluggableAllocator::set_release_pool( + std::function capture_destroy_fn) { + relase_pool_fn_ = std::move(capture_destroy_fn); +} + +void* ZoomPluggableAllocator::malloc( + size_t size, + c10::DeviceIndex device, + hipStream_t stream) { + void* r = alloc_fn_(size, device, stream); + { + const std::lock_guard lock(allocator_mutex_); + allocation_metadata_.emplace(r, _AllocationMetadata(size, device, stream)); + } + return r; +} + +c10::DataPtr ZoomPluggableAllocator::allocate(size_t size) { + c10::DeviceIndex device = -1; + C10_ZOOM_CHECK(c10::zoom::GetDevice(&device)); + hipStream_t stream = c10::zoom::getCurrentZoomStream(device); + void* r = this->malloc(size, device, stream); + c10::DataPtr data_ptr = { + r, r, raw_deleter(), c10::Device(c10::DeviceType::PrivateUse1, device)}; + return data_ptr; +} + +c10::DeleterFnPtr ZoomPluggableAllocator::raw_deleter() const { + return &custom_raw_deleter; +} + +void* ZoomPluggableAllocator::raw_alloc(size_t nbytes) { + c10::DeviceIndex device = -1; + C10_ZOOM_CHECK(c10::zoom::GetDevice(&device)); + hipStream_t stream = c10::zoom::getCurrentZoomStream(device); + return malloc(nbytes, device, stream); +} + +void* ZoomPluggableAllocator::raw_alloc_with_stream( + size_t nbytes, + hipStream_t stream) { + c10::DeviceIndex device = -1; + C10_ZOOM_CHECK(c10::zoom::GetDevice(&device)); + return malloc(nbytes, device, stream); +} + +void ZoomPluggableAllocator::raw_delete(void* ptr) { + hipStream_t stream{}; + c10::DeviceIndex device_idx = -1; + size_t size = 0; + { + const std::lock_guard lock(allocator_mutex_); + TORCH_CHECK( + allocation_metadata_.count(ptr), + "Trying to free a pointer not allocated here"); + _AllocationMetadata& metadata = 
allocation_metadata_[ptr]; + size = metadata.size; + device_idx = metadata.device_idx; + stream = metadata.stream; + allocation_metadata_.erase(ptr); + } + free_fn_(ptr, size, device_idx, stream); +} + +void ZoomPluggableAllocator::init(int device_count) { + if (init_fn_) { + init_fn_(device_count); + } + initialized_ = true; +} + +bool ZoomPluggableAllocator::initialized() { + return initialized_; +} + +void ZoomPluggableAllocator::setMemoryFraction( + double fraction, + c10::DeviceIndex device) { + if (memory_fraction_fn_) { + memory_fraction_fn_(fraction, device); + } +} + +void ZoomPluggableAllocator::emptyCache() { + if (reset_fn_) { + return reset_fn_(); + } +} + +void ZoomPluggableAllocator::cacheInfo( + c10::DeviceIndex device, + size_t* largestBlock) { + TORCH_CHECK( + false, + "ZoomPluggableAllocator does not yet support cacheInfo. " + "If you need it, please file an issue describing your use case."); +} + +void* ZoomPluggableAllocator::getBaseAllocation(void* ptr, size_t* size) { + if (base_alloc_fn_) { + return base_alloc_fn_(ptr, size); + } else { + return ptr; + } +} + +void ZoomPluggableAllocator::recordStream( + const c10::DataPtr& ptr, + streamType stream) { + if (record_stream_fn_) { + record_stream_fn_(ptr.get(), stream); + } +} + +c10::zoom::ZoomCachingAllocator::DeviceStats ZoomPluggableAllocator:: + getDeviceStats(c10::DeviceIndex device) { + TORCH_CHECK( + false, + "ZoomPluggableAllocator does not yet support getDeviceStats. " + "If you need it, please file an issue describing your use case."); +} + +void ZoomPluggableAllocator::resetAccumulatedStats(c10::DeviceIndex device) { + TORCH_CHECK( + false, + "ZoomPluggableAllocator does not yet support resetAccumulatedStats. " + "If you need it, please file an issue describing your use case."); +} + +void ZoomPluggableAllocator::resetPeakStats(c10::DeviceIndex device) { + TORCH_CHECK( + false, + "ZoomPluggableAllocator does not yet support resetPeakStats. " + "If you need it, please file an issue describing your use case."); +} + +c10::zoom::ZoomCachingAllocator::SnapshotInfo ZoomPluggableAllocator:: + snapshot() { + TORCH_CHECK( + false, + "ZoomPluggableAllocator does not yet support snapshot. " + "If you need it, please file an issue describing your use case."); +} + +std::shared_ptr ZoomPluggableAllocator::getIpcDevPtr(std::string handle) { + TORCH_CHECK( + false, + "ZoomPluggableAllocator does not yet support getIpcDevPtr. " + "If you need it, please file an issue describing your use case."); +} + +// HIPGraph interactions +void ZoomPluggableAllocator::beginAllocateToPool( + c10::DeviceIndex device, + c10::zoom::MempoolId_t mempool_id, + std::function filter) { + if (begin_allocate_to_pool_fn_) { + begin_allocate_to_pool_fn_(device, mempool_id, std::move(filter)); + } +} + +void ZoomPluggableAllocator::endAllocateToPool( + c10::DeviceIndex device, + c10::zoom::MempoolId_t mempool_id) { + if (end_allocate_to_pool_fn_) { + end_allocate_to_pool_fn_(device, mempool_id); + } +} + +void ZoomPluggableAllocator::releasePool( + c10::DeviceIndex device, + c10::zoom::MempoolId_t mempool_id) { + if (relase_pool_fn_) { + relase_pool_fn_(device, mempool_id); + } +} + +void ZoomPluggableAllocator::recordHistory( + bool enabled, + c10::zoom::ZoomCachingAllocator::CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + c10::zoom::ZoomCachingAllocator::RecordContext when) { + TORCH_CHECK( + false, + "ZoomPluggableAllocator does not yet support recordHistory. 
" + "If you need it, please file an issue describing your use case."); +} + +void ZoomPluggableAllocator::attachOutOfMemoryObserver( + c10::zoom::ZoomCachingAllocator::OutOfMemoryObserver observer) { + TORCH_CHECK( + false, + "ZoomPluggableAllocator does not yet support attachOutOfMemoryObserver. " + "If you need it, please file an issue describing your use case."); +} + +void ZoomPluggableAllocator::attachAllocatorTraceTracker( + c10::zoom::ZoomCachingAllocator::AllocatorTraceTracker tracker) { + TORCH_CHECK( + false, + "ZoomPluggableAllocator does not support attachAllocatorTraceTracker. " + "attachAllocatorTraceTracker is only used inside Pytorch."); +} + +std::shared_ptr +ZoomPluggableAllocator::getCheckpointState( + c10::DeviceIndex device, + c10::zoom::MempoolId_t id) { + TORCH_CHECK( + false, + "ZoomPluggableAllocator does not yet support getCheckpointState. " + "If you need it, please file an issue describing your use case."); +} + +c10::zoom::ZoomCachingAllocator::CheckpointDelta ZoomPluggableAllocator:: + setCheckpointPoolState( + c10::DeviceIndex device, + std::shared_ptr pps) { + TORCH_CHECK( + false, + "ZoomPluggableAllocator does not yet support setCheckpointPoolState. " + "If you need it, please file an issue describing your use case."); +} + +void ZoomPluggableAllocator::enablePeerAccess( + c10::DeviceIndex dev, + c10::DeviceIndex dev_to_access) { + c10::zoom::ZoomGuard device_guard(dev); + hipError_t err = hipDeviceEnablePeerAccess(dev_to_access, 0); + if (err == hipErrorPeerAccessAlreadyEnabled) { + // ignore and clear the error if access was already enabled + (void)hipGetLastError(); + } else { + C10_ZOOM_CHECK(err); + } +} + +hipError_t ZoomPluggableAllocator::memcpyAsync( + void* dst, + int dstDevice, + const void* src, + int srcDevice, + size_t count, + hipStream_t stream, + bool p2p_enabled) { + return hipMemcpyAsync(dst, src, count, hipMemcpyDeviceToDevice, stream); +} + +std::string ZoomPluggableAllocator::name() { + return "pluggable"; +} + +void ZoomPluggableAllocator::copy_data( + void* dest, + const void* src, + std::size_t count) const { + C10_ZOOM_CHECK( + hipMemcpy(dest, src, count, hipMemcpyKind::hipMemcpyDeviceToDevice)); +} + +std::shared_ptr + current_custom_allocator; + +std::shared_ptr +getCurrentAllocator() { + return current_custom_allocator; +} + +// TODO: add more functions in the argument +std::shared_ptr +createCustomAllocator( + std::function alloc_fn, + std::function free_fn) { + std::shared_ptr allocator( + new ZoomPluggableAllocator(std::move(alloc_fn), std::move(free_fn))); + allocator->init(device_count); + return allocator; +} + +void changeCurrentAllocator( + const std::shared_ptr& + allocator) { + TORCH_CHECK( + !c10::zoom::ZoomCachingAllocator::allocator.load()->initialized(), + "Can't swap an already initialized allocator"); + c10::zoom::ZoomCachingAllocator::allocator.store(allocator.get()); + current_custom_allocator = allocator; +} + +void custom_raw_deleter(void* ptr) { + current_custom_allocator->raw_delete(ptr); +} + +} // namespace torch::zoom::ZoomPluggableAllocator diff --git a/torch/csrc/zoom/ZoomPluggableAllocator.h b/torch/csrc/zoom/ZoomPluggableAllocator.h new file mode 100644 index 00000000000000..b2baf8671191c6 --- /dev/null +++ b/torch/csrc/zoom/ZoomPluggableAllocator.h @@ -0,0 +1,147 @@ +#pragma once + +#include +#include +#include +#include + +#include + +#include + +namespace torch::zoom::ZoomPluggableAllocator { +using streamType = c10::zoom::ZoomStream; + +std::shared_ptr +getCurrentAllocator(); +std::shared_ptr 
+createCustomAllocator( + std::function alloc_fn, + std::function free_fn); +void changeCurrentAllocator( + const std::shared_ptr& + allocator); + +struct _AllocationMetadata { + _AllocationMetadata(); + _AllocationMetadata( + size_t size, + c10::DeviceIndex device_idx, + hipStream_t stream); + size_t size; + c10::DeviceIndex device_idx; + hipStream_t stream; +}; + +struct ZoomPluggableAllocator + : public c10::zoom::ZoomCachingAllocator::ZoomAllocator { + ZoomPluggableAllocator( + std::function alloc_fn, + std::function free_fn); + + ZoomPluggableAllocator(ZoomPluggableAllocator& other); + + void set_init_fn(std::function init_fn); + + void set_reset_fn(std::function reset_fn); + + void set_memory_fraction_fn( + std::function memory_fraction_fn); + + void set_base_alloc_fn(std::function base_alloc_fn); + + void set_record_stream_fn( + std::function record_stream_fn); + + void set_begin_allocate_to_pool( + std::function< + void(int, c10::zoom::MempoolId_t, std::function)> + capture_begin_fn); + + void set_end_allocate_to_pool_fn( + std::function capture_about_to_end_fn); + + void set_release_pool( + std::function capture_destroy_fn); + + void* malloc(size_t size, c10::DeviceIndex device, hipStream_t stream); + + c10::DataPtr allocate(size_t size) override; + c10::DeleterFnPtr raw_deleter() const override; + + void* raw_alloc(size_t nbytes) override; + void* raw_alloc_with_stream(size_t nbytes, hipStream_t stream) override; + void raw_delete(void* ptr) override; + void init(int device_count) override; + bool initialized() override; + void setMemoryFraction(double fraction, c10::DeviceIndex device) override; + void emptyCache() override; + void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) override; + void* getBaseAllocation(void* ptr, size_t* size) override; + + void recordStream(const c10::DataPtr&, streamType stream) override; + + c10::zoom::ZoomCachingAllocator::DeviceStats getDeviceStats( + c10::DeviceIndex device) override; + void resetAccumulatedStats(c10::DeviceIndex device) override; + void resetPeakStats(c10::DeviceIndex device) override; + c10::zoom::ZoomCachingAllocator::SnapshotInfo snapshot() override; + void beginAllocateToPool( + c10::DeviceIndex device, + c10::zoom::MempoolId_t mempool_id, + std::function) override; + void endAllocateToPool( + c10::DeviceIndex device, + c10::zoom::MempoolId_t mempool_id) override; + void releasePool(c10::DeviceIndex device, c10::zoom::MempoolId_t mempool_id) + override; + std::shared_ptr getIpcDevPtr(std::string handle) override; + void recordHistory( + bool enabled, + c10::zoom::ZoomCachingAllocator::CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + c10::zoom::ZoomCachingAllocator::RecordContext when) override; + void attachOutOfMemoryObserver( + c10::zoom::ZoomCachingAllocator::OutOfMemoryObserver observer) override; + void attachAllocatorTraceTracker( + c10::zoom::ZoomCachingAllocator::AllocatorTraceTracker tracker) override; + std::shared_ptr + getCheckpointState(c10::DeviceIndex device, c10::zoom::MempoolId_t id) + override; + c10::zoom::ZoomCachingAllocator::CheckpointDelta setCheckpointPoolState( + c10::DeviceIndex device, + std::shared_ptr pps) + override; + void enablePeerAccess(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) + override; + hipError_t memcpyAsync( + void* dst, + int dstDevice, + const void* src, + int srcDevice, + size_t count, + hipStream_t stream, + bool p2p_enabled) override; + std::string name() override; + void copy_data(void* dest, const void* src, std::size_t count) const 
final; + + protected: + std::function alloc_fn_; + std::function free_fn_; + std::function init_fn_; + std::function reset_fn_; + std::function memory_fraction_fn_; + std::function base_alloc_fn_; + std::function record_stream_fn_; + std::function< + void(int, c10::zoom::MempoolId_t, std::function)> + begin_allocate_to_pool_fn_; + std::function end_allocate_to_pool_fn_; + std::function relase_pool_fn_; + std::mutex allocator_mutex_; + // We do the bookeeping here in order to simplify custom allocators + std::unordered_map allocation_metadata_; + + bool initialized_ = false; +}; +} // namespace torch::zoom::ZoomPluggableAllocator diff --git a/torch/csrc/zoom/comm.cpp b/torch/csrc/zoom/comm.cpp new file mode 100644 index 00000000000000..d66450ab97a8b0 --- /dev/null +++ b/torch/csrc/zoom/comm.cpp @@ -0,0 +1,508 @@ +#include + +#include +#include + +#ifdef USE_NCCL +#include +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace torch::zoom { +using namespace at; +using namespace torch::autograd; + +// Some operations can be performed more efficiently if we're handling tensors +// of a single type only. Adding this logic directly in the loop makes it a bit +// ugly, so here's a helper for it. +struct unique_type_checker { + void show(size_t type_id) { + if (!unique) { + return; + } + if (!type_id_) { + type_id_ = type_id; + } + + unique = type_id_.value() == type_id; + } + + std::optional type_id_; + bool unique = true; +}; + +// ***************** Broadcast ******************* +// +// Broadcast a source tensor (CPU or Zoom) to a list of Zoom devices, or Zoom +// tensors on one or more devices. + +// no checks +static inline std::vector& _broadcast_out_impl( + const Tensor& tensor, + std::vector& out_tensors) { +#ifdef USE_NCCL + std::vector nccl_list; + nccl_list.reserve(out_tensors.size() + 1); + nccl_list.emplace_back(tensor); + for (auto& out_tensor : out_tensors) { + nccl_list.emplace_back(out_tensor); + } + if (nccl::is_available(nccl_list)) { + nccl::broadcast(nccl_list); + } else { +#else + { +#endif + for (auto& out_tensor : out_tensors) { + out_tensor.copy_(tensor, /*non_blocking=*/true); + } + } + return out_tensors; +} + +std::vector& broadcast_out( + const Tensor& tensor, + std::vector& out_tensors) { + for (const auto i : c10::irange(out_tensors.size())) { + TORCH_CHECK( + out_tensors[i].is_privateuseone(), + "Expected all output tensors to be Zoom tensors, but output tensor at index ", + i, + " has device '", + out_tensors[i].device(), + "'"); + TORCH_CHECK( + out_tensors[i].sizes() == tensor.sizes(), + "Expected all output tensors to have same shape as the source tensor ", + tensor.sizes(), + ", but output tensor at index ", + i, + " has shape ", + out_tensors[i].sizes()); + } + return _broadcast_out_impl(tensor, out_tensors); +} + +std::vector broadcast(const Tensor& tensor, IntArrayRef devices) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + std::vector diff_device_dst_tensors; + diff_device_dst_tensors.reserve(devices.size()); + for (auto device : devices) { + TORCH_CHECK( + device >= 0, "Expected non-negative device index, but got ", device); + if (device != tensor.get_device()) { + diff_device_dst_tensors.emplace_back(at::empty( + tensor.sizes(), + tensor.options().device(at::Device( + DeviceType::PrivateUse1, + static_cast(device))))); // preserve memory format + } + } + _broadcast_out_impl(tensor, diff_device_dst_tensors); + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + std::vector dst_tensors; 
+ dst_tensors.reserve(devices.size()); + auto it = diff_device_dst_tensors.begin(); + for (auto device : devices) { + // NOLINTNEXTLINE(bugprone-branch-clone) + if (device != tensor.get_device()) { + dst_tensors.emplace_back(*it++); + } else { + dst_tensors.emplace_back(tensor); + } + } + TORCH_INTERNAL_ASSERT(it == diff_device_dst_tensors.end()); + return dst_tensors; +} + +// NOTE [ Version Counter in comm.*_coalesced ] +// +// broadcast_coalesced +// ~~~~~~~~~~~~~~~~~~~ +// +// In broadcast_coalesced, multiple variables may be coalesced into a single +// large one, broadcast to other devices, and the get split according to the +// original shapes. +// +// When splitting, the view operations will make all Variables broadcast +// together to share a single version counter, because they are all views of the +// large Variable. However, that large Variable is immediately discarded and all +// these Variables do not share storage at all. +// +// For example, when two buffers are broadcast together in `DataParallel` and +// one of them is modified in-place during `forward` but the other is needed in +// backward, autograd engine will complain. +// +// We thus re-wrap these Variables after broadcasting (i.e., effectively doing +// what is equivalent to .data in Python), and give them individual version +// counters. +// +// NB: Just calling detach() on the variables is not sufficient +// +// NB: For `device[0]` in broadcast_coalesced, the input Variables are always +// returned as-is, so **do not** re-wrap them. +// +// reduce_add_coalesced +// ~~~~~~~~~~~~~~~~~~~~ +// +// Similarly for reduce_add_coalesced, when the output are newly created +// Variables. +tensor_list2d broadcast_coalesced( + TensorList tensors, + IntArrayRef devices, + size_t buffer_size) { + TORCH_CHECK( + std::all_of( + tensors.begin(), + tensors.end(), + [&](const at::Tensor& t) { return t.get_device() == devices[0]; }), + "All tensors must be on devices[0]: ", + devices[0]); +#ifdef USE_NCCL + buffer_size = std::min(torch::zoom::nccl::get_max_count(), buffer_size); +#endif + + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + tensor_list2d outputs(devices.size()); + outputs[0] = tensors.vec(); + for (auto& o : outputs) + o.reserve(tensors.size()); + + unique_type_checker type_checker; + c10::zoom::ZoomGuard device_guard(static_cast(devices[0])); + for (auto& chunk : torch::utils::take_tensors(tensors, buffer_size)) { + auto type_id = chunk.type_id(); + type_checker.show(type_id); + std::vector results; + if (chunk.options().is_sparse()) { + auto flat_tuple = torch::utils::flatten_sparse_tensors(chunk.tensors); + auto broadcast_indices = broadcast(flat_tuple.first, devices); + auto broadcast_values = broadcast(flat_tuple.second, devices); + results.reserve(devices.size()); + for (size_t i = 1, num_devices = devices.size(); i < num_devices; ++i) { + device_guard.set_index(static_cast(devices[i])); + auto& device_outputs = outputs[i]; + auto& inds = broadcast_indices[i]; + auto& vals = broadcast_values[i]; + for (const auto& var : torch::utils::unflatten_sparse_tensors( + inds, vals, chunk.tensors)) { + // See NOTE [ Version Counter in comm.*_coalesced ] + device_outputs.emplace_back(make_variable(var.tensor_data(), false)); + } + } + } else { + auto results = broadcast( + torch::utils::flatten_dense_tensors(chunk.tensors), devices); + for (size_t i = 1, num_devices = devices.size(); i < num_devices; ++i) { + device_guard.set_index(static_cast(devices[i])); + auto& device_outputs = outputs[i]; + for (auto& var : + 
torch::utils::unflatten_dense_tensors(results[i], chunk.tensors)) { + // See NOTE [ Version Counter in comm.*_coalesced ] + device_outputs.emplace_back(make_variable(var.tensor_data(), false)); + } + } + } + } + + // If we only saw a single tensor type, then we can skip expensive reordering + if (!type_checker.unique) { + for (auto& o : outputs) + torch::utils::reorder_tensors_like(o, tensors); + } + return outputs; +} + +// ***************** Scatter ******************* +// +// Scatter a source tensor (CPU or Zoom) to a list of Zoom tensors on one or +// more devices. + +std::vector& scatter_out( + const at::Tensor& tensor, + std::vector& out_tensors, + int64_t dim, + const std::optional>>& + streams) { + TORCH_CHECK( + !out_tensors.empty(), + "Expected at least one output tensor to scatter to"); + dim = at::maybe_wrap_dim(dim, tensor); + int64_t total_size = 0; + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + std::vector chunk_sizes; + chunk_sizes.reserve(out_tensors.size()); + for (const auto i : c10::irange(out_tensors.size())) { + TORCH_CHECK( + out_tensors[i].is_privateuseone(), + "Expected all output tensors to be Zoom tensors, but output tensor at index ", + i, + " has device '", + out_tensors[i].device(), + "'"); + auto out_sizes = out_tensors[i].sizes().vec(); + bool same_ndim = out_sizes.size() == static_cast(tensor.dim()); + if (same_ndim) { + total_size += out_sizes[dim]; + chunk_sizes.emplace_back(out_sizes[dim]); + out_sizes[dim] = tensor.size(dim); + } + TORCH_CHECK( + same_ndim && out_sizes == tensor.sizes(), + "Output tensor at index ", + i, + " has incorrect shape: ", + out_tensors[i].sizes(), + ". Expected same " + "shape except for scatter dim ", + dim, + " as the source tensor: ", + at::IntArrayRef(tensor.sizes())); + } + TORCH_CHECK( + total_size == tensor.size(dim), + "Total size for output tensors along scatter dim ", + dim, + " does not match " + "the source tensor size at dim ", + dim, + ". Expected ", + tensor.size(dim), + ", but got total size ", + total_size); + + auto chunks = + tensor.split_with_sizes(/*split_sizes=*/chunk_sizes, /*dim=*/dim); + c10::zoom::OptionalZoomStreamGuard zoom_guard; + for (const auto i : c10::irange(chunks.size())) { + if (i < (streams ? streams->size() : 0U) && (*streams)[i]) { + const auto device_index = + static_cast(out_tensors[i].get_device()); + TORCH_CHECK( + (*streams)[i]->device_index() == device_index, + "Expected the device associated with the stream at index ", + i, + " (was ", + (*streams)[i]->device_index(), + ") ", + "to match the device supplied at that index ", + "(expected ", + device_index, + ")"); + zoom_guard.reset_stream(*(*streams)[i]); + } + // NB: We don't detect the case where `out_tensor` is already the correct + // view of `tensor` since that would be nontrivial and involve checking + // ptr, offset, and strides. So `scatter_out(src, src.chunk(...))` does + // more copying than `scatter(src)`. 
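+    // The copy below runs on the stream selected above (or on the destination
+    // device's current stream when no stream was supplied). With
+    // non_blocking=true it may return before the data is ready, so callers
+    // that consume the outputs right away need to synchronize those streams.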
+ out_tensors[i].copy_(chunks[i], /*non_blocking=*/true); + } + return out_tensors; +} + +std::vector scatter( + const at::Tensor& tensor, + at::IntArrayRef devices, + const std::optional>& chunk_sizes, + int64_t dim, + const std::optional>>& + streams) { + TORCH_CHECK(!devices.empty(), "Expected at least one device to scatter to"); + if (chunk_sizes.has_value()) { + TORCH_CHECK( + chunk_sizes->size() == devices.size(), + "Expected devices and chunk_sizes to be of same length, but got " + "len(devices) = ", + devices.size(), + " and len(chunk_sizes) = ", + chunk_sizes->size()); + } + dim = at::maybe_wrap_dim(dim, tensor); + std::vector chunks = chunk_sizes + ? tensor.split_with_sizes(/*split_sizes=*/*chunk_sizes, /*dim=*/dim) + : tensor.chunk( + /*chunks=*/static_cast(devices.size()), /*dim=*/dim); + c10::zoom::OptionalZoomStreamGuard zoom_guard; + for (const auto i : c10::irange(chunks.size())) { + const auto device_index = static_cast(devices[i]); + if (device_index != tensor.get_device()) { + if (i < (streams ? streams->size() : 0U) && (*streams)[i]) { + TORCH_CHECK( + (*streams)[i]->device_index() == device_index, + "Expected the device associated with the stream at index ", + i, + " (was ", + (*streams)[i]->device_index(), + ") ", + "to match the device supplied at that index ", + "(expected ", + device_index, + ")"); + zoom_guard.reset_stream(*(*streams)[i]); + } + TORCH_CHECK( + device_index >= 0, + "Expected non-negative device index, but got ", + device_index); + chunks[i] = chunks[i].to( + {DeviceType::PrivateUse1, device_index}, + /*non_blocking=*/true, + /*copy=*/false, + /*memory_format=*/at::MemoryFormat::Preserve); + } + } + return chunks; +} + +// ***************** Gather ******************* +// +// Gather a list of Zoom tensors on one or more devices to a target tensor or +// device, either CPU or Zoom. 
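+//
+// Illustrative usage (assumes two Zoom tensors t0 and t1, each of shape [2, 4]):
+//   at::Tensor out = torch::zoom::gather({t0, t1}, /*dim=*/0,
+//                                        /*destination_index=*/0);
+//   // out has shape [4, 4] and lives on Zoom device 0; passing -1 places the
+//   // result on the CPU, and std::nullopt uses the current Zoom device.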
+ +// no checks +static inline at::Tensor& _gather_out_impl( + at::TensorList tensors, + at::Tensor& out_tensor, + int64_t dim) { + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + std::vector chunk_sizes; + chunk_sizes.reserve(tensors.size()); + for (auto& tensor : tensors) { + chunk_sizes.emplace_back(tensor.size(dim)); + } + auto chunks = + out_tensor.split_with_sizes(/*split_sizes=*/chunk_sizes, /*dim=*/dim); + for (const auto i : c10::irange(tensors.size())) { + chunks[i].copy_(tensors[i], /*non_blocking=*/out_tensor.is_privateuseone()); + } + return out_tensor; +} + +at::Tensor& gather_out( + at::TensorList tensors, + at::Tensor& out_tensor, + int64_t dim) { + TORCH_CHECK(!tensors.empty(), "Expected at least one tensor to gather from"); + int64_t total_size = 0; + auto& first = tensors.front(); + const auto first_size = first.sizes(); + dim = at::maybe_wrap_dim(dim, first); + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + std::vector expected_size(first_size.begin(), first_size.end()); + for (const auto i : c10::irange(tensors.size())) { + const auto& tensor = tensors[i]; + TORCH_CHECK( + tensor.is_privateuseone(), + "Expected all input tensors to be Zoom tensors, but " + "tensor at index ", + i, + " has device '", + tensor.device(), + "'"); + TORCH_CHECK( + tensor.ndimension() == static_cast(expected_size.size()), + "Expected all input tensors to have the same number of dimensions, but ", + "tensor at index ", + i, + "has ", + tensor.ndimension(), + " dimensions, (expected ", + expected_size.size(), + ")"); + expected_size[dim] = tensor.size(dim); + for (const auto dimension : c10::irange(expected_size.size())) { + TORCH_CHECK( + expected_size[dimension] == tensor.size(dimension), + "Input tensor at index ", + i, + " has invalid shape ", + tensor.sizes(), + ", but expected ", + at::IntArrayRef(expected_size)); + } + total_size += tensor.size(dim); + } + expected_size[dim] = total_size; + TORCH_CHECK( + out_tensor.sizes() == expected_size, + "Expected out tensor to have shape ", + at::IntArrayRef(expected_size), + ", but got ", + out_tensor.sizes()) + + return _gather_out_impl(tensors, out_tensor, dim); +} + +at::Tensor gather( + at::TensorList tensors, + int64_t dim, + std::optional destination_index) { + TORCH_CHECK(!tensors.empty(), "Expected at least one tensor to gather from"); + int64_t total_size = 0; + auto& first = tensors.front(); + const auto first_size = first.sizes(); + dim = at::maybe_wrap_dim(dim, first); + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + std::vector expected_size(first_size.begin(), first_size.end()); + auto memory_format = first.suggest_memory_format(); + for (const auto i : c10::irange(tensors.size())) { + const auto& tensor = tensors[i]; + TORCH_CHECK( + tensor.is_privateuseone(), + "Expected all input tensors to be Zoom tensors, but " + "tensor at index ", + i, + " has device ", + tensor.device()); + TORCH_CHECK( + tensor.ndimension() == static_cast(expected_size.size()), + "Expected all input tensors to have the same number of dimensions, but ", + "tensor at index ", + i, + "has ", + tensor.ndimension(), + " dimensions, (expected ", + expected_size.size(), + ")"); + expected_size[dim] = tensor.size(dim); + for (const auto dimension : c10::irange(expected_size.size())) { + TORCH_CHECK( + expected_size[dimension] == tensor.size(dimension), + "Input tensor at index ", + i, + " has invalid shape ", + tensor.sizes(), + ", but expected ", + at::IntArrayRef(expected_size)); + } + total_size += tensor.size(dim); + if (memory_format != 
MemoryFormat::Contiguous && + tensor.suggest_memory_format() != memory_format) { + memory_format = MemoryFormat::Contiguous; + } + } + expected_size[dim] = total_size; + at::Device device(DeviceType::CPU); + if (!destination_index || *destination_index != -1) { + device = at::Device( + DeviceType::PrivateUse1, + destination_index ? static_cast(*destination_index) + : DeviceIndex(-1)); + } + + at::Tensor result = + at::empty(expected_size, first.options().device(device), memory_format); + return _gather_out_impl(tensors, result, dim); +} + +} // namespace torch::zoom diff --git a/torch/csrc/zoom/comm.h b/torch/csrc/zoom/comm.h new file mode 100644 index 00000000000000..27229ef3169f0a --- /dev/null +++ b/torch/csrc/zoom/comm.h @@ -0,0 +1,52 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +namespace torch::zoom { + +using tensor_list2d = std::vector>; + +TORCH_ZOOM_API std::vector& broadcast_out( + const at::Tensor& tensor, + std::vector& out_tensors); +TORCH_ZOOM_API std::vector broadcast( + const at::Tensor& tensor, + at::IntArrayRef devices); +TORCH_ZOOM_API tensor_list2d broadcast_coalesced( + at::TensorList tensors, + at::IntArrayRef devices, + size_t buffer_size); + +TORCH_ZOOM_API std::vector& scatter_out( + const at::Tensor& tensor, + std::vector& out_tensors, + int64_t dim = 0, + const std::optional>>& + streams = c10::nullopt); + +TORCH_ZOOM_API std::vector scatter( + const at::Tensor& tensor, + at::IntArrayRef devices, + const std::optional>& chunk_sizes = c10::nullopt, + int64_t dim = 0, + const std::optional>>& + streams = c10::nullopt); + +TORCH_ZOOM_API at::Tensor& gather_out( + at::TensorList tensors, + at::Tensor& out_tensor, + int64_t dim); + +TORCH_ZOOM_API at::Tensor gather( + at::TensorList tensors, + int64_t dim, + std::optional destination_index); + +} // namespace torch::zoom diff --git a/torch/csrc/zoom/device_set.h b/torch/csrc/zoom/device_set.h new file mode 100644 index 00000000000000..14226ef2e1c92f --- /dev/null +++ b/torch/csrc/zoom/device_set.h @@ -0,0 +1,11 @@ +#pragma once + +#include +#include +#include + +namespace torch { + +using device_set = std::bitset; + +} // namespace torch diff --git a/torch/csrc/zoom/memory_snapshot.cpp b/torch/csrc/zoom/memory_snapshot.cpp new file mode 100644 index 00000000000000..79ed6cc9d19ced --- /dev/null +++ b/torch/csrc/zoom/memory_snapshot.cpp @@ -0,0 +1,376 @@ +#include +#include +#include +#include +#include +#include + +namespace torch::zoom { + +using c10::Dict; +using c10::IValue; +using torch::jit::Pickler; + +using c10::zoom::ZoomCachingAllocator::SegmentInfo; + +namespace { +std::string write_pickle(const IValue& v) { + std::vector result; + { + auto writer = [&](const char* data, size_t size) { + result.insert(result.end(), data, data + size); + }; + Pickler pickler(writer, nullptr, nullptr, nullptr, nullptr, false); + pickler.protocol(); + pickler.pushIValue(v); + pickler.stop(); + } + return std::string(result.begin(), result.end()); +} +Dict new_dict() { + return Dict(c10::AnyType::get(), c10::AnyType::get()); +} +c10::List new_list() { + return List(c10::AnyType::get()); +} + +std::vector ivalue_symbolize( + std::vector& to_symbolize) { + // we dedup repeated to_symbolize objects to prevent + // creating a bunch of duplicated frame objects + std::unordered_map cached_frames; + std::vector unique_frames; + for (const auto& sc : to_symbolize) { + auto it = cached_frames.find(sc); + if (it == cached_frames.end()) { + cached_frames.insert({sc, unique_frames.size()}); + 
unique_frames.push_back(sc); + } + } + auto s = symbolize(unique_frames); + + IValue line_s = "line"; + IValue name_s = "name"; + IValue filename_s = "filename"; + std::vector all_frames; + for (const auto& f : s.all_frames) { + auto d = new_dict(); + d.insert(name_s, f.funcname); + d.insert(filename_s, f.filename); + d.insert(line_s, int64_t(f.lineno)); + all_frames.emplace_back(std::move(d)); + } + + std::vector py_unique_frames; + for (const auto& t : s.tracebacks) { + auto l = new_list(); + for (const auto& e : t) { + l.push_back(all_frames.at(e)); + } + py_unique_frames.emplace_back(std::move(l)); + } + + std::vector result; + result.reserve(to_symbolize.size()); + for (const auto& sc : to_symbolize) { + result.push_back(py_unique_frames.at(cached_frames.at(sc))); + } + return result; +} + +std::shared_ptr gather() { + return CapturedTraceback::gather(true, true, false); +} + +std::shared_ptr gather_with_cpp() { + return CapturedTraceback::gather(true, true, true); +} + +CapturedTraceback* getFromContext( + const std::shared_ptr& x) { + if (CapturedTraceback* sc = dynamic_cast(x.get())) { + return sc; + } + TORCH_CHECK( + false, + "attempting to gather stack context from the wrong StackContext type."); +} + +} // namespace + +void _record_memory_history( + bool enabled, + bool record_context, + int64_t trace_alloc_max_entries, + bool trace_alloc_record_context, + bool record_cpp_context) { + c10::zoom::ZoomCachingAllocator::CreateContextFn recorder = gather; + if (enabled && record_cpp_context) { + recorder = gather_with_cpp; + // warm up C++ stack unwinding + unwind::unwind(); + } + auto when = c10::zoom::ZoomCachingAllocator::RecordContext::NEVER; + if (trace_alloc_record_context) { + when = c10::zoom::ZoomCachingAllocator::RecordContext::ALLOC; + } else if (record_context) { + when = c10::zoom::ZoomCachingAllocator::RecordContext::STATE; + } + at::globalContext().lazyInitPrivateUse1(); + c10::zoom::ZoomCachingAllocator::recordHistory( + enabled, recorder, trace_alloc_max_entries, when); +} + +static void checkOptionIn( + const std::string& option, + std::initializer_list valid, + const char* error) { + TORCH_CHECK( + valid.end() != std::find(valid.begin(), valid.end(), option), error); +} + +void _record_memory_history( + std::optional enabled, + std::optional context, + const std::string& stacks, + size_t max_entries) { + if (enabled) { + checkOptionIn( + *enabled, + {"state", "all"}, + "expected state to be 'state', 'all', or None"); + } + if (context) { + checkOptionIn( + *context, + {"state", "alloc", "all"}, + "expected context to be 'state', 'alloc', 'all', or None"); + } + checkOptionIn( + stacks, {"python", "all"}, "expected stacks to be 'python', or 'all'"); + + c10::zoom::ZoomCachingAllocator::CreateContextFn recorder = gather; + if (enabled && stacks == "all") { + recorder = gather_with_cpp; + // warm up C++ stack unwinding + unwind::unwind(); + } + max_entries = (enabled && *enabled == "all") ? 
max_entries : 1; + auto when = c10::zoom::ZoomCachingAllocator::RecordContext::NEVER; + if (context) { + if (context == "all") { + when = c10::zoom::ZoomCachingAllocator::RecordContext::ALL; + } else if (context == "alloc") { + when = c10::zoom::ZoomCachingAllocator::RecordContext::ALLOC; + } else if (context == "state") { + when = c10::zoom::ZoomCachingAllocator::RecordContext::STATE; + } + } + at::globalContext().lazyInitPrivateUse1(); + c10::zoom::ZoomCachingAllocator::recordHistory( + enabled.has_value(), recorder, max_entries, when); +} + +std::string _memory_snapshot_pickled() { + IValue device_s = "device"; + IValue address_s = "address"; + IValue total_size_s = "total_size"; + IValue allocated_size_s = "allocated_size"; + IValue active_size_s = "active_size"; + IValue requested_size_s = "requested_size"; + IValue stream_s = "stream"; + IValue segment_type_s = "segment_type"; + IValue segment_pool_id = "segment_pool_id"; + IValue large_s = "large"; + IValue small_s = "small"; + IValue size_s = "size"; + IValue state_s = "state"; + IValue active_allocated_s = "active_allocated"; + IValue active_pending_free_s = "active_pending_free"; + IValue inactive_s = "inactive"; + IValue addr_s = "addr"; + IValue filename_s = "filename"; + IValue name_s = "name"; + IValue line_s = "line"; + IValue frames_s = "frames"; + IValue blocks_s = "blocks"; + IValue is_expandable_s = "is_expandable"; + IValue time_us_s = "time_us"; + + auto empty_frames = new_list(); + + std::vector frame_tracebacks; + std::vector> frame_dict; + + auto add_frame_key = [&](const c10::Dict& d, + const std::shared_ptr& ctx) { + if (ctx) { + frame_tracebacks.push_back(getFromContext(ctx)); + frame_dict.push_back(d); + } else { + d.insert(frames_s, empty_frames); + } + }; + + const auto segmentInfoToDict = [&](const SegmentInfo& segmentInfo) { + auto segmentDict = new_dict(); + segmentDict.insert(device_s, segmentInfo.device); + segmentDict.insert(address_s, static_cast(segmentInfo.address)); + segmentDict.insert( + total_size_s, static_cast(segmentInfo.total_size)); + segmentDict.insert( + allocated_size_s, static_cast(segmentInfo.allocated_size)); + segmentDict.insert( + active_size_s, static_cast(segmentInfo.active_size)); + segmentDict.insert( + requested_size_s, static_cast(segmentInfo.requested_size)); + segmentDict.insert(stream_s, int64_t(segmentInfo.stream)); + segmentDict.insert( + segment_type_s, (segmentInfo.is_large ? large_s : small_s)); + segmentDict.insert( + segment_pool_id, + std::tuple(segmentInfo.owner_private_pool_id)); + segmentDict.insert(is_expandable_s, segmentInfo.is_expandable); + + add_frame_key(segmentDict, segmentInfo.context_when_allocated); + + auto address = segmentInfo.address; + auto blocks = new_list(); + for (const auto& blockInfo : segmentInfo.blocks) { + auto blockDict = new_dict(); + blockDict.insert(address_s, static_cast(address)); + blockDict.insert(size_s, static_cast(blockInfo.size)); + blockDict.insert( + requested_size_s, static_cast(blockInfo.requested_size)); + blockDict.insert( + state_s, + (blockInfo.allocated + ? active_allocated_s + : (blockInfo.active ? 
active_pending_free_s : inactive_s))); + add_frame_key(blockDict, blockInfo.context_when_allocated); + address += blockInfo.size; + blocks.push_back(blockDict); + } + segmentDict.insert(blocks_s, blocks); + + return segmentDict; + }; + + auto snapshot = c10::zoom::ZoomCachingAllocator::snapshot(); + + auto segments = new_list(); + for (const auto& segmentInfo : snapshot.segments) { + segments.push_back(segmentInfoToDict(segmentInfo)); + } + + auto traces = new_list(); + IValue action_s = "action"; + IValue alloc_s = "alloc"; + IValue free_requested_s = "free_requested"; + IValue free_completed_s = "free_completed"; + IValue segment_alloc_s = "segment_alloc"; + IValue segment_free_s = "segment_free"; + IValue segment_map_s = "segment_map"; + IValue segment_unmap_s = "segment_unmap"; + IValue snapshot_s = "snapshot"; + IValue oom_s = "oom"; + IValue device_free_s = "device_free"; + + using namespace c10::zoom::ZoomCachingAllocator; + + auto action_to_str = [&](TraceEntry::Action action) { + switch (action) { + case TraceEntry::ALLOC: + return alloc_s; + case TraceEntry::FREE_REQUESTED: + return free_requested_s; + case TraceEntry::FREE_COMPLETED: + return free_completed_s; + case TraceEntry::SEGMENT_ALLOC: + return segment_alloc_s; + case TraceEntry::SEGMENT_FREE: + return segment_free_s; + case TraceEntry::OOM: + return oom_s; + case TraceEntry::SNAPSHOT: + return snapshot_s; + case TraceEntry::SEGMENT_UNMAP: + return segment_unmap_s; + case TraceEntry::SEGMENT_MAP: + return segment_map_s; + } + throw std::runtime_error("unreachable"); + }; + + for (const auto& traceInfo : snapshot.device_traces) { + auto trace = new_list(); + for (const auto& te : traceInfo) { + auto trace_entry = new_dict(); + trace_entry.insert(action_s, action_to_str(te.action_)); + trace_entry.insert( + TraceEntry::OOM == te.action_ ? 
device_free_s : addr_s, + static_cast(te.addr_)); + trace_entry.insert(size_s, (int64_t)te.size_); + trace_entry.insert(stream_s, int64_t(te.stream_)); + if (te.context_) { + auto sc = getFromContext(te.context_); + frame_tracebacks.push_back(sc); + frame_dict.push_back(trace_entry); + } + trace_entry.insert(time_us_s, te.time_.t_); + trace.push_back(trace_entry); + } + traces.push_back(trace); + } + + auto allocator_settings = new_dict(); + IValue last_allocator_settings_s = "PYTORCH_ZOOM_ALLOC_CONF"; + IValue max_split_size_s = "max_split_size"; + IValue garbage_collection_threshold_s = "garbage_collection_threshold"; + IValue expandable_segments_s = "expandable_segments"; + IValue pinned_num_register_threads_s = "pinned_num_register_threads"; + IValue release_lock_on_malloc_s = "release_lock_on_hipmalloc"; + IValue pinned_use_host_register_s = "pinned_use_zoom_host_register"; + IValue roundup_power2_divisions_s = "roundup_power2_divisions"; + + allocator_settings.insert( + last_allocator_settings_s, + snapshot.config_metadata.last_allocator_settings); + allocator_settings.insert( + max_split_size_s, int64_t(snapshot.config_metadata.max_split_size)); + allocator_settings.insert( + garbage_collection_threshold_s, + snapshot.config_metadata.garbage_collection_threshold); + allocator_settings.insert( + expandable_segments_s, snapshot.config_metadata.expandable_segments); + allocator_settings.insert( + pinned_num_register_threads_s, + int64_t(snapshot.config_metadata.pinned_num_register_threads)); + allocator_settings.insert( + release_lock_on_malloc_s, + snapshot.config_metadata.release_lock_on_malloc); + allocator_settings.insert( + pinned_use_host_register_s, + snapshot.config_metadata.pinned_use_host_register); + unsigned int roundup_key = 1; + auto roundup_settings = new_dict(); + for (const auto& v : snapshot.config_metadata.roundup_power2_divisions) { + IValue roundup_key_s = std::to_string(roundup_key); + roundup_settings.insert(roundup_key_s, int64_t(v)); + roundup_key *= 2; + } + allocator_settings.insert(roundup_power2_divisions_s, roundup_settings); + + auto result = new_dict(); + result.insert("segments", segments); + result.insert("device_traces", traces); + result.insert("allocator_settings", allocator_settings); + + auto frames = ivalue_symbolize(frame_tracebacks); + for (auto i : c10::irange(frames.size())) { + frame_dict.at(i).insert(frames_s, frames.at(i)); + } + + return write_pickle(result); +} +} // namespace torch::zoom diff --git a/torch/csrc/zoom/memory_snapshot.h b/torch/csrc/zoom/memory_snapshot.h new file mode 100644 index 00000000000000..bacf3cf0ebafb9 --- /dev/null +++ b/torch/csrc/zoom/memory_snapshot.h @@ -0,0 +1,27 @@ +#pragma once + +#include +#include +#include +#include + +namespace torch::zoom { + +// C++-only versions of these, for python use +// those defined in zoom/Module.cpp which also record python state. 
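+//
+// Illustrative C++-side usage (no Python frames are recorded here):
+//   torch::zoom::_record_memory_history("all", "all", "all",
+//                                       /*max_entries=*/100000);
+//   ... run code that allocates Zoom memory ...
+//   std::string snapshot = torch::zoom::_memory_snapshot_pickled();
+//   // `snapshot` is a pickled dict with "segments", "device_traces" and
+//   // "allocator_settings" entries, as assembled in memory_snapshot.cpp.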
+TORCH_ZOOM_API void _record_memory_history( + bool enabled, + bool record_context = true, + int64_t trace_alloc_max_entries = 1, + bool trace_alloc_record_context = false, + bool record_cpp_context = false); + +TORCH_ZOOM_API void _record_memory_history( + std::optional enabled = "all", + std::optional context = "all", + const std::string& stacks = "all", + size_t max_entries = SIZE_MAX); + +TORCH_ZOOM_API std::string _memory_snapshot_pickled(); + +} // namespace torch::zoom diff --git a/torch/csrc/zoom/python_comm.cpp b/torch/csrc/zoom/python_comm.cpp new file mode 100644 index 00000000000000..07e84b914a07b2 --- /dev/null +++ b/torch/csrc/zoom/python_comm.cpp @@ -0,0 +1,109 @@ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include + +namespace torch::zoom::python { +void initCommMethods(PyObject* module) { + auto m = py::cast(module); + m.def( + "_broadcast_coalesced", + [](std::vector& tensors, + const std::vector& devices, + size_t buffer_size) { + return broadcast_coalesced(tensors, devices, buffer_size); + }, + py::arg("tensors"), + py::arg("devices"), + py::arg("buffer_size"), + py::call_guard()) + .def( + "_broadcast", + [](at::Tensor& tensor, std::vector devices) { + return broadcast(tensor, devices); + }, + py::call_guard(), + py::arg("tensor"), + py::arg("devices")) + .def( + "_broadcast_out", + [](at::Tensor& tensor, std::vector& out_tensors) { + return broadcast_out(tensor, out_tensors); + }, + py::call_guard(), + py::arg("tensor"), + py::arg("out")) + .def( + "_scatter", + [](at::Tensor& tensor, + std::vector& devices, + std::optional> chunk_sizes, + int64_t dim, + std::optional py_streams) { + std::optional>> + streams; + if (py_streams) { + py::handle handle = *py_streams; + streams = THPUtils_PySequence_to_ZoomStreamList(handle.ptr()); + } + // Note: We're holding the GIL up to here. + pybind11::gil_scoped_release no_gil; + return scatter(tensor, devices, chunk_sizes, dim, streams); + }, + py::arg("tensor"), + py::arg("devices"), + py::arg("chunk_sizes"), + py::arg("dim"), + py::arg("streams")) + .def( + "_scatter_out", + [](at::Tensor& tensor, + std::vector& out_tensors, + int64_t dim, + std::optional py_streams) { + std::optional>> + streams; + if (py_streams) { + py::handle handle = *py_streams; + streams = THPUtils_PySequence_to_ZoomStreamList(handle.ptr()); + } + // Note: We're holding the GIL up to here. 
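+           // Converting `py_streams` above touches Python objects and must
+           // happen while the GIL is held; the scatter itself only issues
+           // device work, so release the GIL before dispatching to avoid
+           // blocking other Python threads for the duration of the copies.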
+ pybind11::gil_scoped_release no_gil; + return scatter_out(tensor, out_tensors, dim, streams); + }, + py::arg("tensor"), + py::arg("out"), + py::arg("dim"), + py::arg("streams")) + .def( + "_gather", + [](std::vector& tensors, + int64_t dim, + std::optional destination_index) { + return gather(tensors, dim, destination_index); + }, + py::arg("tensors"), + py::arg("dim"), + py::arg("destination_index"), + py::call_guard()) + .def( + "_gather_out", + [](std::vector& tensors, + at::Tensor& out_tensor, + int64_t dim) { return gather_out(tensors, out_tensor, dim); }, + py::arg("tensors"), + py::arg("out"), + py::arg("dim"), + py::call_guard()); +} +} // namespace torch::zoom::python diff --git a/torch/csrc/zoom/python_comm.h b/torch/csrc/zoom/python_comm.h new file mode 100644 index 00000000000000..de5af273adc0a3 --- /dev/null +++ b/torch/csrc/zoom/python_comm.h @@ -0,0 +1,7 @@ +#pragma once + +namespace torch::zoom::python { + +void initCommMethods(PyObject* module); + +} // namespace torch::zoom::python diff --git a/torch/csrc/zoom/shared/hiprt.cpp b/torch/csrc/zoom/shared/hiprt.cpp new file mode 100644 index 00000000000000..823806750f4eef --- /dev/null +++ b/torch/csrc/zoom/shared/hiprt.cpp @@ -0,0 +1,76 @@ +#include +#include +#include +#include + +namespace torch::zoom::shared { + +namespace { +hipError_t hipReturnSuccess() { + return hipSuccess; +} +} // namespace + +void initHiprtBindings(PyObject* module) { + auto m = py::handle(module).cast(); + + auto hiprt = m.def_submodule("_hiprt", "hip runtime bindings"); + + py::enum_( + hiprt, + "hip" + "Error") + .value("success", hipSuccess); + + hiprt.def( + "hip" + "GetErrorString", + hipGetErrorString); + hiprt.def( + "hip" + "ProfilerStart", + hipReturnSuccess + ); + hiprt.def( + "hip" + "ProfilerStop", + hipReturnSuccess + ); + hiprt.def( + "hip" + "HostRegister", + [](uintptr_t ptr, size_t size, unsigned int flags) -> hipError_t { + return C10_ZOOM_ERROR_HANDLED( + hipHostRegister((void*)ptr, size, flags)); + }); + hiprt.def( + "hip" + "HostUnregister", + [](uintptr_t ptr) -> hipError_t { + return C10_ZOOM_ERROR_HANDLED(hipHostUnregister((void*)ptr)); + }); + hiprt.def( + "hip" + "StreamCreate", + [](uintptr_t ptr) -> hipError_t { + return C10_ZOOM_ERROR_HANDLED(hipStreamCreate((hipStream_t*)ptr)); + }); + hiprt.def( + "hip" + "StreamDestroy", + [](uintptr_t ptr) -> hipError_t { + return C10_ZOOM_ERROR_HANDLED(hipStreamDestroy((hipStream_t)ptr)); + }); + hiprt.def( + "hip" + "MemGetInfo", + [](c10::DeviceIndex device) -> std::pair { + c10::zoom::ZoomGuard guard(device); + size_t device_free = 0; + size_t device_total = 0; + C10_ZOOM_CHECK(hipMemGetInfo(&device_free, &device_total)); + return {device_free, device_total}; + }); +} + +} // namespace torch::zoom::shared diff --git a/torch/csrc/zoom/utils.cpp b/torch/csrc/zoom/utils.cpp new file mode 100644 index 00000000000000..e04d93ae4b99d2 --- /dev/null +++ b/torch/csrc/zoom/utils.cpp @@ -0,0 +1,41 @@ +#include +#include +#include +#include + +// NB: It's a list of *optional* ZoomStream; when nullopt, that means to use +// whatever the current stream of the device the input is associated with was. 
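+// For example (illustrative), a Python argument such as
+//   [torch.zoom.Stream(device=0), None, torch.zoom.Stream(device=1)]
+// yields {stream_0, nullopt, stream_1}; the None entry means the op keeps the
+// current stream of whichever device that particular input lives on.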
+std::vector> +THPUtils_PySequence_to_ZoomStreamList(PyObject* obj) { + if (!PySequence_Check(obj)) { + throw std::runtime_error( + "Expected a sequence in THPUtils_PySequence_to_ZoomStreamList"); + } + THPObjectPtr seq = THPObjectPtr(PySequence_Fast(obj, nullptr)); + if (seq.get() == nullptr) { + throw std::runtime_error( + "expected PySequence, but got " + std::string(THPUtils_typename(obj))); + } + + std::vector> streams; + Py_ssize_t length = PySequence_Fast_GET_SIZE(seq.get()); + for (Py_ssize_t i = 0; i < length; i++) { + PyObject* stream = PySequence_Fast_GET_ITEM(seq.get(), i); + + if (PyObject_IsInstance(stream, THCPStreamClass)) { + // Spicy hot reinterpret cast!! + streams.emplace_back(c10::zoom::ZoomStream::unpack3( + (reinterpret_cast(stream))->stream_id, + (reinterpret_cast(stream))->device_index, + static_cast( + (reinterpret_cast(stream))->device_type))); + } else if (stream == Py_None) { + streams.emplace_back(); + } else { + // NOLINTNEXTLINE(bugprone-throw-keyword-missing) + std::runtime_error( + "Unknown data type found in stream list. Need torch.cuda.Stream or None"); + } + } + return streams; +} \ No newline at end of file diff --git a/torch/csrc/zoom/utils.h b/torch/csrc/zoom/utils.h new file mode 100644 index 00000000000000..39b9c17b60459c --- /dev/null +++ b/torch/csrc/zoom/utils.h @@ -0,0 +1,4 @@ +#include + +std::vector> +THPUtils_PySequence_to_ZoomStreamList(PyObject* obj); \ No newline at end of file diff --git a/torch/nn/functional.py b/torch/nn/functional.py index a1d2a846e75e02..844bd5ebc30f00 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -4073,7 +4073,7 @@ def interpolate(input: Tensor, size: Optional[int] = None, scale_factor: Optiona # Two levels are necessary to prevent TorchScript from touching # are_deterministic_algorithms_enabled. if not torch.jit.is_scripting(): - if torch.are_deterministic_algorithms_enabled() and input.is_cuda: + if torch.are_deterministic_algorithms_enabled() and (input.is_cuda or input.is_zoom): # Use slow decomp whose backward will be in terms of index_put # importlib is required because the import cannot be top level # (cycle) and cannot be nested (TS doesn't support) @@ -4528,7 +4528,7 @@ def pad(input: Tensor, pad: List[int], mode: str = "constant", value: Optional[f return handle_torch_function( torch.nn.functional.pad, (input,), input, pad, mode=mode, value=value) if not torch.jit.is_scripting(): - if torch.are_deterministic_algorithms_enabled() and input.is_cuda: + if torch.are_deterministic_algorithms_enabled() and (input.is_cuda or input.is_zoom): if mode == 'replicate': # Use slow decomp whose backward will be in terms of index_put. 
# importlib is required because the import cannot be top level diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index 07caa0ac3eee35..26594a33ca9d2e 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -1083,6 +1083,16 @@ def _has_sufficient_memory(device, size): device = 'cuda:0' return torch.cuda.memory.mem_get_info(device)[0] >= size + if torch.device(device).type == 'zoom': + if not torch.zoom.is_available(): + return False + gc.collect() + torch.zoom.empty_cache() + # torch.zoom.mem_get_info, aka hipMemGetInfo, returns a tuple of (free memory, total memory) of a GPU + if device == 'zoom': + device = 'zoom:0' + return torch.zoom.memory.mem_get_info(device)[0] >= size + if device == 'xla': raise unittest.SkipTest('TODO: Memory availability checks for XLA?') @@ -1318,6 +1328,12 @@ class dtypesIfCUDA(dtypes): def __init__(self, *args): super().__init__(*args, device_type='cuda') +# Overrides specified dtypes on Zoom. +class dtypesIfZoom(dtypes): + + def __init__(self, *args): + super().__init__(*args, device_type='zoom') + class dtypesIfMPS(dtypes): def __init__(self, *args): @@ -1335,6 +1351,8 @@ def onlyCPU(fn): def onlyCUDA(fn): return onlyOn('cuda')(fn) +def onlyZOOM(fn): + return onlyOn('zoom')(fn) def onlyMPS(fn): return onlyOn('mps')(fn) @@ -1362,6 +1380,17 @@ def only_fn(self, *args, **kwargs): return only_fn +def onlyCUDAAndZOOM(fn): + @wraps(fn) + def only_fn(self, *args, **kwargs): + if self.device_type not in ('cuda', 'privateuseone'): + reason = f"onlyCUDAAndZOOM: doesn't run on {self.device_type}" + raise unittest.SkipTest(reason) + + return fn(self, *args, **kwargs) + + return only_fn + def disablecuDNN(fn): @wraps(fn) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index af5dcf35b4a377..93988d025f3779 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -69,6 +69,7 @@ import torch.backends.mps import torch.backends.xnnpack import torch.cuda +import torch.zoom from torch import Tensor from torch._C import ScriptDict, ScriptList # type: ignore[attr-defined] from torch._utils_internal import get_writable_path @@ -1234,6 +1235,7 @@ def TemporaryDirectoryName(suffix=None): TEST_MPS = torch.backends.mps.is_available() TEST_XPU = torch.xpu.is_available() TEST_CUDA = torch.cuda.is_available() +TEST_ZOOM = torch.zoom.is_available() custom_device_mod = getattr(torch, torch._C._get_privateuse1_backend_name(), None) custom_device_is_available = hasattr(custom_device_mod, "is_available") and custom_device_mod.is_available() TEST_PRIVATEUSE1 = True if custom_device_is_available else False @@ -1596,6 +1598,21 @@ def wrapper(*args, **kwargs): fn(*args, **kwargs) return wrapper +def skipIfZoom(func=None, *, msg="test doesn't currently work on the ROCm stack"): + def dec_fn(fn): + reason = f"skipIfZoom: {msg}" + + @wraps(fn) + def wrapper(*args, **kwargs): + if TEST_ZOOM: # noqa: F821 + raise unittest.SkipTest(reason) + else: + return fn(*args, **kwargs) + return wrapper + if func: + return dec_fn(func) + return dec_fn + # Skips a test on CUDA if ROCm is available and its version is lower than requested. 
def skipIfRocmVersionLessThan(version=None): def dec_fn(fn): @@ -1698,6 +1715,17 @@ def __enter__(self): def __exit__(self, exception_type, exception_value, traceback): torch.cuda.set_sync_debug_mode(self.debug_mode_restore) +class ZoomSyncGuard: + def __init__(self, sync_debug_mode): + self.mode = sync_debug_mode + + def __enter__(self): + self.debug_mode_restore = torch.zoom.get_sync_debug_mode() + torch.zoom.set_sync_debug_mode(self.mode) + + def __exit__(self, exception_type, exception_value, traceback): + torch.zoom.set_sync_debug_mode(self.debug_mode_restore) + # Context manager for setting torch.__future__.set_swap_module_params_on_conversion # and automatically resetting it to its original value class SwapTensorsGuard: diff --git a/torch/testing/_internal/opinfo/core.py b/torch/testing/_internal/opinfo/core.py index 0b17a4af0eaca3..8bdbde217683ba 100644 --- a/torch/testing/_internal/opinfo/core.py +++ b/torch/testing/_internal/opinfo/core.py @@ -1344,7 +1344,7 @@ def supported_dtypes(self, device_type): if device_type == "privateuse1": device_type = torch._C._get_privateuse1_backend_name() device_type = torch.device(device_type).type - if device_type == "cuda": + if device_type == "cuda" or device_type == "zoom": return self.dtypesIfROCM if TEST_WITH_ROCM else self.dtypesIfCUDA return self.dtypes @@ -1356,7 +1356,7 @@ def supported_backward_dtypes(self, device_type): device_type = torch._C._get_privateuse1_backend_name() device_type = torch.device(device_type).type backward_dtypes = None - if device_type == "cuda": + if device_type == "cuda" or device_type == "zoom": backward_dtypes = ( self.backward_dtypesIfROCM if TEST_WITH_ROCM diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 960aa1e79d7395..50a753225e565e 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -1078,10 +1078,15 @@ def CUDAExtension(name, sources, *args, **kwargs): libraries.append('torch_cpu') libraries.append('torch_python') if IS_HIP_EXTENSION: + print("IS_HIP_EXTENSION") libraries.append('amdhip64') - libraries.append('c10_hip') - libraries.append('torch_hip') + libraries.append('rocblas') + libraries.append('hipblas') + # (Arham): commented out for zoom development + # libraries.append('c10_hip') + # libraries.append('torch_hip') else: + print("LOADING CUDA") libraries.append('cudart') libraries.append('c10_cuda') libraries.append('torch_cuda') @@ -1089,7 +1094,8 @@ def CUDAExtension(name, sources, *args, **kwargs): include_dirs = kwargs.get('include_dirs', []) - if IS_HIP_EXTENSION: + # (Arham): disable hipify + if False and IS_HIP_EXTENSION: build_dir = os.getcwd() hipify_result = hipify_python.hipify( project_directory=build_dir, @@ -1690,7 +1696,8 @@ def _jit_compile(name, try: if version != old_version: with GeneratedFileCleaner(keep_intermediates=keep_intermediates) as clean_ctx: - if IS_HIP_EXTENSION and (with_cuda or with_cudnn): + # (Arham): to disable hipifying for testing the zoom extension + if False and IS_HIP_EXTENSION and (with_cuda or with_cudnn): hipify_result = hipify_python.hipify( project_directory=build_directory, output_directory=build_directory, @@ -1866,11 +1873,12 @@ def _prepare_ldflags(extra_ldflags, with_cuda, verbose, is_standalone): else: extra_ldflags.append(f'-L{TORCH_LIB_PATH}') extra_ldflags.append('-lc10') - if with_cuda: - extra_ldflags.append('-lc10_hip' if IS_HIP_EXTENSION else '-lc10_cuda') + # (Arham): commented out to develop zoom + # if with_cuda: + # extra_ldflags.append('-lc10_hip' if IS_HIP_EXTENSION else 
'-lc10_cuda') extra_ldflags.append('-ltorch_cpu') - if with_cuda: - extra_ldflags.append('-ltorch_hip' if IS_HIP_EXTENSION else '-ltorch_cuda') + # if with_cuda: + # extra_ldflags.append('-ltorch_hip' if IS_HIP_EXTENSION else '-ltorch_cuda') extra_ldflags.append('-ltorch') if not is_standalone: extra_ldflags.append('-ltorch_python') diff --git a/torch/zoom/__init__.py b/torch/zoom/__init__.py new file mode 100644 index 00000000000000..7b5a757d08520c --- /dev/null +++ b/torch/zoom/__init__.py @@ -0,0 +1,577 @@ +import importlib +import os +import threading +import traceback +import warnings +from functools import lru_cache +from typing import Any, Callable, cast, List, Optional, Tuple, Union + +import torch +import torch._C +from torch.types import Device +from .. import device as _device +from .._utils import _dummy_type, _LazySeedTracker, classproperty +from ._utils import _get_device_index +from .streams import Event, ExternalStream, Stream + + +try: + from torch._C import _hiprt # type: ignore[attr-defined] +except ImportError: + _hiprt = None + + +# Define dummy _ZoomDeviceProperties type if PyTorch was compiled without Zoom +if hasattr(torch._C, "_ZoomDeviceProperties"): + _ZoomDeviceProperties = torch._C._ZoomDeviceProperties +else: + _ZoomDeviceProperties = _dummy_type("_ZoomDeviceProperties") # type: ignore[assignment, misc] + +if hasattr(torch._C, "_zoom_exchangeDevice"): + _exchange_device = torch._C._zoom_exchangeDevice +else: + def _exchange_device(device: int) -> int: + if device < 0: + return -1 + raise RuntimeError("PyTorch was compiled without Zoom support") + + +if hasattr(torch._C, "_zoom_maybeExchangeDevice"): + _maybe_exchange_device = torch._C._zoom_maybeExchangeDevice +else: + def _maybe_exchange_device(device: int) -> int: + if device < 0: + return -1 + raise RuntimeError("PyTorch was compiled without Zoom support") + + + +_initialized = False +_tls = threading.local() +_initialization_lock = threading.Lock() +_queued_calls: List[ + Tuple[Callable[[], None], List[str]] +] = [] # don't invoke these until initialization occurs +_is_in_bad_fork = getattr(torch._C, "_zoom_isInBadFork", lambda: False) +_device_t = Union[_device, str, int, None] +_lazy_seed_tracker = _LazySeedTracker() +_cached_device_count: Optional[int] = None + +class DeferredZoomCallError(Exception): + pass + +def get_amp_supported_dtype() -> List[torch.dtype]: + return [torch.float16, torch.bfloat16, torch.float32] + +def _is_compiled() -> bool: + r"""Return true if compile with Zoom support.""" + return hasattr(torch._C, "_zoom_getDeviceCount") + +def is_available() -> bool: + r"""Return a bool indicating if Zoom is currently available.""" + if not _is_compiled(): + return False + return torch._C._zoom_getDeviceCount() > 0 + +def is_bf16_supported(): + r"""bfloat16 is supported on AMD GPU Archs""" + return True + +def is_initialized(): + r"""Return whether PyTorch's HIP state has been initialized.""" + return _initialized and not _is_in_bad_fork() + +def init(): + r"""Initialize PyTorch's HIP state. + + You may need to call this explicitly if you are interacting with + PyTorch via its C API, as Python bindings for Zoom functionality + will not be available until this initialization takes place. + + No-op if Zoom is already initialized. + """ + _lazy_init() + + +def _lazy_init(): + global _initialized, _queued_calls + if is_initialized() or hasattr(_tls, "is_initializing"): + return + with _initialization_lock: + # We be double-checked locking, boys! 
This is OK because + # the above test was GIL protected anyway. The inner test + # is for when a thread blocked on some other thread which was + # doing the initialization; when they get the lock, they will + # find there is nothing left to do. + if is_initialized(): + return + # It is important to prevent other threads from entering _lazy_init + # immediately, while we are still guaranteed to have the GIL, because some + # of the C calls we make below will release the GIL + if _is_in_bad_fork(): + raise RuntimeError( + "Cannot re-initialize Zoom in forked subprocess. To use Zoom with " + "multiprocessing, you must use the 'spawn' start method" + ) + if not hasattr(torch._C, "_zoom_getDeviceCount"): + raise AssertionError("Torch not compiled with Zoom enabled") + if _hiprt is None: + raise AssertionError( + "HIP runtime functions unavailable. It looks like you have a broken build?" + ) + # This function throws if there's a driver initialization error, no GPUs + # are found or any other error occurs + # if "CUDA_MODULE_LOADING" not in os.environ: + # os.environ["CUDA_MODULE_LOADING"] = "LAZY" + torch._C._zoom_init() + # Some of the queued calls may reentrantly call _lazy_init(); + # we need to just return without initializing in that case. + # However, we must not let any *other* threads in! + _tls.is_initializing = True + + for calls in _lazy_seed_tracker.get_calls(): + if calls: + _queued_calls.append(calls) + + try: + for queued_call, orig_traceback in _queued_calls: + try: + queued_call() + except Exception as e: + msg = ( + f"Zoom call failed lazily at initialization with error: {str(e)}\n\n" + f"Zoom call was originally invoked at:\n\n{''.join(orig_traceback)}" + ) + raise DeferredZoomCallError(msg) from e + finally: + delattr(_tls, "is_initializing") + _initialized = True + +def hiprt(): + _lazy_init() + return _hiprt + +class hipStatus: + SUCCESS: int = 0 + ERROR_NOT_READY: int = 34 + + +class ZoomError(RuntimeError): + def __init__(self, code: int) -> None: + msg = _hiprt.hipGetErrorString(_hiprt.hipError(code)) + super().__init__(f"{msg} ({code})") + + +def check_error(res: int) -> None: + if res != _hiprt.hipError.success: + raise ZoomError(res) + + +class _DeviceGuard: + def __init__(self, index: int): + self.idx = index + self.prev_idx = -1 + + def __enter__(self): + self.prev_idx = torch.zoom._exchange_device(self.idx) + + def __exit__(self, type: Any, value: Any, traceback: Any): + self.idx = torch.zoom._maybe_exchange_device(self.prev_idx) + return False + + +class device: + r"""Context-manager that changes the selected device. + + Args: + device (torch.device or int): device index to select. It's a no-op if + this argument is a negative integer or ``None``. + """ + + def __init__(self, device: Any): + self.idx = _get_device_index(device, optional=True) + self.prev_idx = -1 + + def __enter__(self): + self.prev_idx = torch.zoom._exchange_device(self.idx) + + def __exit__(self, type: Any, value: Any, traceback: Any): + self.idx = torch.zoom._maybe_exchange_device(self.prev_idx) + return False + + +class device_of(device): + r"""Context-manager that changes the current device to that of given object. + + You can use both tensors and storages as arguments. If a given object is + not allocated on a GPU, this is a no-op. + + Args: + obj (Tensor or Storage): object allocated on the selected device. + """ + + def __init__(self, obj): + idx = obj.get_device() if obj.is_zoom else -1 + super().__init__(idx) + + +def set_device(device: _device_t) -> None: + r"""Set the current device. 
+ + Usage of this function is discouraged in favor of :any:`device`. In most + cases it's better to use ``ZOOM_VISIBLE_DEVICES`` environmental variable. + + Args: + device (torch.device or int): selected device. This function is a no-op + if this argument is negative. + """ + device = _get_device_index(device) + if device >= 0: + torch._C._zoom_setDevice(device) + + +def get_device_name(device: Optional[_device_t] = None) -> str: + r"""Get the name of a device. + + Args: + device (torch.device or int, optional): device for which to return the + name. This function is a no-op if this argument is a negative + integer. It uses the current device, given by :func:`~torch.zoom.current_device`, + if :attr:`device` is ``None`` (default). + + Returns: + str: the name of the device + """ + return get_device_properties(device).name + + +def get_device_capability(device: Optional[_device_t] = None) -> Tuple[int, int]: + r"""Get the HIP capability of a device. + + Args: + device (torch.device or int, optional): device for which to return the + device capability. This function is a no-op if this argument is + a negative integer. It uses the current device, given by + :func:`~torch.zoom.current_device`, if :attr:`device` is ``None`` + (default). + + Returns: + tuple(int, int): the major and minor HIP capability of the device + """ + prop = get_device_properties(device) + return prop.major, prop.minor + + +def get_device_properties(device: _device_t) -> _ZoomDeviceProperties: + r"""Get the properties of a device. + + Args: + device (torch.device or int or str): device for which to return the + properties of the device. + + Returns: + _ZoomDeviceProperties: the properties of the device + """ + _lazy_init() # will define _get_device_properties + device = _get_device_index(device, optional=True) + if device < 0 or device >= device_count(): + raise AssertionError("Invalid device id") + return _get_device_properties(device) # type: ignore[name-defined] + + +def can_device_access_peer(device: _device_t, peer_device: _device_t) -> bool: + r"""Check if peer access between two devices is possible.""" + _lazy_init() + device = _get_device_index(device, optional=True) + peer_device = _get_device_index(peer_device) + if device < 0 or device >= device_count(): + raise AssertionError("Invalid device id") + if peer_device < 0 or peer_device >= device_count(): + raise AssertionError("Invalid peer device id") + return torch._C._zoom_canDeviceAccessPeer(device, peer_device) + + + +def current_device() -> int: + r"""Return the index of a currently selected device.""" + _lazy_init() + return torch._C._zoom_getDevice() + +def synchronize(device: _device_t = None) -> None: + r"""Wait for all kernels in all streams on a Zoom device to complete. + + Args: + device (torch.device or int, optional): device for which to synchronize. + It uses the current device, given by :func:`~torch.zoom.current_device`, + if :attr:`device` is ``None`` (default). + """ + _lazy_init() + with torch.zoom.device(device): + return torch._C._zoom_synchronize() + +def device_count() -> int: + r"""Return the number of GPUs available.""" + global _cached_device_count + if not _is_compiled(): + return 0 + if _cached_device_count is not None: + return _cached_device_count + r = torch._C._zoom_getDeviceCount() + # NB: Do not cache the device count prior to Zoom initialization, because + # the number of devices can change due to changes to ZOOM_VISIBLE_DEVICES + # setting prior to Zoom initialization. 
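+    # For example, changing ZOOM_VISIBLE_DEVICES after an early device_count()
+    # call but before the first real initialization would make a cached value
+    # stale, so only cache once _initialized is True.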
+ if _initialized: + _cached_device_count = r + return r + +def current_stream(device: Optional[_device_t] = None) -> Stream: + r"""Return the currently selected :class:`Stream` for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + the currently selected :class:`Stream` for the current device, given + by :func:`~torch.zoom.current_device`, if :attr:`device` is ``None`` + (default). + """ + _lazy_init() + streamdata = torch._C._zoom_getCurrentStream( + _get_device_index(device, optional=True) + ) + return Stream( + stream_id=streamdata[0], device_index=streamdata[1], device_type=streamdata[2] + ) + + +def current_blas_handle(): + r"""Return cublasHandle_t pointer to current cuBLAS handle""" + _lazy_init() + return torch._C._zoom_getCurrentBlasHandle() + + +def set_sync_debug_mode(debug_mode: Union[int, str]) -> None: + r"""Set the debug mode for zoom synchronizing operations. + + Args: + debug_mode(str or int): if "default" or 0, don't error or warn on synchronizing operations, + if "warn" or 1, warn on synchronizing operations, if "error" or 2, error out synchronizing operations. + + Warning: + This is an experimental feature, and not all synchronizing operations will trigger warning or error. In + particular, operations in torch.distributed and torch.sparse namespaces are not covered yet. + """ + _lazy_init() + if isinstance(debug_mode, str): + if debug_mode == "default": + debug_mode = 0 + elif debug_mode == "warn": + debug_mode = 1 + elif debug_mode == "error": + debug_mode = 2 + else: + raise RuntimeError( + "invalid value of debug_mode, expected one of `default`, `warn`, `error`" + ) + + torch._C._zoom_set_sync_debug_mode(debug_mode) + + +def get_sync_debug_mode() -> int: + r"""Return current value of debug mode for zoom synchronizing operations.""" + _lazy_init() + return torch._C._zoom_get_sync_debug_mode() + + +################################################################################ +# Define Storage and Tensor classes +################################################################################ + + +@staticmethod # type: ignore[misc] +def _lazy_new(cls, *args, **kwargs): + _lazy_init() + # We may need to call lazy init again if we are a forked child + # del _ZoomBase.__new__ + return super(_ZoomBase, cls).__new__(cls, *args, **kwargs) + + +class _ZoomBase: + is_zoom = True + is_sparse = False + + def type(self, *args, **kwargs): + # We could use a Protocol here to tell mypy that self has `get_device` method + # but it is only available in the typing module on Python >= 3.8 + # or on typing_extensions module on Python >= 3.6 + with device(self.get_device()): # type: ignore[attr-defined] + return super().type(*args, **kwargs) # type: ignore[misc] + + __new__ = _lazy_new + + +from torch.storage import _LegacyStorage, _warn_typed_storage_removal + + +class _ZoomLegacyStorage(_LegacyStorage): + @classmethod + def from_buffer(cls, *args, **kwargs): + _warn_typed_storage_removal() + raise RuntimeError("from_buffer: Not available for Zoom storage") + + @classmethod + def _new_with_weak_ptr(cls, *args, **kwargs): + raise RuntimeError("_new_with_weak_ptr: Not available for Zoom storage") + + @classmethod + def _new_shared_filename(cls, manager, obj, size, *, device=None, dtype=None): + raise RuntimeError("_new_shared_filename: Not available for Zoom storage") + + +class ByteStorage(_ZoomLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + 
return torch.uint8 + + +class DoubleStorage(_ZoomLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.double + + +class FloatStorage(_ZoomLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.float + + +class HalfStorage(_ZoomLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.half + + +class LongStorage(_ZoomLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.long + + +class IntStorage(_ZoomLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.int + + +class ShortStorage(_ZoomLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.short + + +class CharStorage(_ZoomLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.int8 + + +class BoolStorage(_ZoomLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.bool + + +class BFloat16Storage(_ZoomLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.bfloat16 + + +class ComplexDoubleStorage(_ZoomLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.cdouble + + +class ComplexFloatStorage(_ZoomLegacyStorage): + @classproperty + def dtype(self): + _warn_typed_storage_removal() + return self._dtype + + @classproperty + def _dtype(self): + return torch.cfloat + + +del _LegacyStorage +del _ZoomLegacyStorage + +torch._storage_classes.add(DoubleStorage) +torch._storage_classes.add(FloatStorage) +torch._storage_classes.add(LongStorage) +torch._storage_classes.add(IntStorage) +torch._storage_classes.add(ShortStorage) +torch._storage_classes.add(CharStorage) +torch._storage_classes.add(ByteStorage) +torch._storage_classes.add(HalfStorage) +torch._storage_classes.add(BoolStorage) +torch._storage_classes.add(BFloat16Storage) +torch._storage_classes.add(ComplexDoubleStorage) +torch._storage_classes.add(ComplexFloatStorage) + +from .memory import * # noqa: F403 \ No newline at end of file diff --git a/torch/zoom/_memory_viz.py b/torch/zoom/_memory_viz.py new file mode 100644 index 00000000000000..8b39bebf35637d --- /dev/null +++ b/torch/zoom/_memory_viz.py @@ -0,0 +1,627 @@ +import pickle +import sys +import os +import io +import subprocess +import json +from functools import lru_cache +from typing import Any +from itertools import groupby +import base64 +import warnings +import operator + +cache = lru_cache(None) + +__all__ = ["format_flamegraph", "segments", "memory", "compare"] + +def _frame_fmt(f, full_filename=False): + i = f['line'] + fname = f['filename'] + if not full_filename: + fname = fname.split('/')[-1] + func = f['name'] + return f'{fname}:{i}:{func}' + +@cache +def _frame_filter(name, filename): + omit_functions = 
[ + "unwind::unwind", + "CapturedTraceback::gather", + "gather_with_cpp", + "_start", + "__libc_start_main", + "PyEval_", + "PyObject_", + "PyFunction_", + ] + omit_filenames = [ + "core/boxing", + "/Register", + "/Redispatch", + "pythonrun.c", + "Modules/main.c", + "Objects/call.c", + "Objects/methodobject.c", + "pycore_ceval.h", + "ceval.c", + "cpython/abstract.h", + ] + for of in omit_functions: + if of in name: + return False + for of in omit_filenames: + if of in filename: + return False + return True + +def _frames_fmt(frames, full_filename=False, reverse=False): + if reverse: + frames = reversed(frames) + return [_frame_fmt(f, full_filename) for f in frames if _frame_filter(f['name'], f['filename'])] + +def _block_extra_legacy(b): + if 'history' in b: + frames = b['history'][0].get('frames', []) + real_size = b['history'][0]['real_size'] + else: + real_size = b.get('requested_size', b['size']) + frames = [] + return frames, real_size + +def _block_extra(b): + if 'frames' not in b: + # old snapshot format made it more complicated to get frames/allocated size + return _block_extra_legacy(b) + return b['frames'], b['requested_size'] + +def format_flamegraph(flamegraph_lines, flamegraph_script=None): + if flamegraph_script is None: + flamegraph_script = f'/tmp/{os.getuid()}_flamegraph.pl' + if not os.path.exists(flamegraph_script): + import urllib.request + print(f"Downloading flamegraph.pl to: {flamegraph_script}") + urllib.request.urlretrieve( + 'https://raw.githubusercontent.com/brendangregg/FlameGraph/master/flamegraph.pl', flamegraph_script) + subprocess.check_call(['chmod', '+x', flamegraph_script]) + args = [flamegraph_script, '--countname', 'bytes'] + p = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, encoding='utf-8') + assert p.stdin is not None + assert p.stdout is not None + p.stdin.write(flamegraph_lines) + p.stdin.close() + result = p.stdout.read() + p.stdout.close() + p.wait() + assert p.wait() == 0 + return result + +def _write_blocks(f, prefix, blocks): + def frames_fragment(frames): + if not frames: + return "" + return ';'.join(_frames_fmt(frames, reverse=True)) + for b in blocks: + if 'history' not in b: + frames, accounted_for_size = _block_extra(b) + f.write(f'{prefix};{b["state"]};{frames_fragment(frames)} {accounted_for_size}\n') + else: + accounted_for_size = 0 + for h in b['history']: + sz = h['real_size'] + accounted_for_size += sz + if 'frames' in h: + frames = h['frames'] + f.write(f'{prefix};{b["state"]};{frames_fragment(frames)} {sz}\n') + else: + f.write(f'{prefix};{b["state"]}; {sz}\n') + gaps = b['size'] - accounted_for_size + if gaps: + f.write(f'{prefix};{b["state"]}; {gaps}\n') + +def segments(snapshot, format_flamegraph=format_flamegraph): + f = io.StringIO() + for seg in snapshot['segments']: + prefix = f'stream_{seg["stream"]};seg_{seg["address"]}' + _write_blocks(f, prefix, seg['blocks']) + return format_flamegraph(f.getvalue()) + +def memory(snapshot, format_flamegraph=format_flamegraph): + f = io.StringIO() + for seg in snapshot['segments']: + prefix = f'stream_{seg["stream"]}' + _write_blocks(f, prefix, seg['blocks']) + return format_flamegraph(f.getvalue()) + +def compare(before, after, format_flamegraph=format_flamegraph): + def _seg_key(seg): + return (seg['address'], seg['total_size']) + + def _seg_info(seg): + return f'stream_{seg["stream"]};seg_{seg["address"]}' + + f = io.StringIO() + + before_segs = {_seg_key(seg) for seg in before} + after_segs = {_seg_key(seg) for seg in after} + + print(f'only_before = {[a for 
a,_ in (before_segs - after_segs)]}') + print(f'only_after = {[a for a,_ in (after_segs - before_segs)]}') + + for seg in before: + if _seg_key(seg) not in after_segs: + _write_blocks(f, f'only_before;{_seg_info(seg)}', seg['blocks']) + + for seg in after: + if _seg_key(seg) not in before_segs: + _write_blocks(f, f'only_after;{_seg_info(seg)}', seg['blocks']) + + return format_flamegraph(f.getvalue()) + +def _format_size(num): + # https://stackoverflow.com/questions/1094841/get-human-readable-version-of-file-size + for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]: + if abs(num) < 1024.0: + return f"{num:3.1f}{unit}B" + num /= 1024.0 + return f"{num:.1f}YiB" + +class Bytes: + def __init__(self, value): + self.value = value + + def __add__(self, rhs): + return Bytes(self.value + rhs) + + def __repr__(self): + return _format_size(self.value) + +def calc_active(seg): + return sum(b['size'] for b in seg['blocks'] if b['state'] == 'active_allocated') + +def _report_free(free_external, free_internal): + total = free_external + free_internal + suffix = '' + if total != 0: + pct = (free_internal / total) * 100 + suffix = f' ({pct:.1f}% internal)' + return f'{Bytes(total)}{suffix}' + +PAGE_SIZE = 1024 * 1024 * 20 +legend = f"""\ + +Legend: + [a ] - a segment in the allocator + ^-- a page {Bytes(PAGE_SIZE)} of memory in the segment + a-z: pages filled with a single block's content + ' ': page is completely free + *: page if completely full with multiple blocks + 0-9: page is partially full with tensors of multiple blocks (9 == 90% full) + (X% internal) - of the free memory, X% is free because we rounded the size of the allocation. +""" + +def segsum(data): + r"""Visually reports how the allocator has filled its segments. + + This printout can help debug fragmentation issues since free fragments + will appear as gaps in this printout. The amount of free space is reported + for each segment. + We distinguish between internal free memory which occurs because the + allocator rounds the allocation size, and external free memory, which are + the gaps between allocations in a segment. 
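Both this report and the flamegraph helpers above render sizes through the `Bytes` wrapper and `_format_size` defined earlier in the file. A minimal usage sketch, assuming the module from this patch is importable and that `torch.zoom.memory._snapshot()` behaves as the CLI help below describes:

```python
# Sketch: print the segment-fill report for a snapshot taken on the zoom device.
# Sizes are rendered by the Bytes/_format_size helpers above,
# e.g. Bytes(PAGE_SIZE) prints as "20.0MiB".
import torch
from torch.zoom import _memory_viz

snap = torch.zoom.memory._snapshot()   # assumed API from this patch
print(_memory_viz.segsum(snap))
```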
+ Args: + data: snapshot dictionary created from _snapshot() + """ + segments = [] + out = io.StringIO() + out.write(f"Summary of segments >= {Bytes(PAGE_SIZE)} in size\n") + total_reserved = 0 + total_allocated = 0 + free_external = 0 + free_internal = 0 + for seg in sorted(data['segments'], key=lambda x: (x['total_size'], calc_active(x))): + total_reserved += seg['total_size'] + + seg_free_external = 0 + seg_free_internal = 0 + seg_allocated = 0 + all_ranges = [] + boffset = 0 + for b in seg['blocks']: + active = b['state'] == 'active_allocated' + if active: + _, allocated_size = _block_extra(b) + all_ranges.append((boffset, allocated_size, True)) + seg_allocated += allocated_size + seg_free_internal += b['size'] - allocated_size + else: + seg_free_external += b['size'] + + boffset += b['size'] + + total_allocated += seg_allocated + free_external += seg_free_external + free_internal += seg_free_internal + + nseg = (seg['total_size'] - 1) // PAGE_SIZE + 1 + occupied = [' ' for _ in range(nseg)] + frac = [0.0 for _ in range(nseg)] + active_size = 0 + for i, (start_, size, active) in enumerate(all_ranges): + active_size += size + finish_ = (start_ + size) + start = start_ // PAGE_SIZE + finish = (finish_ - 1) // PAGE_SIZE + 1 + m = chr(ord('a' if active else 'A') + (i % 26)) + for j in range(start, finish): + s = max(start_, j * PAGE_SIZE) + e = min(finish_, (j + 1) * PAGE_SIZE) + frac[j] += (e - s) / PAGE_SIZE + if occupied[j] != ' ': + occupied[j] = '0123456789*'[int(frac[j] * 10)] + else: + occupied[j] = m + stream = '' if seg['stream'] == 0 else f', stream_{seg["stream"]}' + body = ''.join(occupied) + assert seg_free_external + seg_free_internal + seg_allocated == seg['total_size'] + stream = f' stream_{seg["stream"]}' if seg['stream'] != 0 else '' + if seg['total_size'] >= PAGE_SIZE: + out.write(f'[{body}] {Bytes(seg["total_size"])} allocated, ' + f'{_report_free(seg_free_external, seg_free_internal)} free{stream}\n') + out.write(f'segments: {len(data["segments"])}\n') + out.write(f'total_reserved: {Bytes(total_reserved)}\n') + out.write(f'total_allocated: {Bytes(total_allocated)}\n') + internal_external = f' ({Bytes(free_internal)} internal + {Bytes(free_external)} external)' if free_internal else '' + out.write(f'total_free: {_report_free(free_external, free_internal)}\n') + out.write(legend) + assert free_internal + free_external + total_allocated == total_reserved + return out.getvalue() + +def trace(data): + out = io.StringIO() + + def format(entries): + segment_intervals : list = [] + segment_addr_to_name = {} + allocation_addr_to_name = {} + + free_names : list = [] + next_name = 0 + + def _name(): + nonlocal next_name + if free_names: + return free_names.pop() + r, m = next_name // 26, next_name % 26 + next_name += 1 + return f'{chr(ord("a") + m)}{"" if r == 0 else r}' + + def find_segment(addr): + for name, saddr, size in segment_intervals: + if addr >= saddr and addr < saddr + size: + return name, saddr + for i, seg in enumerate(data['segments']): + saddr = seg['address'] + size = seg['allocated_size'] + if addr >= saddr and addr < saddr + size: + return f'seg_{i}', saddr + return None, None + count = 0 + out.write(f'{len(entries)} entries\n') + + + total_reserved = 0 + for seg in data['segments']: + total_reserved += seg['total_size'] + + for count, e in enumerate(entries): + if e['action'] == 'alloc': + addr, size = e['addr'], e['size'] + n = _name() + seg_name, seg_addr = find_segment(addr) + if seg_name is None: + seg_name = "MEM" + offset = addr + else: + offset = addr - 
seg_addr + out.write(f'{n} = {seg_name}[{offset}:{Bytes(size)}]\n') + allocation_addr_to_name[addr] = (n, size, count) + count += size + elif e['action'] == 'free_requested': + addr, size = e['addr'], e['size'] + name, _, _ = allocation_addr_to_name.get(addr, (addr, None, None)) + out.write(f'del {name} # {Bytes(size)}\n') + elif e['action'] == 'free_completed': + addr, size = e['addr'], e['size'] + count -= size + name, _, _ = allocation_addr_to_name.get(addr, (addr, None, None)) + out.write(f'# free completed for {name} {Bytes(size)}\n') + if name in allocation_addr_to_name: + free_names.append(name) + del allocation_addr_to_name[name] + elif e['action'] == 'segment_alloc': + addr, size = e['addr'], e['size'] + name = _name() + out.write(f'{name} = hipMalloc({addr}, {Bytes(size)})\n') + segment_intervals.append((name, addr, size)) + segment_addr_to_name[addr] = name + elif e['action'] == 'segment_free': + addr, size = e['addr'], e['size'] + name = segment_addr_to_name.get(addr, addr) + out.write(f'hipFree({name}) # {Bytes(size)}\n') + if name in segment_addr_to_name: + free_names.append(name) + del segment_addr_to_name[name] + elif e['action'] == 'oom': + size = e['size'] + free = e['device_free'] + out.write(f'raise OutOfMemoryError # {Bytes(size)} requested, {Bytes(free)} free in Zoom\n') + else: + out.write(f'{e}\n') + out.write(f"TOTAL MEM: {Bytes(count)}") + for i, d in enumerate(data['device_traces']): + if d: + out.write(f'Device {i} ----------------\n') + format(d) + return out.getvalue() + + +_memory_viz_template = r""" + + + + + + + +""" + +def _format_viz(data, viz_kind, device): + if device is not None: + warnings.warn('device argument is deprecated, plots now contain all device') + buffer = pickle.dumps(data) + buffer += b'\x00' * (3 - len(buffer) % 3) + # Encode the buffer with base64 + encoded_buffer = base64.b64encode(buffer).decode('utf-8') + + json_format = json.dumps([{"name": 'snapshot.pickle', "base64": encoded_buffer}]) + return _memory_viz_template.replace('$VIZ_KIND', repr(viz_kind)) \ + .replace('$SNAPSHOT', json_format) + +def trace_plot(data, device=None, plot_segments=False): + """Generate a visualization over time of the memory usage recorded by the trace as an html file. + + Args: + data: Memory snapshot as generated from torch.zoom.memory._snapshot() + device (torch.device, optional): Generate the trace for this device, needed if multiple devices have allocations. + plot_segments (bool, optional): Plots memory returned from hipMalloc, rather than individual allocations. + Defaults to False. + + Returns: + str: HTML of visualization + """ + return _format_viz(data, 'Active Memory Timeline' if not plot_segments else 'Active Cached Memory Timeline', device) + + +def _profile_to_snapshot(profile): + import torch + from torch.profiler._memory_profiler import Action, TensorKey + from torch._C._profiler import _EventType + memory_profile = profile._memory_profile() + + allocation_stacks = {} + for event in memory_profile._op_tree.sorted_nodes: + if event.tag == _EventType.Allocation: + parent = event.parent + python_parents = [] + while parent: + if parent.tag in (_EventType.PyCall, _EventType.PyCCall): + python_parents.append(parent) + parent = parent.parent + key = TensorKey.from_allocation(event.extra_fields) + + # Corner case: If allocation doesn't have an ID (can't prove it was used as a Tensor) + # key will be None. I should add some way to identify these, I just haven't yet. 
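As a usage sketch for `trace_plot` above (hedged: it assumes the `'zoom'` device string and `torch.zoom.memory._snapshot()` from this patch work as the docstring describes):

```python
# Hypothetical end-to-end flow: allocate on the zoom device, take a snapshot,
# and write the "Active Memory Timeline" visualization to an HTML file.
import torch
from torch.zoom._memory_viz import trace_plot

x = torch.randn(1024, 1024, device="zoom")   # 'zoom' device assumed registered by this patch
snapshot = torch.zoom.memory._snapshot()     # dict with 'segments' and 'device_traces'
with open("zoom_timeline.html", "w") as f:
    f.write(trace_plot(snapshot))
```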
+ if key and event.extra_fields.alloc_size > 0: + allocation_stacks[key] = python_parents + + + device_count = torch.zoom.device_count() + snapshot = { + 'device_traces': [[] for _ in range(device_count + 1)], + 'segments': [{'device': device, + 'address': None, + 'total_size': 0, + 'stream': 0, + 'blocks': []} for device in range(device_count + 1)] + } + + def to_device(device): + if device.type == 'zoom': + return device.index + else: + return device_count + + def allocate(size, tensor_key, version, during_trace=True): + device = to_device(tensor_key.device) + addr = tensor_key.storage.ptr + + seg = snapshot['segments'][device] # type: ignore[index] + if seg['address'] is None or seg['address'] > addr: + seg['address'] = addr + seg['total_size'] = max(seg['total_size'], addr + size) # record max addr for now, we will make it the size later + category = memory_profile._categories.get(tensor_key, version) + category = category.name.lower() if category is not None else "unknown" + stack = allocation_stacks.get(tensor_key, ()) + stack = [{'filename': 'none', 'line': 0, 'name': p.name} for p in stack] + r = {'action': 'alloc', 'addr': addr, 'size': size, 'stream': 0, 'frames': stack, 'category': category} + if during_trace: + snapshot['device_traces'][device].append(r) # type: ignore[index] + return r + + def free(alloc, device): + for e in ('free_requested', 'free_completed'): + snapshot['device_traces'][device].append({'action': e, # type: ignore[index] + 'addr': alloc['addr'], + 'size': alloc['size'], + 'stream': 0, + 'frames': alloc['frames']}) + + kv_to_elem = {} + + + + # create the device trace + for time, action, (tensor_key, version), size in memory_profile.timeline: + if not isinstance(tensor_key, TensorKey): + continue + if action == Action.CREATE: + kv_to_elem[(tensor_key, version)] = allocate(size, tensor_key, version) + elif action == Action.DESTROY: + free(kv_to_elem.pop((tensor_key, version)), to_device(tensor_key.device)) + elif action == Action.INCREMENT_VERSION: + free(kv_to_elem.pop((tensor_key, version)), to_device(tensor_key.device)) + kv_to_elem[(tensor_key, version + 1)] = allocate(size, tensor_key, version + 1) + elif action == Action.PREEXISTING: + kv_to_elem[(tensor_key, version)] = allocate(size, tensor_key, version, during_trace=False) + + + # create the final snapshot state + blocks_at_end = [(to_device(tensor_key.device), event['addr'], event['size'], event['frames']) + for (tensor_key, version), event in kv_to_elem.items()] + for device, blocks in groupby(sorted(blocks_at_end), key=operator.itemgetter(0)): + seg = snapshot['segments'][device] # type: ignore[index] + last_addr = seg['address'] + for _, addr, size, frames in blocks: + if last_addr < addr: + seg['blocks'].append({'size': addr - last_addr, 'state': 'inactive'}) + seg['blocks'].append({'size': size, 'state': 'active_allocated', 'requested_size': size, 'frames': frames}) + last_addr = addr + size + if last_addr < seg['total_size']: + seg['blocks'].append({'size': seg['total_size'] - last_addr, 'state': 'inactive'}) + + snapshot['segments'] = [seg for seg in snapshot['segments'] if seg['blocks']] # type: ignore[attr-defined] + for seg in snapshot['segments']: # type: ignore[attr-defined, name-defined, no-redef] + seg['total_size'] -= seg['address'] + if not seg['blocks']: + seg['blocks'].append({'size': seg['total_size'], 'state': 'inactive'}) + + return snapshot + +def profile_plot(profile, device=None): + """Generate a visualization over time of the memory usage recorded by kineto memory profiling 
as an html file. + + Args: + profile: profile as generated by `torch.profiler.profile(profile_memory=True)` + device (torch.device, optional): Generate the trace for this device, needed if multiple devices have allocations. + + Returns: + str: HTML of visualization + """ + snapshot = _profile_to_snapshot(profile) + return _format_viz(snapshot, 'Active Memory Timeline', device) + + +def segment_plot(data: Any, device=None): + return _format_viz(data, 'Allocator State History', device) + +if __name__ == "__main__": + import os.path + thedir = os.path.realpath(os.path.dirname(__file__)) + if thedir in sys.path: + # otherwise we find zoom/random.py as random... + sys.path.remove(thedir) + import argparse + + fn_name = 'torch.zoom.memory._snapshot()' + pickled = f'pickled memory statistics from {fn_name}' + parser = argparse.ArgumentParser(description=f'Visualize memory dumps produced by {fn_name}') + + subparsers = parser.add_subparsers(dest='action') + + def _output(p): + p.add_argument('-o', '--output', default='output.svg', help='flamegraph svg (default: output.svg)') + + description = 'Prints overall allocation statistics and a visualization of how the allocators segments are currently filled.' + stats_a = subparsers.add_parser('stats', description=description) + stats_a.add_argument('input', help=pickled) + + description = 'Prints buffer of the most recent allocation events embedded in the snapshot in a Pythonic style.' + trace_a = subparsers.add_parser('trace', description=description) + trace_a.add_argument('input', help=pickled) + + description = 'Generate a flamegraph that visualizes what memory is stored in each allocator segment (aka block)' + segments_a = subparsers.add_parser('segments', description=description) + segments_a.add_argument('input', help=pickled) + _output(segments_a) + + description = "Generate a flamegraph the program locations contributing to Zoom memory usage." + memory_a = subparsers.add_parser('memory', description=description) + memory_a.add_argument('input', help=pickled) + _output(memory_a) + + description = 'Generate a flamegraph that shows segments (aka blocks) that have been added ' \ + 'or removed between two different memorys snapshots.' + compare_a = subparsers.add_parser('compare', description=description) + compare_a.add_argument('before', help=pickled) + compare_a.add_argument('after', help=pickled) + _output(compare_a) + + plots = ( + ("trace_plot", "Generate a visualization over time of the memory usage recorded by the trace as an html file."), + ("segment_plot", "Visualize how allocations are packed into allocator segments at each point in a trace as an html file.") + ) + for cmd, description in plots: + trace_plot_a = subparsers.add_parser(cmd, description=description) + trace_plot_a.add_argument('input', help=pickled) + help = 'visualize trace from this device (default: chooses the only device with trace info or errors)' + trace_plot_a.add_argument('-d', '--device', type=int, default=None, help=help) + help = 'path to save the visualization(default: output.html)' + trace_plot_a.add_argument('-o', '--output', default='output.html', help=help) + if cmd == "trace_plot": + help = 'visualize change to segments rather than individual allocations' + trace_plot_a.add_argument('-s', '--segments', action='store_true', help=help) + + + args = parser.parse_args() + + def _read(name): + if name == '-': + f = sys.stdin.buffer + else: + f = open(name, 'rb') + data = pickle.load(f) + if isinstance(data, list): # segments only... 
+ data = {'segments': data, 'traces': []} + return data + + def _write(name, data): + with open(name, 'w') as f: + f.write(data) + + if args.action == 'segments': + data = _read(args.input) + _write(args.output, segments(data)) + elif args.action == 'memory': + data = _read(args.input) + _write(args.output, memory(data)) + elif args.action == 'stats': + data = _read(args.input) + print(segsum(data)) + elif args.action == 'trace': + data = _read(args.input) + print(trace(data)) + elif args.action == 'compare': + before = _read(args.before) + after = _read(args.after) + _write(args.output, compare(before, after)) + elif args.action == 'trace_plot': + data = _read(args.input) + _write(args.output, trace_plot(data, device=args.device, plot_segments=args.segments)) + elif args.action == 'segment_plot': + data = _read(args.input) + _write(args.output, segment_plot(data, device=args.device)) diff --git a/torch/zoom/_utils.py b/torch/zoom/_utils.py new file mode 100644 index 00000000000000..a6a0b4fa39f13c --- /dev/null +++ b/torch/zoom/_utils.py @@ -0,0 +1,38 @@ +from typing import Any + +import torch + +# The _get_device_index has been moved to torch.utils._get_device_index +from torch._utils import _get_device_index as _torch_get_device_index + + +def _get_device_index( + device: Any, optional: bool = False, allow_cpu: bool = False +) -> int: + r"""Get the device index from :attr:`device`, which can be a torch.device object, a Python integer, or ``None``. + + If :attr:`device` is a torch.device object, returns the device index if it + is a Zoom device. Note that for a Zoom device without a specified index, + i.e., ``torch.device('zoom')``, this will return the current default zoom + device if :attr:`optional` is ``True``. If :attr:`allow_cpu` is ``True``, + CPU devices will be accepted and ``-1`` will be returned in this case. + + If :attr:`device` is a Python integer, it is returned as is. + + If :attr:`device` is ``None``, this will return the current default zoom + device if :attr:`optional` is ``True``. + """ + if isinstance(device, int): + return device + if isinstance(device, str): + device = torch.device(device) + if isinstance(device, torch.device): + if allow_cpu: + if device.type not in ["zoom", "cpu"]: + raise ValueError(f"Expected a zoom or cpu device, but got: {device}") + elif device.type != "zoom": + raise ValueError(f"Expected a zoom device, but got: {device}") + if not torch.jit.is_scripting(): + if isinstance(device, torch.zoom.device): + return device.idx + return _torch_get_device_index(device, optional, allow_cpu) diff --git a/torch/zoom/graphs.py b/torch/zoom/graphs.py new file mode 100644 index 00000000000000..c418abd3e6ef7a --- /dev/null +++ b/torch/zoom/graphs.py @@ -0,0 +1,479 @@ +import gc +import typing + +import torch +from torch.utils import _pytree +from .._utils import _dummy_type + +if not hasattr(torch._C, "_ZoomStreamBase"): + # Define dummy base classes + torch._C.__dict__["_HIPGraph"] = _dummy_type("_HIPGraph") + torch._C.__dict__["_graph_pool_handle"] = _dummy_type("_graph_pool_handle") + torch._C.__dict__["_zoom_isCurrentStreamCapturing"] = _dummy_type( + "_zoom_isCurrentStreamCapturing" + ) + +from torch._C import ( # noqa: F401 + _zoom_isCurrentStreamCapturing, + _HIPGraph, + _graph_pool_handle, +) + + +def is_current_stream_capturing(): + r"""Return True if CUDA graph capture is underway on the current Zoom stream, False otherwise. + + If a Zoom context does not exist on the current device, returns False without initializing the context. 
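A short sketch of the inputs `_get_device_index` (from `torch/zoom/_utils.py` above) accepts; the `'zoom'` device type is assumed to be registered by this patch:

```python
import torch
from torch.zoom._utils import _get_device_index

_get_device_index(3)                                    # plain ints pass through unchanged
_get_device_index("zoom:1")                             # strings are parsed into torch.device first
_get_device_index(torch.device("zoom", 0))              # -> 0
_get_device_index(torch.device("zoom"), optional=True)  # -> current default zoom device
_get_device_index(torch.device("cpu"), allow_cpu=True)  # CPU only accepted here; returns -1
```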
+ """ + return _zoom_isCurrentStreamCapturing() + + +# Python shim helps Sphinx process docstrings more reliably. +def graph_pool_handle(): + r"""Return an opaque token representing the id of a graph memory pool. + + See :ref:`Graph memory management`. + + .. warning:: + This API is in beta and may change in future releases. + """ + return _graph_pool_handle() + + +# Python shim helps Sphinx process docstrings more reliably. +class HIPGraph(torch._C._HIPGraph): + r"""Wrapper around a HIP graph. + + .. warning:: + This API is in beta and may change in future releases. + """ + + def __new__(cls): + return super().__new__(cls) + + def capture_begin(self, pool=None, capture_error_mode="global"): + r"""Begin capturing Zoom work on the current stream. + + Typically, you shouldn't call ``capture_begin`` yourself. + Use :class:`~torch.zoom.graph` or :func:`~torch.zoom.make_graphed_callables`, + which call ``capture_begin`` internally. + + Arguments: + pool (optional): Token (returned by :func:`~torch.zoom.graph_pool_handle` or + :meth:`other_Graph_instance.pool()`) that hints this graph may share memory + with the indicated pool. See :ref:`Graph memory management`. + capture_error_mode (str, optional): specifies the hipStreamCaptureMode for the graph capture stream. + Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as hipMalloc, + may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for + actions in the current thread, and "relaxed" will not error on these actions. Do NOT change this setting + unless you're familiar with `hipStreamCaptureMode `_ + """ # noqa: B950 + super().capture_begin(pool=pool, capture_error_mode=capture_error_mode) + + def capture_end(self): + r"""End HIP graph capture on the current stream. + + After ``capture_end``, ``replay`` may be called on this instance. + + Typically, you shouldn't call ``capture_end`` yourself. + Use :class:`~torch.zoom.graph` or :func:`~torch.zoom.make_graphed_callables`, + which call ``capture_end`` internally. + """ + super().capture_end() + + def replay(self): + r"""Replay the HIP work captured by this graph.""" + super().replay() + + def reset(self): + r"""Delete the graph currently held by this instance.""" + super().reset() + + def pool(self): + r"""Return an opaque token representing the id of this graph's memory pool. + + This id can optionally be passed to another graph's ``capture_begin``, + which hints the other graph may share the same memory pool. + """ + return super().pool() + + def enable_debug_mode(self): + r"""Enable debugging mode for HIPGraph.debug_dump.""" + return super().enable_debug_mode() + + def debug_dump(self, debug_path): + r""" + Arguments: + debug_path (required): Path to dump the graph to. + + Calls a debugging function to dump the graph if the debugging is + enabled via HIPGraph.enable_debug_mode() + """ + return super().debug_dump(debug_path) + + +class graph: + r"""Context-manager that captures HIP work into a :class:`torch.zoom.HIPGraph` object for later replay. + + See :ref:`CUDA Graphs ` for a general introduction, + detailed use, and constraints. + + Arguments: + hip_graph (torch.zoom.HIPGraph): Graph object used for capture. + pool (optional): Opaque token (returned by a call to :func:`~torch.zoom.graph_pool_handle()` or + :meth:`other_Graph_instance.pool()`) hinting this graph's capture + may share memory from the specified pool. See :ref:`Graph memory management`. 
+ stream (torch.zoom.Stream, optional): If supplied, will be set as the current stream in the context. + If not supplied, ``graph`` sets its own internal side stream as the current stream in the context. + capture_error_mode (str, optional): specifies the hipStreamCaptureMode for the graph capture stream. + Can be "global", "thread_local" or "relaxed". During hip graph capture, some actions, such as hipMalloc, + may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for + actions in the current thread, and "relaxed" will not error on actions. Do NOT change this setting + unless you're familiar with `hipStreamCaptureMode `_ + + .. note:: + For effective memory sharing, if you pass a ``pool`` used by a previous capture and the previous capture + used an explicit ``stream`` argument, you should pass the same ``stream`` argument to this capture. + + .. warning:: + This API is in beta and may change in future releases. + + .. _hipStreamCaptureMode: + https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85 + """ # noqa: B950 + + default_capture_stream: typing.Optional["torch.zoom.Stream"] = None + + def __init__( + self, + hip_graph, + pool=None, + stream=None, + capture_error_mode: str = "global", + ): + # Lazy-init of default_capture_stream helps avoid circular-import errors. + # Not thread safe, but graphs already have the general (explicitly documented) + # restriction that only one capture may be underway at a time in the process. + if self.__class__.default_capture_stream is None: + self.__class__.default_capture_stream = torch.zoom.Stream() + + self.pool = () if pool is None else (pool,) + self.capture_stream = ( + stream if stream is not None else self.__class__.default_capture_stream + ) + assert self.capture_stream is not None + self.stream_ctx = torch.zoom.stream(self.capture_stream) + self.hip_graph = hip_graph + self.capture_error_mode = capture_error_mode + + def __enter__(self): + # Free as much memory as we can for the graph + torch.zoom.synchronize() + gc.collect() + torch.zoom.empty_cache() + + # Stackoverflow seems comfortable with this pattern + # https://stackoverflow.com/questions/26635684/calling-enter-and-exit-manually#39172487 + self.stream_ctx.__enter__() + + self.hip_graph.capture_begin( + *self.pool, capture_error_mode=self.capture_error_mode + ) + + def __exit__(self, exc_type, exc_value, traceback): + self.hip_graph.capture_end() + self.stream_ctx.__exit__(exc_type, exc_value, traceback) + # returning None should propagate exceptions from either capture_end or stream_ctx.__exit__() + + +def make_graphed_callables( + callables, sample_args, num_warmup_iters=3, allow_unused_input=False, pool=None +): + r"""Accept callables (functions or :class:`nn.Module`\ s) and returns graphed versions. + + Each graphed callable's forward pass runs its source callable's + forward CUDA work as a CUDA graph inside a single autograd node. + + The graphed callable's forward pass also appends + a backward node to the autograd graph. During backward, this node runs the + callable's backward work as a CUDA graph. + + Therefore, each graphed callable should be a drop-in replacement for its source callable + in an autograd-enabled training loop. + + See :ref:`Partial-network capture` for detailed use and constraints. + + If you pass a tuple of several callables, their captures will use the same memory pool. + See :ref:`Graph memory management` for when this is appropriate. 
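A hedged sketch of both capture styles in this file, assuming tensors on the `'zoom'` device behave like their CUDA counterparts under this patch:

```python
import torch

model = torch.nn.Linear(64, 64).to("zoom")      # 'zoom' device assumed
static_x = torch.randn(8, 64, device="zoom")

# Explicit capture with the `graph` context manager, then replay with new input data.
g = torch.zoom.HIPGraph()
with torch.zoom.graph(g):
    static_y = model(static_x)
static_x.copy_(torch.randn(8, 64, device="zoom"))
g.replay()   # recomputes static_y in place from the refreshed static_x

# Or let make_graphed_callables wrap the module into a graphed drop-in replacement.
x = torch.randn(8, 64, device="zoom", requires_grad=True)
graphed = torch.zoom.make_graphed_callables(model, (x,), num_warmup_iters=3)
graphed(x).sum().backward()
```

As the warnings further down note, `sample_args` may contain only Tensors and their `requires_grad` state must match the real inputs used in the training loop.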
+ + Arguments: + callables (torch.nn.Module or Python function, or tuple of these): Callable or callables to graph. + See :ref:`Graph memory management` for when passing a tuple of callables + is appropriate. If you pass a tuple of callables, their order in the tuple must be the same order + they'll run in the live workload. + sample_args (tuple of Tensors, or tuple of tuples of Tensors): Samples args for each callable. + If a single callable was passed, ``sample_args`` must be a single tuple of argument Tensors. + If a tuple of callables was passed, ``sample_args`` must be tuple of tuples of argument Tensors. + num_warmup_iters (int): The number of warmup iterations. Currently, ``DataDistributedParallel`` needs + 11 iterations for warm up. Default: ``3``. + allow_unused_input (bool): If False, specifying inputs that were not used when computing outputs + (and therefore their grad is always zero) is an error. Defaults to False. + pool (optional): Token (returned by :func:`~torch.zoom.graph_pool_handle` or + :meth:`other_Graph_instance.pool()`) that hints this graph may share memory + with the indicated pool. See :ref:`Graph memory management`. + .. note:: + The ``requires_grad`` state of each Tensor in ``sample_args`` must match the state + that's expected for the corresponding real input in the training loop. + + .. warning:: + This API is in beta and may change in future releases. + + .. warning:: + ``sample_args`` for each callable must contain only Tensors. Other types are not allowed. + + .. warning:: + Returned callables do not support higher order differentiation (e.g., double backward). + + .. warning:: + In any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters + may be trainable. Buffers must have ``requires_grad=False``. + + .. warning:: + After you pass a :class:`torch.nn.Module` through :func:`~make_graphed_callables`, + you may not add or remove any of that Module's parameters or buffers. + + .. warning:: + :class:`torch.nn.Module`\s passed to :func:`~torch.zoom.make_graphed_callables` must not have module hooks + registered on them at the time they are passed. However, registering hooks on modules *after* passing them + through :func:`~torch.zoom.make_graphed_callables` is allowed. + + .. warning:: + When running a graphed callable, you must pass its arguments in the same order and format + they appeared in that callable's ``sample_args``. + + .. warning:: + The automatic mixed precision is supported in :func:`~torch.zoom.make_graphed_callables` only with disabled + caching. The context manager `torch.zoom.amp.autocast()` must have `cache_enabled=False`. + """ + if torch.is_autocast_enabled() and torch.is_autocast_cache_enabled(): + raise RuntimeError( + "make_graphed_callables does not support the autocast caching. Please set `cache_enabled=False`." + ) + + just_one_callable = False + + if not isinstance(callables, tuple): + just_one_callable = True + callables = (callables,) + sample_args = (sample_args,) + + flatten_sample_args = [] + + for c, args in zip(callables, sample_args): + if isinstance(c, torch.nn.Module): + assert ( + len(c._backward_hooks) == 0 + and len(c._forward_hooks) == 0 + and len(c._forward_pre_hooks) == 0 + ), ( + "Modules must not have hooks registered at the time they are passed. However, registering hooks " + + "on modules after passing them through make_graphed_callables is allowed." 
+ ) + assert all(b.requires_grad is False for b in c.buffers()), ( + "In any :class:`~torch.nn.Module` passed to " + + ":func:`~make_graphed_callables`, only parameters may be trainable. All buffers must have " + + "``requires_grad=False``." + ) + flatten_arg = _pytree.arg_tree_leaves(*args) + flatten_sample_args.append(tuple(flatten_arg)) + assert all(isinstance(arg, torch.Tensor) for arg in flatten_arg), ( + "In the beta API, sample_args " + + "for each callable must contain only Tensors. Other types are not allowed." + ) + + # If a callable is an nn.Module, its graph's full input surface is the args the user explicitly + # passes to forward (ie, its sample_args) AND the module's parameter attributes. + per_callable_len_user_args = [len(args) for args in flatten_sample_args] + per_callable_module_params = [ + tuple(c.parameters()) if isinstance(c, torch.nn.Module) else () + for c in callables + ] + per_callable_static_input_surfaces = [ + flatten_sample_args[i] + per_callable_module_params[i] + for i in range(len(callables)) + ] + + fwd_graphs = [torch.zoom.HIPGraph() for _ in range(len(callables))] + bwd_graphs = [torch.zoom.HIPGraph() for _ in range(len(callables))] + + mempool = graph_pool_handle() if pool is None else pool + + # Warmup + # Hopefully prevents cudnn benchmarking and other lazy-initialization cuda work + # from ending up in any captures. + torch.zoom.synchronize() + with torch.zoom.stream(torch.zoom.Stream()): + for func, args, static_input_surface in zip( + callables, sample_args, per_callable_static_input_surfaces + ): + for _ in range(num_warmup_iters): + outputs = _pytree.tree_leaves(func(*args)) + grad_inputs = torch.autograd.grad( + outputs=tuple(o for o in outputs if o.requires_grad), + inputs=tuple(i for i in static_input_surface if i.requires_grad), + grad_outputs=tuple( + torch.empty_like(o) for o in outputs if o.requires_grad + ), + only_inputs=True, + allow_unused=allow_unused_input, + ) + del outputs, grad_inputs # type: ignore[possibly-undefined] + torch.zoom.synchronize() + + # All captures here share a mempool. To avoid replays corrupting each other's memory, + # the safest approach is to capture all passes in the same order they'll run: + # fwd 1, fwd 2, ... fwd N, then bwd N, bwd N-1, ... bwd 1. + + # Capture forward graphs + per_callable_static_outputs = [] + per_callable_output_unflatten_spec = [] + for func, args, fwd_graph in zip(callables, sample_args, fwd_graphs): + with torch.zoom.graph(fwd_graph, pool=mempool): + outputs = func(*args) + + flatten_outputs, spec = _pytree.tree_flatten(outputs) + per_callable_static_outputs.append(tuple(flatten_outputs)) + per_callable_output_unflatten_spec.append(spec) + + # Capture backward graphs in reverse order + per_callable_static_grad_outputs = [] + per_callable_static_grad_inputs = [] + for static_input_surface, static_outputs, bwd_graph, module_params in zip( + reversed(per_callable_static_input_surfaces), + reversed(per_callable_static_outputs), + reversed(bwd_graphs), + reversed(per_callable_module_params), + ): + # For now, assumes all static_outputs require grad + # assert all(o.requires_grad for o in static_outputs), "Outputs of graphed callables must require grad." 
+ static_grad_outputs = tuple( + torch.empty_like(o) if o.requires_grad else None for o in static_outputs + ) + + with torch.zoom.graph(bwd_graph, pool=mempool): + grad_inputs = torch.autograd.grad( + outputs=tuple(o for o in static_outputs if o.requires_grad), + inputs=tuple(i for i in static_input_surface if i.requires_grad), + grad_outputs=tuple(o for o in static_grad_outputs if o is not None), + only_inputs=True, + allow_unused=allow_unused_input, + ) + + # Constructs a tuple suitable for returning from Graphed.backward: + # Pads out the actually-needed grads with Nones in gradient slots for inputs that don't require grad. + # I couldn't think of a slick one-liner for this pattern. + static_grad_inputs = [] + grad_idx = 0 + for arg in static_input_surface: + if arg.requires_grad: + static_grad_inputs.append(grad_inputs[grad_idx]) + grad_idx += 1 + else: + static_grad_inputs.append(None) # type: ignore[arg-type] + static_grad_inputs = tuple(static_grad_inputs) # type: ignore[assignment] + + per_callable_static_grad_outputs.append(static_grad_outputs) + per_callable_static_grad_inputs.append(static_grad_inputs) + + # Reverses the most recent two lists + per_callable_static_grad_outputs.reverse() + per_callable_static_grad_inputs.reverse() + # Now for every per_callable list, per_callable_*[i] holds the stuff for the ith callable. + + def make_graphed_autograd_function( + fwd_graph, + bwd_graph, + module_params, + len_user_args, + output_unflatten_spec, + static_input_surface, + static_outputs, + static_grad_outputs, + static_grad_inputs, + ): + class Graphed(torch.autograd.Function): + @staticmethod + def forward(ctx, *inputs): + # At this stage, only the user args may (potentially) be new tensors. + for i in range(len_user_args): + if static_input_surface[i].data_ptr() != inputs[i].data_ptr(): + static_input_surface[i].copy_(inputs[i]) + fwd_graph.replay() + assert isinstance(static_outputs, tuple) + return tuple(o.detach() for o in static_outputs) + + @staticmethod + @torch.autograd.function.once_differentiable + def backward(ctx, *grads): + assert len(grads) == len(static_grad_outputs) + for g, grad in zip(static_grad_outputs, grads): + if g is not None: + # don't copy if autograd gods have been kind and the + # incoming grad is already in the right place + if g.data_ptr() != grad.data_ptr(): + g.copy_(grad) + bwd_graph.replay() + + # Input args that didn't require grad expect a None gradient. + assert isinstance(static_grad_inputs, tuple) + return tuple( + b.detach() if b is not None else b for b in static_grad_inputs + ) + + def functionalized(*user_args): + # Runs the autograd function with inputs == all inputs to the graph that might require grad + # (explicit user args + module parameters) + # Assumes module params didn't change since capture. 
+ flatten_user_args = _pytree.arg_tree_leaves(*user_args) + out = Graphed.apply(*(tuple(flatten_user_args) + module_params)) + return _pytree.tree_unflatten(out, output_unflatten_spec) + + return functionalized + + # Put together the final graphed callables + ret = [] + for i, func in enumerate(callables): + graphed = make_graphed_autograd_function( + fwd_graphs[i], + bwd_graphs[i], + per_callable_module_params[i], + per_callable_len_user_args[i], + per_callable_output_unflatten_spec[i], + per_callable_static_input_surfaces[i], + per_callable_static_outputs[i], + per_callable_static_grad_outputs[i], + per_callable_static_grad_inputs[i], + ) + + if isinstance(func, torch.nn.Module): + + def make_graphed_forward(func, graph_training_state, graphed, orig_fwd): + def new_fwd(*user_args): + # If the module's training-or-eval state matches what we graphed, + # run the graph, otherwise run the original forward method + if func.training == graph_training_state: + return graphed(*user_args) + else: + return orig_fwd(*user_args) + + return new_fwd + + func.forward = make_graphed_forward(func, func.training, graphed, func.forward) # type: ignore[assignment] + ret.append(func) + else: + ret.append(graphed) + + if just_one_callable: + return ret[0] + + return tuple(ret) diff --git a/torch/zoom/memory.py b/torch/zoom/memory.py new file mode 100644 index 00000000000000..e910e6271fc8ff --- /dev/null +++ b/torch/zoom/memory.py @@ -0,0 +1,910 @@ +r"""This package adds support for device memory management implemented in Zoom.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature + +from typing import Any, Dict, Optional, Tuple, Union + +import torch +from torch import _C + +from torch.types import Device +from .._utils import _dummy_type +# from . import _get_device_index, _get_nvml_device_index, _lazy_init, is_initialized +from . import _get_device_index, _lazy_init, is_initialized + +from ._memory_viz import memory as _memory, segments as _segments + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + # "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "ZoomPluggableAllocator", + "change_current_allocator", +] + + +if not hasattr(torch._C, "_zoom_ZoomAllocator"): + # Define dummy base classes + torch._C.__dict__["_zoom_ZoomAllocator"] = _dummy_type("_zoom_ZoomAllocator") + + +def _host_allocator(): + _lazy_init() + return torch._C._zoom_zoomHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._zoom_lock_mutex() + try: + yield + finally: + torch._C._zoom_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the Zoom memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.zoom.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. 
If it is + ``None`` the default Zoom device is used. + stream (torch.zoom.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.zoom.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.zoom.current_stream(device) + if isinstance(stream, torch.zoom.streams.Stream): + stream = stream.zoom_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.zoom.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.zoom.device(device): + return torch._C._zoom_zoomCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the Zoom memory allocator. + + Memory allocated with :func:`~torch.zoom.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._zoom_zoomCachingAllocator_raw_delete(mem_ptr) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a Zoom device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default Zoom device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.zoom.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._zoom_setMemoryFraction(fraction, device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.zoom.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._zoom_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> Dict[str, Any]: + r"""Return a dictionary of Zoom memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. 
+ - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``hipMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``hipMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of Zoom allocation calls. This includes both + cuMemMap and hipMalloc. + - ``"num_device_free"``: number of Zoom free calls. This includes both cuMemUnmap + and hipFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``hipMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.zoom.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:hipMallocAsync`, some stats are not + meaningful, and are always reported as zero. 
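The flattened keys follow the ``"metric.pool.field"`` pattern described above, so individual counters can be read straight out of the returned dict (a short sketch, assuming a zoom device is present):

```python
import torch

stats = torch.zoom.memory_stats()                   # flattened OrderedDict
print(stats["allocated_bytes.all.current"])         # bytes currently allocated by tensors
print(stats["reserved_bytes.all.peak"])             # peak memory reserved via hipMalloc
print(stats["num_alloc_retries"], stats["num_ooms"])

nested = torch.zoom.memory_stats_as_nested_dict()   # same data, nested by component
print(nested["allocated_bytes"]["all"]["current"])
```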
+ """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> Dict[str, Any]: + r"""Return the result of :func:`~torch.zoom.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._zoom_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the Zoom memory allocator. + + See :func:`~torch.zoom.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.zoom.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._zoom_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the Zoom memory allocator. + + See :func:`~torch.zoom.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.zoom.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._zoom_resetPeakMemoryStats(device) + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.zoom.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.zoom.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.zoom.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.zoom.reset_max_memory_allocated now calls torch.zoom.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.zoom.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.zoom.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.zoom.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.zoom.reset_max_memory_cached now calls torch.zoom.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.zoom.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.zoom.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.zoom.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.zoom.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.zoom.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.zoom.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
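The reset-then-measure pattern these docstrings describe looks roughly like this (sketch only; `loader` and `model` are placeholders for objects already on the zoom device):

```python
import torch

for step, batch in enumerate(loader):
    torch.zoom.reset_peak_memory_stats()            # start a fresh peak for this iteration
    loss = model(batch).sum()
    loss.backward()
    print(f"step {step}: "
          f"alloc={torch.zoom.memory_allocated()} B, "
          f"peak={torch.zoom.max_memory_allocated()} B, "
          f"reserved={torch.zoom.memory_reserved()} B")
```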
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.zoom.memory_reserved`.""" + warnings.warn( + "torch.zoom.memory_cached has been renamed to torch.zoom.memory_reserved", + FutureWarning, + ) + return memory_reserved(device=device) + + +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.zoom.max_memory_reserved`.""" + warnings.warn( + "torch.zoom.max_memory_cached has been renamed to torch.zoom.max_memory_reserved", + FutureWarning, + ) + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the Zoom memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._zoom_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.zoom.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch Zoom memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} HIP OOMs: {num_ooms:<12d} | {_:6} hipMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, 
freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +# def list_gpu_processes(device: Union[Device, int] = None) -> str: +# r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + +# This can be useful to display periodically during training, or when +# handling out-of-memory exceptions. + +# Args: +# device (torch.device or int, optional): selected device. Returns +# printout for the current device, given by :func:`~torch.zoom.current_device`, +# if :attr:`device` is ``None`` (default). +# """ +# # try: +# # import pynvml # type: ignore[import] +# # except ModuleNotFoundError: +# # return "pynvml module not found, please install pynvml" +# # from pynvml import NVMLError_DriverNotLoaded + +# try: +# pynvml.nvmlInit() +# except NVMLError_DriverNotLoaded: +# return "cuda driver can't be loaded, is cuda enabled?" +# device = _get_nvml_device_index(device) +# handle = pynvml.nvmlDeviceGetHandleByIndex(device) +# procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) +# lines = [] +# lines.append(f"GPU:{device}") +# if len(procs) == 0: +# lines.append("no processes are running") +# for p in procs: +# mem = p.usedGpuMemory / (1024 * 1024) +# lines.append(f"process {p.pid:>10d} uses {mem:>12.3f} MB GPU memory") +# return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> Tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using hipMemGetInfo. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.zoom.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
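+
+    Example (illustrative sketch; assumes a built Zoom backend and a visible device)::
+
+        >>> free, total = torch.zoom.mem_get_info()
+        >>> print(f"{free / 1024**3:.1f} GiB free of {total / 1024**3:.1f} GiB total")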
+ """ + if device is None: + device = torch.zoom.current_device() + device = _get_device_index(device) + return torch.zoom.hiprt().hipMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, +): + _C._zoom_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + ) + + +def _record_memory_history(enabled="all", *args, **kwargs): + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.zoom.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.zoom.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, +): + _C._zoom_record_memory_history(enabled, context, stacks, max_entries) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of HIP memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a hipMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. 
+            address: int
+            total_size: int  # hipMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.zoom.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks hipMalloc for more memory
+                                  # and adds it as a segment in its cache
+                'segment_free',  # the caching allocator called hipFree to return memory
+                                 # to HIP, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory HIP still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._zoom_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.zoom.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._zoom_zoomCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_ZOOM_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``hipMallocAsync``
+    (HIP's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
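+
+    Example (illustrative sketch; the returned value depends on
+    ``PYTORCH_ZOOM_ALLOC_CONF``)::
+
+        >>> torch.zoom.memory.get_allocator_backend()
+        'native'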
+ """ + return torch._C._zoom_getAllocatorBackend() + + +class _ZoomAllocator: + r"""Wrapper over internal Zoom memory allocators.""" + + def __init__(self, allocator: torch._C._zoom_ZoomAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class ZoomPluggableAllocator(_ZoomAllocator): + r"""Zoom memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, hipStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, hipStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._zoom_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _ZoomAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.zoom.memory._ZoomAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._zoom_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _ZoomAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _ZoomAllocator(torch._C._zoom_getAllocator()) diff --git a/torch/zoom/random.py b/torch/zoom/random.py new file mode 100644 index 00000000000000..30c906063698bb --- /dev/null +++ b/torch/zoom/random.py @@ -0,0 +1,179 @@ +from typing import Iterable, List, Union + +import torch +from .. import Tensor +from . import _lazy_call, _lazy_init, current_device, device_count + +__all__ = [ + "get_rng_state", + "get_rng_state_all", + "set_rng_state", + "set_rng_state_all", + "manual_seed", + "manual_seed_all", + "seed", + "seed_all", + "initial_seed", +] + + +def get_rng_state(device: Union[int, str, torch.device] = "zoom") -> Tensor: + r"""Return the random number generator state of the specified GPU as a ByteTensor. + + Args: + device (torch.device or int, optional): The device to return the RNG state of. + Default: ``'zoom'`` (i.e., ``torch.device('zoom')``, the current Zoom device). + + .. warning:: + This function eagerly initializes Zoom. 
+ """ + _lazy_init() + if isinstance(device, str): + device = torch.device(device) + elif isinstance(device, int): + device = torch.device("zoom", device) + idx = device.index + if idx is None: + idx = current_device() + default_generator = torch.zoom.default_generators[idx] + return default_generator.get_state() + + +def get_rng_state_all() -> List[Tensor]: + r"""Return a list of ByteTensor representing the random number states of all devices.""" + results = [] + for i in range(device_count()): + results.append(get_rng_state(i)) + return results + + +def set_rng_state( + new_state: Tensor, device: Union[int, str, torch.device] = "zoom" +) -> None: + r"""Set the random number generator state of the specified GPU. + + Args: + new_state (torch.ByteTensor): The desired state + device (torch.device or int, optional): The device to set the RNG state. + Default: ``'zoom'`` (i.e., ``torch.device('zoom')``, the current Zoom device). + """ + with torch._C._DisableFuncTorch(): + new_state_copy = new_state.clone(memory_format=torch.contiguous_format) + if isinstance(device, str): + device = torch.device(device) + elif isinstance(device, int): + device = torch.device("zoom", device) + + def cb(): + idx = device.index + if idx is None: + idx = current_device() + default_generator = torch.zoom.default_generators[idx] + default_generator.set_state(new_state_copy) + + _lazy_call(cb) + + +def set_rng_state_all(new_states: Iterable[Tensor]) -> None: + r"""Set the random number generator state of all devices. + + Args: + new_states (Iterable of torch.ByteTensor): The desired state for each device. + """ + for i, state in enumerate(new_states): + set_rng_state(state, i) + + +def manual_seed(seed: int) -> None: + r"""Set the seed for generating random numbers for the current GPU. + + It's safe to call this function if Zoom is not available; in that + case, it is silently ignored. + + Args: + seed (int): The desired seed. + + .. warning:: + If you are working with a multi-GPU model, this function is insufficient + to get determinism. To seed all GPUs, use :func:`manual_seed_all`. + """ + seed = int(seed) + + def cb(): + idx = current_device() + default_generator = torch.zoom.default_generators[idx] + default_generator.manual_seed(seed) + + _lazy_call(cb, seed=True) + + +def manual_seed_all(seed: int) -> None: + r"""Set the seed for generating random numbers on all GPUs. + + It's safe to call this function if Zoom is not available; in that + case, it is silently ignored. + + Args: + seed (int): The desired seed. + """ + seed = int(seed) + + def cb(): + for i in range(device_count()): + default_generator = torch.zoom.default_generators[i] + default_generator.manual_seed(seed) + + _lazy_call(cb, seed_all=True) + + +def seed() -> None: + r"""Set the seed for generating random numbers to a random number for the current GPU. + + It's safe to call this function if Zoom is not available; in that + case, it is silently ignored. + + .. warning:: + If you are working with a multi-GPU model, this function will only initialize + the seed on one GPU. To initialize all GPUs, use :func:`seed_all`. + """ + + def cb(): + idx = current_device() + default_generator = torch.zoom.default_generators[idx] + default_generator.seed() + + _lazy_call(cb) + + +def seed_all() -> None: + r"""Set the seed for generating random numbers to a random number on all GPUs. + + It's safe to call this function if Zoom is not available; in that + case, it is silently ignored. 
+ """ + + def cb(): + random_seed = 0 + seeded = False + for i in range(device_count()): + default_generator = torch.zoom.default_generators[i] + if not seeded: + default_generator.seed() + random_seed = default_generator.initial_seed() + seeded = True + else: + default_generator.manual_seed(random_seed) + + _lazy_call(cb) + + +def initial_seed() -> int: + r"""Return the current random seed of the current GPU. + + .. warning:: + This function eagerly initializes Zoom. + """ + _lazy_init() + idx = current_device() + default_generator = torch.zoom.default_generators[idx] + return default_generator.initial_seed() diff --git a/torch/zoom/streams.py b/torch/zoom/streams.py new file mode 100644 index 00000000000000..29a69fbb9d8ba6 --- /dev/null +++ b/torch/zoom/streams.py @@ -0,0 +1,241 @@ +import ctypes + +import torch +from torch._streambase import _EventBase, _StreamBase +from .._utils import _dummy_type + + +if not hasattr(torch._C, "_ZoomStreamBase"): + # Define dummy base classes + torch._C.__dict__["_ZoomStreamBase"] = _dummy_type("_ZoomStreamBase") + torch._C.__dict__["_ZoomEventBase"] = _dummy_type("_ZoomEventBase") + + +class Stream(torch._C._ZoomStreamBase, _StreamBase): + r"""Wrapper around a Zoom stream. + + A Zoom stream is a linear sequence of execution that belongs to a specific + device, independent from other streams. See :ref:`cuda-semantics` for + details. + + Args: + device(torch.device or int, optional): a device on which to allocate + the stream. If :attr:`device` is ``None`` (default) or a negative + integer, this will use the current device. + priority(int, optional): priority of the stream, should be 0 or + negative, where negative numbers indicate higher priority. By default, + streams have priority 0. + + """ + + def __new__(cls, device=None, priority=0, **kwargs): + # setting device manager is expensive, so we avoid it unless necessary + if device is None or ("stream_id" in kwargs and "device_index" in kwargs): + return super().__new__(cls, priority=priority, **kwargs) + else: + with torch.zoom.device(device): + return super().__new__(cls, priority=priority, **kwargs) + + def wait_event(self, event) -> None: + r"""Make all future work submitted to the stream wait for an event. + + Args: + event (torch.zoom.Event): an event to wait for. + + .. note:: This is a wrapper around ``hipStreamWaitEvent()``: see + `CUDA Stream documentation`_ for more info. + + This function returns without waiting for :attr:`event`: only future + operations are affected. + + .. _CUDA Stream documentation: + https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html + """ + event.wait(self) + + def wait_stream(self, stream) -> None: + r"""Synchronize with another stream. + + All future work submitted to this stream will wait until all kernels + submitted to a given stream at the time of call complete. + + Args: + stream (Stream): a stream to synchronize. + + .. note:: This function returns without waiting for currently enqueued + kernels in :attr:`stream`: only future operations are affected. + """ + self.wait_event(stream.record_event()) + + def record_event(self, event=None): + r"""Record an event. + + Args: + event (torch.zoom.Event, optional): event to record. If not given, a new one + will be allocated. + + Returns: + Recorded event. + """ + if event is None: + event = Event() + event.record(self) + return event + + def query(self) -> bool: + r"""Check if all the work submitted has been completed. 
+ + Returns: + A boolean indicating if all kernels in this stream are completed. + """ + return super().query() + + def synchronize(self) -> None: + r"""Wait for all the kernels in this stream to complete. + + .. note:: This is a wrapper around ``hipStreamSynchronize()``: see + `CUDA Stream documentation`_ for more info. + """ + super().synchronize() + + @property + def _as_parameter_(self): + return ctypes.c_void_p(self.zoom_stream) + + def __eq__(self, o) -> bool: + if isinstance(o, Stream): + return super().__eq__(o) + return False + + def __hash__(self): + return hash((self.zoom_stream, self.device)) + + def __repr__(self): + return f"" + + +class ExternalStream(Stream): + r"""Wrapper around an externally allocated Zoom stream. + + This class is used to wrap streams allocated in other libraries in order + to facilitate data exchange and multi-library interactions. + + .. note:: This class doesn't manage the stream life-cycle, it is the user + responsibility to keep the referenced stream alive while this class is + being used. + + Args: + stream_ptr(int): Integer representation of the `hipStream_t` value. + allocated externally. + device(torch.device or int, optional): the device where the stream + was originally allocated. If device is specified incorrectly, + subsequent launches using this stream may fail. + """ + + def __new__(cls, stream_ptr, device=None, **kwargs): + with torch.zoom.device(device): + return super().__new__(cls, stream_ptr=stream_ptr, **kwargs) + + +class Event(torch._C._ZoomEventBase, _EventBase): + r"""Wrapper around a Zoom event. + + Zoom events are synchronization markers that can be used to monitor the + device's progress, to accurately measure timing, and to synchronize Zoom + streams. + + The underlying Zoom events are lazily initialized when the event is first + recorded or exported to another process. After creation, only streams on the + same device may record the event. However, streams on any device can wait on + the event. + + Args: + enable_timing (bool, optional): indicates if the event should measure time + (default: ``False``) + blocking (bool, optional): if ``True``, :meth:`wait` will be blocking (default: ``False``) + interprocess (bool): if ``True``, the event can be shared between processes + (default: ``False``) + + .. _CUDA Event Documentation: + https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html + """ + + def __new__(cls, enable_timing=False, blocking=False, interprocess=False): + return super().__new__( + cls, + enable_timing=enable_timing, + blocking=blocking, + interprocess=interprocess, + ) + + @classmethod + def from_ipc_handle(cls, device, handle): + r"""Reconstruct an event from an IPC handle on the given device.""" + return super().from_ipc_handle(device, handle) + + def record(self, stream=None): + r"""Record the event in a given stream. + + Uses ``torch.zoom.current_stream()`` if no stream is specified. The + stream's device must match the event's device. + """ + if stream is None: + stream = torch.zoom.current_stream() + super().record(stream) + + def wait(self, stream=None) -> None: + r"""Make all future work submitted to the given stream wait for this event. + + Use ``torch.zoom.current_stream()`` if no stream is specified. + + .. note:: This is a wrapper around ``hipStreamWaitEvent()``: see + `CUDA Event documentation`_ for more info. + """ + if stream is None: + stream = torch.zoom.current_stream() + super().wait(stream) + + def query(self): + r"""Check if all work currently captured by event has completed. 
+ + Returns: + A boolean indicating if all work currently captured by event has + completed. + """ + return super().query() + + def elapsed_time(self, end_event): + r"""Return the time elapsed. + + Time reported in milliseconds after the event was recorded and + before the end_event was recorded. + """ + return super().elapsed_time(end_event) + + def synchronize(self) -> None: + r"""Wait for the event to complete. + + Waits until the completion of all work currently captured in this event. + This prevents the CPU thread from proceeding until the event completes. + + .. note:: This is a wrapper around ``hipEventSynchronize()``: see + `CUDA Event documentation`_ for more info. + """ + super().synchronize() + + def ipc_handle(self): + r"""Return an IPC handle of this event. + + If not recorded yet, the event will use the current device. + """ + return super().ipc_handle() + + @property + def _as_parameter_(self): + return ctypes.c_void_p(self.zoom_event) + + def __repr__(self) -> str: + if self.zoom_event: + return f"" + else: + return "" From 16d3bea4ca71b9b2d88deadd9b31b593e7468299 Mon Sep 17 00:00:00 2001 From: 123epsilon Date: Mon, 23 Dec 2024 00:06:14 +0000 Subject: [PATCH 02/23] resolve some build deps --- aten/CMakeLists.txt | 2 + aten/src/ATen/CMakeLists.txt | 6 +- aten/src/ATen/native/zoom/ForeachFunctors.cuh | 681 ++++++++++++++++++ .../src/ATen/native/zoom/MultiTensorApply.cuh | 379 ++++++++++ aten/src/ATen/native/zoom/Pow.cuh | 58 ++ aten/src/ATen/native/zoom/PowKernel.cu | 209 ++++++ 6 files changed, 1332 insertions(+), 3 deletions(-) create mode 100644 aten/src/ATen/native/zoom/ForeachFunctors.cuh create mode 100644 aten/src/ATen/native/zoom/MultiTensorApply.cuh create mode 100644 aten/src/ATen/native/zoom/Pow.cuh create mode 100644 aten/src/ATen/native/zoom/PowKernel.cu diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index bda6aea327062f..d1459366a2e945 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -34,6 +34,7 @@ set(ATen_HIP_SRCS) set(ATen_HIP_SRCS_W_SORT_BY_KEY) set(ATen_HIP_TEST_SRCS) set(ATen_HIP_INCLUDE) +set(ATen_ZOOM_SRCS) set(ATen_MPS_SRCS) set(ATen_MPS_TEST_SRCS) set(ATen_XPU_SRCS) @@ -116,6 +117,7 @@ set(ATen_CUDA_LINALG_SRCS ${ATen_CUDA_LINALG_SRCS} PARENT_SCOPE) set(ATen_CUDA_SRCS_W_SORT_BY_KEY ${ATen_CUDA_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) set(ATen_CUDA_CU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE) +set(ATen_ZOOM_SRCS ${ATen_ZOOM_SRCS} PARENT_SCOPE) set(ATen_MPS_SRCS ${ATen_MPS_SRCS} PARENT_SCOPE) set(ATen_MPS_TEST_SRCS ${ATen_MPS_TEST_SRCS} PARENT_SCOPE) set(ATen_HIP_SRCS_W_SORT_BY_KEY ${ATen_HIP_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 1cd471cee47bc0..42ca9254a64885 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -82,9 +82,9 @@ file(GLOB hip_nvrtc_stub_cpp "hip/nvrtc_stub/*.cpp") file(GLOB miopen_h "miopen/*.h") file(GLOB miopen_cpp "miopen/*.cpp") -file(GLOB zoom_h "zoom/*.h" "zoom/detail/*.h" "zoom/*.cuh" "zoom/detail/*.cuh" "zoom/tunable/*.cuh" "zoom/tunable/*.h" "zoom/jit/*.cuh" "zoom/jit/*.h") -file(GLOB zoom_cpp "zoom/*.cpp" "zoom/detail/*.cpp" "zoom/tunable/*.cpp" "zoom/jit/*.cpp") -file(GLOB zoom_hip "zoom/*.cu" "zoom/detail/*.cu" "zoom/impl/*.cu" "zoom/tunable/*.cu") +file(GLOB zoom_h "zoom/*.h" "zoom/detail/*.h" "zoom/*.cuh" "zoom/detail/*.cuh" "zoom/tunable/*.h" "zoom/jit/*.cuh" "zoom/jit/*.h") +file(GLOB zoom_cpp "zoom/*.cpp" "zoom/detail/*.cpp" 
"zoom/jit/*.cpp") +file(GLOB zoom_hip "zoom/*.cu" "zoom/detail/*.cu") file(GLOB zoom_hiprtc_stub_h "zoom/hiprtc_stub/*.h") file(GLOB zoom_hiprtc_stub_cpp "zoom/hiprtc_stub/*.cpp") diff --git a/aten/src/ATen/native/zoom/ForeachFunctors.cuh b/aten/src/ATen/native/zoom/ForeachFunctors.cuh new file mode 100644 index 00000000000000..869e6fa3fd4389 --- /dev/null +++ b/aten/src/ATen/native/zoom/ForeachFunctors.cuh @@ -0,0 +1,681 @@ +#pragma once +#include +#include +#include +#include + +namespace at::native { + +namespace { + +// TODO(crcrpar): Handle version bump in codegen. +// rel: +// https://github.com/pytorch/pytorch/blob/9cf84347767c8abb8feba18a9a1baba321eeb8b9/tools/autograd/gen_inplace_or_view_type.py#L481-L482 +inline void increment_version(TensorList tensors) { + for (const auto& t : tensors) { + t.unsafeGetTensorImpl()->bump_version(); + } +} + +// Initializes args and checks if all args are aligned +template +__device__ bool init_args( + T** args, + TensorListMetadata& tl, + const int64_t chunk_idx, + const int64_t chunk_size, + const int64_t tensor_loc) { + bool all_aligned = true; + for (int i = 0; i < depth; i++) { + args[i] = (T*)tl.addresses[i][tensor_loc]; + args[i] += chunk_idx * chunk_size; + + if (!is_aligned(args[i])) { + all_aligned = false; + } + } + return all_aligned; +} + +// Initializes args and checks if all args are aligned +template +__device__ bool init_args( + T** args, + TensorListScalarListMetadata& tl, + const int64_t chunk_idx, + const int64_t chunk_size, + const int64_t tensor_loc) { + bool all_aligned = true; + for (int i = 0; i < depth; i++) { + args[i] = (T*)tl.addresses[i][tensor_loc]; + args[i] += chunk_idx * chunk_size; + + if (!is_aligned(args[i])) { + all_aligned = false; + } + } + return all_aligned; +} + +template +__device__ bool init_args( + T** args, + FusedOptimizerTensorListMetadata& tl, + const int64_t chunk_idx, + const int64_t chunk_size, + const int64_t tensor_loc) { + bool all_aligned = true; + for (int i = 0; i < depth; i++) { + args[i] = (T*)tl.addresses[i][tensor_loc]; + args[i] += chunk_idx * chunk_size; + + if (!is_aligned(args[i])) { + all_aligned = false; + } + } + return all_aligned; +} + +template +__device__ void load_args( + T r_args[][kILP], + T** args, + const int64_t i_start, + const int64_t chunk_size, + const int64_t n) { +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + const auto i = i_start + threadIdx.x + ii * blockDim.x; + for (int r_index = 0; r_index < depth; r_index++) { + r_args[r_index][ii] = 0; + if (i < n && i < chunk_size) { + r_args[r_index][ii] = args[r_index][i]; + } + } + } +} + +template +__device__ void store_args( + T* dst, + T* src, + const int64_t i_start, + const int64_t chunk_size, + const int64_t n) { +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + const int64_t i = i_start + threadIdx.x + ii * blockDim.x; + if (i < n && i < chunk_size) + dst[i] = src[ii]; + } +} + +template +__device__ __forceinline__ void binary_op_scalar( + T r_args[][kILP], + T** args, + opmath_t scalar, + const int64_t n, + const int64_t chunk_size, + const bool all_aligned, + Op op) { + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + 
static_cast(scalar))); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + // Regardless if depth is 1 (for inplace) or 2 (for out of place), r_args + // has depth 1 + load_args<1>(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + static_cast(scalar))); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } +} + +template +__device__ __forceinline__ void pointwise_op_scalar( + T r_args[][kILP], + T** args, + opmath_t scalar, + const int64_t n, + const int64_t chunk_size, + const bool all_aligned, + Op op) { + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); + load_store(r_args[1], args[1], 0, i_start); + load_store(r_args[2], args[2], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + static_cast(r_args[0][ii]) + + scalar * + op(static_cast(r_args[1][ii]), + static_cast(r_args[2][ii]))); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + // Regardless if depth is 3 (for inplace) or 4 (for out of place), r_args + // has depth 3 + load_args<3>(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + static_cast(r_args[0][ii]) + + scalar * + op(static_cast(r_args[1][ii]), + static_cast(r_args[2][ii]))); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } +} + +// +// Binary Functors +// +template +struct BinaryOpScalarFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int chunk_size, + TensorListMetadata& tl, + Op op, + opmath_t scalar) { + const int tensor_loc = tl.block_to_tensor[blockIdx.x]; + const int chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + binary_op_scalar( + r_args, args, scalar, n, chunk_size, all_aligned, op); + } +}; + +template +struct BinaryOpScalarListFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int chunk_size, + TensorListScalarListMetadata& tl, + Op op) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + opmath_t scalar = tl.scalar_vals[tensor_loc]; + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + binary_op_scalar( + r_args, args, scalar, n, chunk_size, all_aligned, op); + } +}; + +template +struct BinaryOpListAlphaFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int chunk_size, + TensorListMetadata& tl, + Op op, + opmath_t alpha) { + const auto tensor_loc = 
tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); + load_store(r_args[1], args[1], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + alpha * static_cast(r_args[1][ii]))); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + alpha * static_cast(r_args[1][ii]))); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +template +struct BinaryOpScalarTensorFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int chunk_size, + TensorListMetadata& tl, + Op op, + T* scalar, + opmath_t alpha) { + const int tensor_loc = tl.block_to_tensor[blockIdx.x]; + const int chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast(op( + static_cast(r_args[0][ii]), + static_cast(alpha) * static_cast(*scalar))); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + // Regardless if depth is 1 (for inplace) or 2 (for out of place), + // r_args has depth 1 + load_args<1>(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast(op( + static_cast(r_args[0][ii]), + static_cast(alpha) * static_cast(*scalar))); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +// +// Unary Functors +// + +template +struct ZeroFunctor { + __device__ __forceinline__ void operator()( + int chunk_size, + TensorListMetadata<1>& tl) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const auto all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for 
(int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = 0; + } + // store + load_store(args[0], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = 0; + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +template +struct UnaryOpFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int chunk_size, + TensorListMetadata& tl, + Op op) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + static_cast(op(static_cast(r_args[0][ii]))); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + static_cast(op(static_cast(r_args[0][ii]))); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +// +// Pointwise Functors +// + +template +struct PointwiseOpScalarFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int chunk_size, + TensorListMetadata& tl, + Op op, + opmath_t scalar) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + pointwise_op_scalar( + r_args, args, scalar, n, chunk_size, all_aligned, op); + } +}; + +template +struct PointwiseOpScalarListFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int chunk_size, + TensorListScalarListMetadata& tl, + Op op) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + opmath_t scalar = tl.scalar_vals[tensor_loc]; + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + pointwise_op_scalar( + r_args, args, scalar, n, chunk_size, all_aligned, op); + } +}; + +template +struct PointwiseOpListFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int chunk_size, + TensorListMetadata& tl, + Op op) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; 
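+    // block_to_tensor/block_to_chunk map this thread block to one kChunkSize-sized
+    // chunk of one tensor in the flattened list; numel_for_tensor below is that
+    // tensor's total element count, trimmed to the current chunk further down.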
+ auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[depth - 1][kILP]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); + load_store(r_args[1], args[1], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]))); + } + // store + load_store(args[2], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]))); + } + store_args(args[2], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +template +struct TernaryOpListFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int chunk_size, + TensorListMetadata& tl, + Op op) { + static_assert(depth == 3 || depth == 4, ""); + static_assert(depth >= r_args_depth, ""); + static_assert(res_arg_index == depth - 1 || res_arg_index == 0, ""); + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + load_store(r_args[0], args[0], 0, i_start); + load_store(r_args[1], args[1], 0, i_start); + load_store(r_args[2], args[2], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]), + static_cast(r_args[2][ii])); + } + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]), + static_cast(r_args[2][ii])); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +template +struct TernaryOpScalarFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int chunk_size, + TensorListMetadata& tl, + Op op, + opmath_t alpha) { + static_assert(depth == 2 || depth == 3, ""); + static_assert(depth >= r_args_depth, ""); + static_assert(res_arg_index == depth - 1 || res_arg_index == 0, ""); + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + // to make things simple, we 
put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); + load_store(r_args[1], args[1], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]), + alpha); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]), + alpha); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +template +struct power_functor { + C10_DEVICE T operator()(const T& a, const T& b) const { + return at::native::pow_(a, b); + } +}; + +template +struct reverse_power_functor { + C10_DEVICE T operator()(const T& a, const T& b) const { + return at::native::pow_(b, a); + } +}; + +} // namespace +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/MultiTensorApply.cuh b/aten/src/ATen/native/zoom/MultiTensorApply.cuh new file mode 100644 index 00000000000000..9efa863f49ceaf --- /dev/null +++ b/aten/src/ATen/native/zoom/MultiTensorApply.cuh @@ -0,0 +1,379 @@ +#pragma once +#include +#include +#include +#include +#include +#include + +namespace at::native { + +namespace { + +static constexpr int64_t kILP = 4; +static constexpr int64_t kChunkSize = 65536; +static constexpr int64_t kBlockSize = 512; + +// TODO(crcrpar): Add `n>5` for `low prec params & their higher prec copy` +// TensorListMetadata has to be < 4KB - the limit for kernel launch argument +static constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30}; +static constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320}; +static constexpr int depth_to_max_tensors_scalarlist[5] = {96, 64, 48, 36, 30}; +static constexpr int depth_to_max_tensors_scalarlist_of_complex_double[2] = { + 72, + 60}; + +template +__device__ __forceinline__ bool is_aligned(T* p) { + return ((uint64_t)p) % (kILP * sizeof(T)) == 0; +} + +template +__device__ __forceinline__ void load_store( + T* dst, + T* src, + int64_t dst_offset, + int64_t src_offset) { + using LT = at::native::memory::aligned_vector; + ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset]; +} + +template +struct TensorListMetadata { + const void* addresses[n][depth_to_max_tensors[n - 1]]; + int64_t numel_for_tensor[depth_to_max_tensors[n - 1]]; + unsigned char block_to_tensor[depth_to_max_blocks[n - 1]]; + int block_to_chunk[depth_to_max_blocks[n - 1]]; + int start_tensor_this_launch; +}; + +template +struct TensorListScalarListMetadata { + const void* addresses[n][depth_to_max_tensors_scalarlist[n - 1]]; + int64_t numel_for_tensor[depth_to_max_tensors_scalarlist[n - 1]]; + scalar_vals_t scalar_vals[depth_to_max_tensors_scalarlist[n - 1]]; + unsigned char block_to_tensor[depth_to_max_blocks[n - 1]]; + int block_to_chunk[depth_to_max_blocks[n - 1]]; +}; + +// note(mkozuki): `n` of 1&2 violate the limit of cuda kernel argument size of +// 4kb with `c10::complex` +template <> +struct TensorListScalarListMetadata, 1> { + const void* addresses[1] + [depth_to_max_tensors_scalarlist_of_complex_double[0]]; + int64_t + 
numel_for_tensor[depth_to_max_tensors_scalarlist_of_complex_double[0]]; + c10::complex + scalar_vals[depth_to_max_tensors_scalarlist_of_complex_double[0]]; + unsigned char block_to_tensor[depth_to_max_blocks[1 - 1]]; + int block_to_chunk[depth_to_max_blocks[1 - 1]]; +}; + +template <> +struct TensorListScalarListMetadata, 2> { + const void* addresses[2] + [depth_to_max_tensors_scalarlist_of_complex_double[1]]; + int64_t + numel_for_tensor[depth_to_max_tensors_scalarlist_of_complex_double[1]]; + c10::complex + scalar_vals[depth_to_max_tensors_scalarlist_of_complex_double[1]]; + unsigned char block_to_tensor[depth_to_max_blocks[2 - 1]]; + int block_to_chunk[depth_to_max_blocks[2 - 1]]; +}; + +// NOTE(crcrpar): This is a conservative resolution to handle `state_steps` +// whose each element is `at::Tensor` of 1 element representing the number of +// `step`s called so far. +template +struct FusedOptimizerTensorListMetadata { + const void* addresses[n][depth_to_max_tensors[n - 1]]; + int64_t numel_for_tensor[depth_to_max_tensors[n - 1]]; + const void* state_steps_addresses[depth_to_max_tensors_scalarlist[n - 1]]; + unsigned char block_to_tensor[depth_to_max_blocks[n - 1]]; + int block_to_chunk[depth_to_max_blocks[n - 1]]; + int start_tensor_this_launch; +}; + +template +C10_LAUNCH_BOUNDS_1(kBlockSize) +__global__ void multi_tensor_apply_kernel( + T tensorListMeta, + U callable, + ArgTypes... args) { + // Hand the chunk information to the user-supplied functor to process however + // it likes. + callable(kChunkSize, tensorListMeta, args...); +} + +} // namespace + +// multi_tensor_apply enables horizontal fusion across lists of tensors. +// For example, whereas you once had a for-loop of a + b = c, where a, b, +// and c are individual tensors in lists as, bs, and cs, you can now with +// fewer kernel launches compute as + bs = cs. +// +// You can also imagine bs to be a scalar list vs a tensor list. +// +// The function below takes in tensor lists, scalars, and a callable and +// chunks up the computation to launch as few kernels as possible by iterating +// through every "chunk" in every tensor (thus the nested for loops). In the +// simplest case, everything gets bundled into just one kernel launch, but +// due to blocksize constraints, we may need to launch multiple kernels. +// Each kernel launch is defined by one tensorListMeta construct, which we +// use to track and reset the necessary metadata for each launch. +template +void multi_tensor_apply( + std::vector>& tensor_lists, + at::ArrayRef scalars, + T callable, + ArgTypes... args) { + TORCH_CHECK( + tensor_lists.size() == depth, + "Number of tensor lists has to match the depth."); + const size_t n_tensors = tensor_lists[0].size(); + using scalar_vals_t = typename T::opmath_t; + TensorListScalarListMetadata tensorListMeta; + + int loc_block_info = 0; + int loc_tensor_info = 0; + for (size_t t = 0; t < n_tensors; t++) { + // short-circuit to avoid adding empty tensors to tensorListMeta + if (tensor_lists[0][t].numel() == 0) { + continue; + } + tensorListMeta.scalar_vals[loc_tensor_info] = scalars[t].to(); + tensorListMeta.numel_for_tensor[loc_tensor_info] = + tensor_lists[0][t].numel(); + for (int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][loc_tensor_info] = + tensor_lists[d][t].const_data_ptr(); + } + loc_tensor_info++; + + // now we enter [chunking territory]. + // we will launch a kernel when EITHER the blocks get filled up OR + // the tensors get filled up. 
There will always be at least one block + // per tensor since the zero-sized ones will not enter the loop, so + // the nested forloop within represents iterating through the chunks + // of a single tensor. + const auto numel = tensor_lists[0][t].numel(); + const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0); + for (auto chunk = 0; chunk < chunks; chunk++) { + tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1; + tensorListMeta.block_to_chunk[loc_block_info] = chunk; + loc_block_info++; + + // a tensor is not considered full unless all its chunks have been + // processed + const bool tensors_full = + (loc_tensor_info == depth_to_max_tensors_scalarlist[depth - 1] && + chunk == chunks - 1); + const bool blocks_full = + (loc_block_info == depth_to_max_blocks[depth - 1]); + + if (tensors_full || blocks_full) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + c10::zoom::getCurrentZoomStream()>>>( + tensorListMeta, callable, args...); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + + // Reset. + loc_block_info = 0; + // all chunks have already been handled in the kernel + if (chunk == chunks - 1) { + loc_tensor_info = 0; + } else { // blocks were full and tensor chunks remain + tensorListMeta.numel_for_tensor[0] = + tensorListMeta.numel_for_tensor[loc_tensor_info - 1]; + tensorListMeta.scalar_vals[0] = + tensorListMeta.scalar_vals[loc_tensor_info - 1]; + for (int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][0] = + tensorListMeta.addresses[d][loc_tensor_info - 1]; + } + loc_tensor_info = 1; + } + } + } + } + + // note: [finishing what we started] + // if there's remaining work to be done but the tensors/blocks aren't full + // yet we are at the end, submit the kernel to do the work! + if (loc_block_info != 0) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + c10::zoom::getCurrentZoomStream()>>>(tensorListMeta, callable, args...); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } +} + +template +void multi_tensor_apply( + std::vector>& tensor_lists, + T callable, + ArgTypes... args) { + TORCH_CHECK( + tensor_lists.size() == depth, + "Number of tensor lists has to match the depth."); + const size_t n_tensors = tensor_lists[0].size(); + TensorListMetadata tensorListMeta; + tensorListMeta.start_tensor_this_launch = 0; + + int loc_block_info = 0; + int loc_tensor_info = 0; + for (size_t t = 0; t < n_tensors; t++) { + // short-circuit to avoid adding empty tensors to tensorListMeta + if (tensor_lists[0][t].numel() == 0) { + continue; + } + tensorListMeta.numel_for_tensor[loc_tensor_info] = + tensor_lists[0][t].numel(); + for (int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][loc_tensor_info] = + tensor_lists[d][t].const_data_ptr(); + } + loc_tensor_info++; + + // see note: [chunking territory]. + const auto numel = tensor_lists[0][t].numel(); + const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0); + for (auto chunk = 0; chunk < chunks; chunk++) { + tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1; + tensorListMeta.block_to_chunk[loc_block_info] = chunk; + loc_block_info++; + + const bool tensors_full = + (loc_tensor_info == depth_to_max_tensors[depth - 1] && + chunk == chunks - 1); + const bool blocks_full = + (loc_block_info == depth_to_max_blocks[depth - 1]); + + if (tensors_full || blocks_full) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + c10::zoom::getCurrentZoomStream()>>>( + tensorListMeta, callable, args...); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + + // Reset. 
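+        // Start the next launch from clean block bookkeeping; if the current
+        // tensor still has unprocessed chunks, its metadata is copied into
+        // slot 0 below and start_tensor_this_launch is updated so the next
+        // kernel launch resumes where this one stopped.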
+ loc_block_info = 0; + if (chunk == chunks - 1) { + loc_tensor_info = 0; + tensorListMeta.start_tensor_this_launch = t + 1; + } else { + tensorListMeta.numel_for_tensor[0] = + tensorListMeta.numel_for_tensor[loc_tensor_info - 1]; + for (int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][0] = + tensorListMeta.addresses[d][loc_tensor_info - 1]; + } + loc_tensor_info = 1; + tensorListMeta.start_tensor_this_launch = t; + } + } + } + } + + // see note: [finishing what we started] + if (loc_block_info != 0) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + c10::zoom::getCurrentZoomStream()>>>(tensorListMeta, callable, args...); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } +} + +template +void multi_tensor_apply_for_fused_optimizer( + std::vector>& tensor_lists, + at::TensorList state_steps, + T callable, + ArgTypes... args) { + TORCH_CHECK( + tensor_lists.size() == depth, + "Number of tensor lists has to match the depth"); + const auto num_tensors = tensor_lists[0].size(); + FusedOptimizerTensorListMetadata tensorListMeta; + + int loc_block_info = 0; + int loc_tensor_info = 0; + for (const auto& tensor_index : c10::irange(num_tensors)) { + // short-circuit to avoid adding empty tensors to tensorListMeta + if (tensor_lists[0][tensor_index].numel() == 0) { + continue; + } + tensorListMeta.state_steps_addresses[loc_tensor_info] = + state_steps[tensor_index].const_data_ptr(); + tensorListMeta.numel_for_tensor[loc_tensor_info] = + tensor_lists[0][tensor_index].numel(); + for (const auto& d : c10::irange(depth)) { + tensorListMeta.addresses[d][loc_tensor_info] = + tensor_lists[d][tensor_index].const_data_ptr(); + } + loc_tensor_info++; + + // see above note: [chunking territory] + const auto numel = tensor_lists[0][tensor_index].numel(); + const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0); + TORCH_CHECK(chunks > -1); + for (const auto& chunk : c10::irange(chunks)) { + tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1; + tensorListMeta.block_to_chunk[loc_block_info] = chunk; + loc_block_info++; + + const auto tensor_full = + (loc_tensor_info == depth_to_max_tensors[depth - 1] && + chunk == chunks - 1); + const auto blocks_full = loc_block_info == depth_to_max_blocks[depth - 1]; + + if (tensor_full || blocks_full) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + c10::zoom::getCurrentZoomStream()>>>( + tensorListMeta, callable, args...); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + + // Reset. 
+ loc_block_info = 0; + if (chunk == chunks - 1) { + loc_tensor_info = 0; + } else { + tensorListMeta.numel_for_tensor[0] = + tensorListMeta.numel_for_tensor[loc_tensor_info - 1]; + tensorListMeta.state_steps_addresses[0] = + tensorListMeta.state_steps_addresses[loc_tensor_info - 1]; + for (const auto& d : c10::irange(depth)) { + tensorListMeta.addresses[d][0] = + tensorListMeta.addresses[d][loc_tensor_info - 1]; + } + loc_tensor_info = 1; + } + } + } + } + + // see above note: [finishing what we've started] + if (loc_block_info != 0) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + c10::zoom::getCurrentZoomStream()>>>(tensorListMeta, callable, args...); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } +} + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/Pow.cuh b/aten/src/ATen/native/zoom/Pow.cuh new file mode 100644 index 00000000000000..eee86031f8d932 --- /dev/null +++ b/aten/src/ATen/native/zoom/Pow.cuh @@ -0,0 +1,58 @@ +#pragma once +#include +#include + +namespace at { namespace native { + +namespace { + + +// SFINAE doesn't work well with NVCC under Windows for math functions like pow and sqrt. +// So we need to define the functions with the explicit function signatures. +// As for pow, the following signatures are defined as the device function: +// pow(float, int) +// pow(double, int) +// pow(float, float) +// pow(double, double) +#ifdef _MSC_VER +// Functions for pow +// pow for at::Half +static inline __host__ __device__ at::Half pow_(at::Half base, at::Half exp) { + return static_cast(std::pow(static_cast(base), static_cast(exp))); +} +// pow for at::BFloat16 +static inline __host__ __device__ at::BFloat16 pow_(at::BFloat16 base, at::BFloat16 exp) { + return static_cast(std::pow(static_cast(base), static_cast(exp))); +} +// pow (floating, floating/int) +template +static inline __host__ __device__ typename std::enable_if::value && (std::is_same::value || std::is_same::value), Base_type>::type + pow_(Base_type base, Exp_type exp) { + return std::pow(base, exp); +} +// pow (Otherwise) +template +static inline __host__ __device__ typename std::enable_if::value && !std::is_same::value, Base_type>::type + pow_(Base_type base, Exp_type exp) { + return static_cast(std::pow(static_cast(base), static_cast(exp))); +} +#else +template +static inline __host__ __device__ Base_type pow_(Base_type base, Exp_type exp) { + return ::pow(base, exp); +} +#endif + +template +static inline __host__ __device__ std::enable_if_t::value, T> pow_( + T base, T exp) { + return at::native::powi(base, exp); +} + +template +static inline __host__ __device__ c10::complex pow_(c10::complex base, c10::complex exp) { + return c10_complex_math::pow(base, exp); +} + +} // namespace +}} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/PowKernel.cu b/aten/src/ATen/native/zoom/PowKernel.cu new file mode 100644 index 00000000000000..e67e47201687ad --- /dev/null +++ b/aten/src/ATen/native/zoom/PowKernel.cu @@ -0,0 +1,209 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { + +// Forward declare some unary kernels +void rsqrt_kernel_zoom(TensorIteratorBase& iter); +void sqrt_kernel_zoom(TensorIteratorBase& iter); +void reciprocal_kernel_zoom(TensorIteratorBase& iter); + +namespace { + +void pow_tensor_scalar_kernel(TensorIteratorBase& iter, const Scalar& exp_scalar); + +template +void 
pow_scalar_tensor_impl(TensorIteratorBase& iter, scalar_t base) { + gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t exp) -> scalar_t { + return pow_(base, exp); + }); +} + +template +void pow_scalar_tensor_impl(TensorIteratorBase& iter, c10::complex base) { + // For complex, thrust::pow uses the identity + // pow(a, b) = exp(log(a) * b) + const auto fct = std::log(base); + gpu_kernel(iter, [=]GPU_LAMBDA(c10::complex exp) -> c10::complex { + return std::exp(fct * exp); + }); +} + +/* complex support impl */ +CONSTEXPR_EXCEPT_WIN_CUDA char pow_scalar_base_name[] = "pow_scalar_base_kernel"; +template <> +void pow_scalar_tensor_impl(TensorIteratorBase& iter, c10::complex base) { + using scalar_t = c10::complex; + using opmath_t = at::opmath_type; + // For complex, thrust::pow uses the identity + // pow(a, b) = exp(log(a) * b) + const auto fct = std::log(opmath_t{base}); +#if AT_USE_JITERATOR() + static const auto pow_kernel_string = + jiterator_stringify(template T pow_scalar_base_kernel(T exp, T fct) { + return std::exp(fct * exp); + }); + jitted_gpu_kernel( + iter, + pow_kernel_string, + /*scalar_pos=*/at::zoom::jit::BinaryFuncVariant::NoScalar, + /*scalar_val=*/0, + /*extra_args=*/std::make_tuple(fct)); +#else + gpu_kernel(iter, [=] GPU_LAMBDA(scalar_t exp) -> scalar_t { + return std::exp(fct * opmath_t{exp}); + }); +#endif +} + +namespace { + +#if AT_USE_JITERATOR() +/* complex support impl */ +CONSTEXPR_EXCEPT_WIN_CUDA char pow_name[] = "pow_kernel"; +static const auto pow_kernel_string = + jiterator_stringify(template T pow_kernel(T base, T exp) { + return std::pow(base, exp); + }); +#endif + +/* complex support impl */ +void pow_chalf_tensor_scalar_impl(TensorIteratorBase& iter, const Scalar& exp_scalar) { + using scalar_t = c10::complex; + using opmath_t = at::opmath_type; + auto exp = exp_scalar.to(); +#if AT_USE_JITERATOR() + jitted_gpu_kernel( + iter, + pow_kernel_string, + /*scalar_pos=*/at::zoom::jit::BinaryFuncVariant::NoScalar, + /*scalar_val=*/0, + /*extra_args=*/std::make_tuple(exp)); +#else + gpu_kernel(iter, [=] GPU_LAMBDA(scalar_t base) -> scalar_t { + return std::pow(opmath_t{base}, exp); + }); +#endif +} + +} // anonymous namespace + +void pow_tensor_tensor_kernel(TensorIteratorBase& iter) { + auto common_dtype = iter.common_dtype(); + if (common_dtype == kComplexHalf) { + using scalar_t = c10::complex; + if (iter.is_cpu_scalar(1)) { + const auto base = iter.scalar_value(1); + iter.remove_operand(1); + pow_scalar_tensor_impl(iter, base); + } else if (iter.is_cpu_scalar(2)) { + const auto exp = iter.scalar_value(2); + iter.remove_operand(2); + pow_chalf_tensor_scalar_impl(iter, exp); + } else { + using opmath_t = at::opmath_type; + TORCH_INTERNAL_ASSERT(!iter.is_cpu_scalar(1) && !iter.is_cpu_scalar(2)); +#if AT_USE_JITERATOR() + jitted_gpu_kernel( + iter, pow_kernel_string); +#else + gpu_kernel(iter, [=] GPU_LAMBDA(scalar_t base, scalar_t exp) -> scalar_t { + using opmath_t = at::opmath_type; + return pow_(opmath_t{base}, opmath_t{exp}); + }); +#endif + } + } else { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + kHalf, kBFloat16, iter.common_dtype(), "pow_zoom", [&] { + if (iter.is_cpu_scalar(1)) { + const auto base = iter.scalar_value(1); + iter.remove_operand(1); + pow_scalar_tensor_impl(iter, base); + } else if (iter.is_cpu_scalar(2)) { + const auto exp = iter.scalar_value(2); + iter.remove_operand(2); + pow_tensor_scalar_kernel(iter, exp); + } else { + gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t base, scalar_t exp) -> scalar_t { + return pow_(base, exp); + }); + } + }); + } +} + + 
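The complex paths above lean on the identity pow(base, exp) = exp(log(base) * exp), with log(base) computed once on the host and captured by the kernel lambda as fct. A minimal standalone sketch of that identity (plain host C++, illustrative only and not part of this patch):

    // Illustrative only: checks pow(a, b) == exp(log(a) * b) for complex inputs.
    #include <complex>
    #include <cstdio>

    int main() {
      const std::complex<float> base(2.0f, -1.5f);
      const std::complex<float> exp_val(0.75f, 0.25f);
      const auto fct = std::log(base);              // precomputed once, as in pow_scalar_tensor_impl
      const auto via_identity = std::exp(fct * exp_val);
      const auto direct = std::pow(base, exp_val);
      std::printf("identity: (%f, %f)  direct: (%f, %f)\n",
                  via_identity.real(), via_identity.imag(),
                  direct.real(), direct.imag());
      return 0;
    }

Precomputing the logarithm keeps the per-element work to one complex multiply and one exp, which is why both the GPU lambda and the jiterator string receive fct rather than base.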
+template +void pow_tensor_scalar_kernel_impl(TensorIteratorBase& iter, + Exp_type exp) { + const auto d_exp = static_cast(exp); + // .5 (sqrt), -.5 (rsqrt) and -1 (reciprocal) specializations are handled + // in pow_tensor_scalar_kernel + if (d_exp == 2) { + gpu_kernel(iter, [=]GPU_LAMBDA(Base_type base) -> Base_type { + return base * base; + }); + } else if (d_exp == 3) { + gpu_kernel(iter, [=]GPU_LAMBDA(Base_type base) -> Base_type { + return base * base * base; + }); + } else if (d_exp == -2) { + gpu_kernel(iter, [=]GPU_LAMBDA(Base_type base) -> Base_type { + return 1.0 / (base * base); + }); + } else { + gpu_kernel(iter, [=]GPU_LAMBDA(Base_type base) -> Base_type { + return pow_(base, exp); + }); + } +} + +void pow_tensor_scalar_kernel(TensorIteratorBase& iter, const Scalar& exp_scalar) { + // Dispatch to fast specialization for sqrt, rsqrt and reciprocal + if (!exp_scalar.isComplex()) { + if (exp_scalar.equal(.5)) { + return sqrt_kernel_zoom(iter); + } else if (exp_scalar.equal(-0.5)) { + return rsqrt_kernel_zoom(iter); + } else if (exp_scalar.equal(-1.0)) { + return reciprocal_kernel_zoom(iter); + } + } + if (isComplexType(iter.common_dtype()) || exp_scalar.isComplex()) { + if (iter.common_dtype() == kComplexHalf) { + using scalar_t = c10::complex; + pow_chalf_tensor_scalar_impl(iter, exp_scalar); + return; + } + AT_DISPATCH_COMPLEX_TYPES(iter.common_dtype(), "pow_zoom", [&]() { + const auto exp = exp_scalar.to(); + gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t base) -> scalar_t { + return pow_(base, exp); + }); + }); + } else if (isFloatingType(iter.common_dtype()) || exp_scalar.isIntegral(false)) { + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "pow_zoom", [&]() { + const auto exp = exp_scalar.to(); + pow_tensor_scalar_kernel_impl(iter, exp); + }); + } else { + TORCH_INTERNAL_ASSERT(false, "invalid combination of type in Pow function, common dtype:", iter.common_dtype(), + "exp is integral?", exp_scalar.isIntegral(false)); + } +} + +} // anonymous namespace + +REGISTER_PRIVATEUSE1_DISPATCH(pow_tensor_tensor_stub, &pow_tensor_tensor_kernel); +REGISTER_PRIVATEUSE1_DISPATCH(pow_tensor_scalar_stub, &pow_tensor_scalar_kernel); + +} // namespace at::native \ No newline at end of file From 53deb9560b64702c9d95329f88e00052d9c3b0f4 Mon Sep 17 00:00:00 2001 From: 123epsilon Date: Wed, 25 Dec 2024 23:18:04 +0000 Subject: [PATCH 03/23] minimize, fix build, torchgen logic --- BUILD.bazel | 11 +- CMakeLists.txt | 1 + aten/CMakeLists.txt | 19 +- aten/src/ATen/CMakeLists.txt | 2 +- aten/src/ATen/Context.cpp | 5 - aten/src/ATen/Context.h | 3 - aten/src/ATen/detail/ZoomHooksInterface.h | 4 - aten/src/ATen/native/native_functions.yaml | 32 +- aten/src/ATen/native/zoom/AbsKernel.cu | 42 + aten/src/ATen/native/zoom/AmpKernels.cu | 252 ------ aten/src/ATen/native/zoom/CompareKernels.cu | 103 --- aten/src/ATen/native/zoom/Copy.cu | 63 +- aten/src/ATen/native/zoom/ForeachFunctors.cuh | 681 -------------- aten/src/ATen/native/zoom/MiscUtils.h | 32 - .../src/ATen/native/zoom/MultiTensorApply.cuh | 379 -------- aten/src/ATen/native/zoom/Nonzero.cu | 130 --- aten/src/ATen/native/zoom/Pow.cuh | 58 -- aten/src/ATen/native/zoom/PowKernel.cu | 209 ----- aten/src/ATen/native/zoom/TensorCompare.cu | 133 --- aten/src/ATen/native/zoom/TensorShape.cu | 833 ------------------ .../ATen/native/zoom/TensorTransformations.cu | 154 ---- aten/src/ATen/native/zoom/ZoomScalar.cu | 38 + .../ATen/native/zoom/reduction_template.cuh | 680 ++++++++++++++ aten/src/ATen/templates/UfuncZoom.cu | 17 + 
aten/src/ATen/zoom/ZoomContext.cpp | 1 - aten/src/ATen/zoom/ZoomContextLight.h | 50 +- aten/src/ATen/zoom/detail/ZoomHooks.cpp | 32 - aten/src/ATen/zoom/detail/ZoomHooks.h | 1 - buckbuild.bzl | 4 + build.bzl | 17 +- build.sh | 130 +++ caffe2/CMakeLists.txt | 7 +- cmake/Codegen.cmake | 11 + torch/csrc/zoom/Module.cpp | 162 ---- torchgen/dest/__init__.py | 1 + torchgen/dest/register_dispatch_key.py | 15 + torchgen/dest/ufunc.py | 33 + torchgen/gen.py | 40 +- torchgen/model.py | 12 +- ufunc_defs.bzl | 6 + 40 files changed, 1168 insertions(+), 3235 deletions(-) create mode 100644 aten/src/ATen/native/zoom/AbsKernel.cu delete mode 100644 aten/src/ATen/native/zoom/AmpKernels.cu delete mode 100644 aten/src/ATen/native/zoom/CompareKernels.cu delete mode 100644 aten/src/ATen/native/zoom/ForeachFunctors.cuh delete mode 100644 aten/src/ATen/native/zoom/MiscUtils.h delete mode 100644 aten/src/ATen/native/zoom/MultiTensorApply.cuh delete mode 100644 aten/src/ATen/native/zoom/Nonzero.cu delete mode 100644 aten/src/ATen/native/zoom/Pow.cuh delete mode 100644 aten/src/ATen/native/zoom/PowKernel.cu delete mode 100644 aten/src/ATen/native/zoom/TensorCompare.cu delete mode 100644 aten/src/ATen/native/zoom/TensorShape.cu delete mode 100644 aten/src/ATen/native/zoom/TensorTransformations.cu create mode 100644 aten/src/ATen/native/zoom/ZoomScalar.cu create mode 100644 aten/src/ATen/native/zoom/reduction_template.cuh create mode 100644 aten/src/ATen/templates/UfuncZoom.cu create mode 100644 build.sh diff --git a/BUILD.bazel b/BUILD.bazel index 3f7e6327452c09..c30d8c3df92327 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -9,7 +9,7 @@ load("@pytorch//tools/config:defs.bzl", "if_cuda") load("@pytorch//:aten.bzl", "generate_aten", "intern_build_aten_ops") load(":build.bzl", "GENERATED_AUTOGRAD_CPP", "GENERATED_AUTOGRAD_PYTHON", "define_targets") load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libtorch_core_sources", "libtorch_cuda_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "libtorch_python_core_sources", "torch_cpp_srcs", "libtorch_python_cuda_sources", "libtorch_python_distributed_sources") -load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources") +load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources", "aten_ufunc_generated_zoom_sources") load("//:tools/bazel.bzl", "rules") define_targets(rules = rules) @@ -104,6 +104,12 @@ generated_cuda_cpp = [ "aten/src/ATen/RegisterSparseCsrCUDA.cpp", ] +generated_zoom_cpp = [ + "aten/src/ATen/ZoomFunctions.h", + "aten/src/ATen/ZoomFunctions_inl.h", + "aten/src/ATen/RegisterPrivateUse1.cpp", +] + generate_aten( name = "generated_aten_cpp", srcs = aten_generation_srcs, @@ -112,7 +118,8 @@ generate_aten( generated_cuda_cpp + aten_ufunc_generated_cpu_sources("aten/src/ATen/{}") + aten_ufunc_generated_cpu_kernel_sources("aten/src/ATen/{}") + - aten_ufunc_generated_cuda_sources("aten/src/ATen/{}") + [ + aten_ufunc_generated_cuda_sources("aten/src/ATen/{}") + + aten_ufunc_generated_zoom_sources("aten/src/ATen/{}") + [ "aten/src/ATen/Declarations.yaml", ] ), diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c6320e68d3903..528ebfb8f55a47 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -203,6 +203,7 @@ option(USE_CPP_CODE_COVERAGE "Compile C/C++ with code coverage flags" OFF) option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON) option(USE_ASAN "Use 
Address+Undefined Sanitizers" OFF) option(USE_TSAN "Use Thread Sanitizer" OFF) +option(USE_ZOOM "Use ZOOM HIP Backend" OFF) option(USE_CUDA "Use CUDA" ON) cmake_dependent_option( USE_XPU "Use XPU. Only available on Linux." ON diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index d1459366a2e945..f1753f50c32fdc 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -30,11 +30,13 @@ set(ATen_CUDA_SRCS_W_SORT_BY_KEY) set(ATen_CUDA_TEST_SRCS) set(ATen_CUDA_INCLUDE) set(ATen_NVRTC_STUB_SRCS) +set(ATen_HIPRTC_STUB_SRCS) set(ATen_HIP_SRCS) +set(ATen_ZOOM_SRCS) set(ATen_HIP_SRCS_W_SORT_BY_KEY) set(ATen_HIP_TEST_SRCS) set(ATen_HIP_INCLUDE) -set(ATen_ZOOM_SRCS) +set(ATen_ZOOM_INCLUDE) set(ATen_MPS_SRCS) set(ATen_MPS_TEST_SRCS) set(ATen_XPU_SRCS) @@ -45,6 +47,7 @@ set(ATen_CPU_DEPENDENCY_LIBS) set(ATen_XPU_DEPENDENCY_LIBS) set(ATen_CUDA_DEPENDENCY_LIBS) set(ATen_HIP_DEPENDENCY_LIBS) +set(ATen_ZOOM_DEPENDENCY_LIBS) set(ATen_PUBLIC_CUDA_DEPENDENCY_LIBS) set(ATen_PUBLIC_HIP_DEPENDENCY_LIBS) set(ATEN_INSTALL_BIN_SUBDIR "bin" CACHE PATH "ATen install binary subdirectory") @@ -71,6 +74,17 @@ if(USE_ROCM) endif() endif() +if(USE_ZOOM) + include(LoadHIP) + if(NOT PYTORCH_FOUND_HIP) + message(WARNING "Could not load HIP, setting USE_ZOOM = OFF") + set(USE_ZOOM OFF) + else() + message(STATUS "Loaded HIP, Zoom Enabled") + endif() +endif() + + # Both CUDA and ROCM are enabled and found. Report an error. if(USE_CUDA AND USE_ROCM) message(FATAL_ERROR "Both CUDA and ROCm are enabled and found. PyTorch can only be built with either of them. Please turn one off by using either USE_CUDA=OFF or USE_ROCM=OFF.") @@ -124,6 +138,7 @@ set(ATen_HIP_SRCS_W_SORT_BY_KEY ${ATen_HIP_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) set(ATen_XPU_SRCS ${ATen_XPU_SRCS} PARENT_SCOPE) set(ATen_XPU_TEST_SRCS ${ATen_XPU_TEST_SRCS} PARENT_SCOPE) set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE) +set(ATen_HIPRTC_STUB_SRCS ${ATen_HIPRTC_STUB_SRCS} PARENT_SCOPE) set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) set(ATen_HIP_TEST_SRCS ${ATen_HIP_TEST_SRCS} PARENT_SCOPE) @@ -134,12 +149,14 @@ set(ATen_VEC_TEST_SRCS ${ATen_VEC_TEST_SRCS} PARENT_SCOPE) set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE) set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE) set(ATen_HIP_INCLUDE ${ATen_HIP_INCLUDE} PARENT_SCOPE) +set(ATen_ZOOM_INCLUDE ${ATen_ZOOM_INCLUDE} PARENT_SCOPE) set(ATen_XPU_INCLUDE ${ATen_XPU_INCLUDE} PARENT_SCOPE) set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) set(ATen_XPU_DEPENDENCY_LIBS ${ATen_XPU_DEPENDENCY_LIBS} PARENT_SCOPE) set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) set(ATen_HIP_DEPENDENCY_LIBS ${ATen_HIP_DEPENDENCY_LIBS} PARENT_SCOPE) +set(ATen_ZOOM_DEPENDENCY_LIBS ${ATen_ZOOM_DEPENDENCY_LIBS} PARENT_SCOPE) set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE) set(FLASH_ATTENTION_CUDA_SOURCES ${FLASH_ATTENTION_CUDA_SOURCES} PARENT_SCOPE) set(MEM_EFF_ATTENTION_CUDA_SOURCES ${MEM_EFF_ATTENTION_CUDA_SOURCES} PARENT_SCOPE) diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 42ca9254a64885..684b2c4cdeb905 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -613,7 +613,7 @@ endif() if(USE_ZOOM) set(ATen_ZOOM_SRCS ${all_zoom_cpp}) set(ATen_HIPRTC_STUB_SRCS ${zoom_hiprtc_stub_cpp}) - # list(APPEND ATen_ZOOM_DEPENDENCY_LIBS ATEN_ZOOM_FILES_GEN_LIB) + list(APPEND 
ATen_ZOOM_DEPENDENCY_LIBS ATEN_ZOOM_FILES_GEN_LIB) endif() set(ATEN_INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/${AT_INSTALL_INCLUDE_DIR}") diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 1136b05b265491..20679ab7ff5afa 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -153,7 +153,6 @@ static const char* const cublas_deterministic_configs[] = { ":4096:8", ":16:8" } bool Context::checkCuBLASConfigDeterministic() { bool cublas_config_deterministic = true; - #ifndef USE_ZOOM // If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config // is set to deterministic setting if (hasCUDART() && (versionCUDART() >= 10020)) { @@ -164,10 +163,6 @@ bool Context::checkCuBLASConfigDeterministic() { ); } return cublas_config_deterministic; - #else - // Zoom uses hipBLAS with the rocBLAS backend - this is only deterministic if atomics are disabled - return checkHIPBlasDeterministic(); - #endif } void Context::alertCuBLASConfigNotDeterministic() const { diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index f241e91be6f731..4b71d3813353cd 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -127,9 +127,6 @@ class TORCH_API Context { static bool hasCuBLASLt() { return detail::getCUDAHooks().hasCuBLASLt(); } - static bool checkHIPBlasDeterministic() { - return detail::getZoomHooks().checkHIPBlasDeterministic(); - } static bool hasHIP() { return detail::getHIPHooks().hasHIP(); } diff --git a/aten/src/ATen/detail/ZoomHooksInterface.h b/aten/src/ATen/detail/ZoomHooksInterface.h index 0e971a17e5a9c9..02bdd94ff1dada 100644 --- a/aten/src/ATen/detail/ZoomHooksInterface.h +++ b/aten/src/ATen/detail/ZoomHooksInterface.h @@ -91,10 +91,6 @@ struct TORCH_API ZoomHooksInterface : PrivateUse1HooksInterface { return false; } - virtual bool checkHIPBlasDeterministic() const { - TORCH_CHECK(false, "Cannot call checkHIPBlasDeterministic without torch_zoom library", ZOOM_HELP); - } - virtual const at::zoom::HIPRTC& hiprtc() const { TORCH_CHECK(false, "HIPRTC requires Zoom. ", ZOOM_HELP); } diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 10d8b1ad79cadf..b28fcfbfc2732e 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -354,7 +354,7 @@ - func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: abs_out + CPU, CUDA, PrivateUse1: abs_out MPS: abs_out_mps SparseCPU, SparseCUDA: abs_sparse_out SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr_out @@ -413,12 +413,12 @@ - func: view_as_real(Tensor(a) self) -> Tensor(a) variants: function dispatch: - CPU, CUDA, MPS, Meta: view_as_real + CPU, CUDA, PrivateUse1, MPS, Meta: view_as_real - func: view_as_complex(Tensor(a) self) -> Tensor(a) variants: function dispatch: - CPU, CUDA, MPS, Meta: view_as_complex + CPU, CUDA, PrivateUse1, MPS, Meta: view_as_complex - func: sgn(Tensor self) -> Tensor variants: function, method @@ -931,7 +931,7 @@ - func: as_strided(Tensor(a) self, SymInt[] size, SymInt[] stride, SymInt? 
storage_offset=None) -> Tensor(a) variants: function, method dispatch: - ZeroTensor, CPU, CUDA: as_strided_tensorimpl + ZeroTensor, CPU, CUDA, PrivateUse1: as_strided_tensorimpl Meta: as_strided_tensorimpl_meta_symint MPS: as_strided_tensorimpl_mps QuantizedCPU, QuantizedCUDA: as_strided_qtensorimpl @@ -2367,6 +2367,7 @@ dispatch: CPU: empty_cpu CUDA: empty_cuda + PrivateUse1: empty_zoom MPS: empty_mps Meta: empty_meta_symint MkldnnCPU: empty_mkldnn @@ -2444,6 +2445,7 @@ Meta: resize__symint CPU: resize_ CUDA: resize_cuda_ + PrivateUse1: resize_zoom_ MPS: resize_mps_ QuantizedCPU: quantized_resize_cpu_ SparseCsrCPU, SparseCsrCUDA: resize_sparse_csr_ @@ -2485,6 +2487,7 @@ dispatch: CPU: empty_strided_cpu CUDA: empty_strided_cuda + PrivateUse1: empty_strided_zoom MPS: empty_strided_mps Meta: empty_strided_meta_symint QuantizedCPU, QuantizedCUDA: empty_strided_unknown_quantized @@ -2634,12 +2637,14 @@ dispatch: CPU, Meta: eye_out_cpu CUDA: eye_out_cuda + PrivateUse1: eye_out_zoom MPS: eye_out_mps - func: eye.m_out(SymInt n, SymInt m, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, Meta: eye_out_cpu CUDA: eye_out_cuda + PrivateUse1: eye_out_zoom MPS: eye_out_mps - func: flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a) @@ -2679,7 +2684,7 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: - CPU, CUDA: fill_ + CPU, CUDA, PrivateUse1: fill_ MPS: fill_scalar_mps QuantizedCPU, QuantizedCUDA: fill_quantized_ Meta: fill_meta_ @@ -2691,7 +2696,7 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: - CPU, CUDA: fill_ + CPU, CUDA, PrivateUse1: fill_ MPS: fill_tensor_mps_ QuantizedCPU, QuantizedCUDA: fill_quantized_ Meta: fill_meta_ @@ -6501,6 +6506,7 @@ dispatch: CPU: _efficientzerotensor CUDA: _efficientzerotensor_cuda + PrivateUse1: _efficientzerotensor_zoom MPS: _efficientzerotensor_mps Meta: _efficientzerotensor_meta_symint autogen: _efficientzerotensor.out @@ -7726,6 +7732,7 @@ dispatch: CPU: _local_scalar_dense_cpu CUDA: _local_scalar_dense_cuda + PrivateUse1: _local_scalar_dense_zoom MPS: _local_scalar_dense_mps variants: function @@ -7863,6 +7870,7 @@ CPU: set_storage_cpu_ Meta: set_storage_meta__symint CUDA: set_storage_cuda_ + PrivateUse1: set_storage_zoom_ MPS: set_storage_mps_ QuantizedCPU, QuantizedCUDA: set_storage_quantized_ autogen: set.source_Storage_storage_offset, set.source_Storage_storage_offset_out @@ -7890,6 +7898,7 @@ dispatch: CPU: set_cpu_ CUDA: set_cuda_ + PrivateUse1: set_zoom_ Meta: set_meta_ MPS: set_mps_ autogen: set, set.out @@ -7998,7 +8007,7 @@ device_check: NoCheck device_guard: False dispatch: - ZeroTensor, Meta, CPU, CUDA, QuantizedCPU, QuantizedCUDA, MPS: view + ZeroTensor, Meta, CPU, CUDA, PrivateUse1, QuantizedCPU, QuantizedCUDA, MPS: view MkldnnCPU: mkldnn_view NestedTensorCPU, NestedTensorCUDA: view_nested tags: core @@ -8765,7 +8774,7 @@ structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: ne_Scalar_out + CPU, CUDA, PrivateUse1: ne_Scalar_out MPS: ne_scalar_out_mps QuantizedCPU: ne_out_quantized_cpu tags: pointwise @@ -8783,7 +8792,7 @@ structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: ne_Tensor_out + CPU, CUDA, PrivateUse1: ne_Tensor_out MPS: ne_tensor_out_mps QuantizedCPU: ne_out_quantized_cpu tags: pointwise @@ -8828,7 +8837,7 @@ structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: eq_Scalar_out + CPU, CUDA, PrivateUse1: 
eq_Scalar_out MPS: eq_scalar_out_mps QuantizedCPU: eq_out_quantized_cpu tags: pointwise @@ -8847,7 +8856,7 @@ structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: eq_Tensor_out + CPU, CUDA, PrivateUse1: eq_Tensor_out MPS: eq_tensor_out_mps QuantizedCPU: eq_out_quantized_cpu tags: pointwise @@ -10123,6 +10132,7 @@ dispatch: CPU: cpu_equal CUDA: cuda_equal + PrivateUse1: zoom_equal MPS: mps_equal QuantizedCPU: equal_quantized_cpu diff --git a/aten/src/ATen/native/zoom/AbsKernel.cu b/aten/src/ATen/native/zoom/AbsKernel.cu new file mode 100644 index 00000000000000..dd6dc56f646bf9 --- /dev/null +++ b/aten/src/ATen/native/zoom/AbsKernel.cu @@ -0,0 +1,42 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include + +namespace at::native { + + +CONSTEXPR_EXCEPT_WIN_CUDA constexpr char abs_name[] = "abs_kernel"; +void abs_kernel_zoom(TensorIteratorBase& iter) { + auto dtype = iter.dtype(); + static const auto abs_string = jiterator_stringify( + template T abs_kernel(T x) { return std::abs(x); }); + if (at::isComplexType(dtype)) { + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "abs_zoom", [&]() { + jitted_gpu_kernel< + /*name=*/abs_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, abs_string); + }); + } else { + AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Half, + ScalarType::BFloat16, + ScalarType::Bool, + iter.dtype(), + "abs_zoom", [&]() { + jitted_gpu_kernel< + /*name=*/abs_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, abs_string); + }); + } +} + +REGISTER_PRIVATEUSE1_DISPATCH(abs_stub, &abs_kernel_zoom); + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/AmpKernels.cu b/aten/src/ATen/native/zoom/AmpKernels.cu deleted file mode 100644 index 14fa799fd6d283..00000000000000 --- a/aten/src/ATen/native/zoom/AmpKernels.cu +++ /dev/null @@ -1,252 +0,0 @@ -#define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#define _USE_MATH_DEFINES - -#include - -#include -#include -#include -#include -#include -#include -#include - - -namespace { -// Thin wrapper around https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html#group__CUDA__MATH__SINGLE_1g57a3c8313f570282a1a7bcc78743b08e, -// to ensure the Cuda math library's isfinite is actually what gets called in -// _amp_non_finite_check_and_unscale_cuda_'s gpu_kernel lambda. -// -// isfinite_ensure_cuda_math is defined outside at::native because: -// - A bare call to "isfinite(val)" inside at::native causes nvcc to prefer the unrelated -// Tensor at::native::isfinite(const Tensor&), resulting in an error: -// "no suitable constructor exists to convert from "float" to "at::Tensor"" -// - Unfortunately, the Cuda math library documentation doesn't say how (or if) you can provide a full namespace path -// to ensure that its version of a particular function is invoked. It only shows bare (not-namespaced) -// calls to its routines inside kernel or device functions. -// - "std::isfinite(val)" in the gpu_kernel lambda causes an "unspecified launch failure" at runtime with cuda 9 on Windows. -// -// isfinite_ensure_cuda_math, declared at file scope outside the at::native region, uses isfinite as math library docs -// suggest and allows disambiguated usage in the lambda within the at::native region. 
-// GPU_LAMBDA is defined as __host__ __device__ (see Loops.cuh), so I need the __host__ keyword or else nvcc complains that -// "calling a __device__ function("isfinite_ensure_cuda_math") from a __host__ __device__ function("operator()") is not allowed." -static __host__ __device__ __forceinline__ int isfinite_ensure_zoom_math(float val) { - return isfinite(val); -} -} - -namespace at::native { - -namespace { -// Single-tensor fallback for _amp_foreach_non_finite_check_and_unscale_zoom_. -// Handles individual tensors that are acceptable to unscale but not MTA-safe. -void _amp_non_finite_check_and_unscale_zoom_(Tensor& scaled_grad, - Tensor& found_inf, - const Tensor& inv_scale) -{ - // The only way we reach this function is through _amp_foreach_non_finite_check_and_unscale_zoom_, so no input checks. - - // It's not obvious gpu_kernel always guards onto its argument. Guarding here just in case. - const OptionalDeviceGuard device_guard(device_of(scaled_grad)); - - // Acts on scaled_grad in place. - auto iter = TensorIterator::unary_op(scaled_grad, scaled_grad); - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - iter.dtype(), - "_amp_non_finite_check_and_unscale_zoom", - [&iter, &found_inf, &inv_scale] { - auto* found_inf_ptr = found_inf.mutable_data_ptr(); - auto* inv_scale_ptr = inv_scale.const_data_ptr(); - - using opmath_t = at::opmath_type; - - gpu_kernel(iter, - [found_inf_ptr, inv_scale_ptr] GPU_LAMBDA (scalar_t val_in) -> scalar_t { - auto val = static_cast(val_in); - if (!isfinite_ensure_zoom_math(val)) { - *found_inf_ptr = 1.f; - } - // Every thread accesses inv_scale, but it will hit in cache. - const auto inv_scale_val = *inv_scale_ptr; - return static_cast(inv_scale_val == 1.f ? val : val * inv_scale_val); - }); - }); -} -} // anonymous namespace - - -// Multiplies each tensor in scaled_grads by inv_scale in-place. -// If any element of any tensor in scaled_grads is inf or NaN, sets found_inf to 1.0. -// Uses multi tensor apply (MTA) to process all MTA-safe tensors. -// -// Args: -// scaled_grads: A TensorList of scaled gradient tensors. May contain infs or NaNs. -// found_inf: A single-element float tensor to which 1.0 will be written if any gradient contain infs/nans. -// Pre-zeroing found_inf, if appropriate, is the responsibility of the caller. -// inv_scale: The inverse of the scale factor by which scaled_grads are currently multiplied. -void _amp_foreach_non_finite_check_and_unscale_zoom_(TensorList scaled_grads, - Tensor& found_inf, - const Tensor& inv_scale) -{ - if (scaled_grads.size() == 0) { - return; - } - - TORCH_CHECK(inv_scale.is_privateuseone(), "inv_scale must be a Zoom tensor."); - TORCH_CHECK(found_inf.is_privateuseone(), "found_inf must be a Zoom tensor."); - TORCH_CHECK(inv_scale.numel() == 1, "inv_scale must be a 1-element tensor."); - TORCH_CHECK(found_inf.numel() == 1, "found_inf must be a 1-element tensor."); - TORCH_CHECK(inv_scale.scalar_type() == at::ScalarType::Float, "inv_scale must be a float tensor."); - TORCH_CHECK(found_inf.scalar_type() == at::ScalarType::Float, "found_inf must be a float tensor."); - - // Ensures client code (GradScaler) filtered scaled_grads by dtype. - check_foreach_api_restrictions(scaled_grads); - - std::vector> tensor_lists; - - // is_non_overlapping_and_dense() is not available in Python. - // GradScaler can't filter for it. We need to filter here. - if (can_use_fast_route(scaled_grads)) { - // Hopefully common case. 
- // can_use_fast_route is true, which confirms: - // - all scaled_grads are strided - // - all scaled_grads are non overlapping and dense - // - all scaled_grads are on the same device - // - all scaled_grads are of the same dtype - TORCH_CHECK(scaled_grads[0].is_privateuseone(), "scaled_grads must be Zoom tensors."); - // Sets up MTA launch to use scaled_grads as-is. - tensor_lists.emplace_back(scaled_grads.vec()); - } else { - // Hopefully uncommon case. - // can_use_fast_route is an all-or-nothing check. In this path it was false, - // so any of the above confirmations could have gone wrong. - // We filter MTA-safe tensors into an MTA-able list. - // If a tensor is acceptable but not MTA-safe, we fall back to the TensorIterator kernel. - // If a tensor is unacceptable, we throw an error to blame GradScaler. - tensor_lists.resize(1); - tensor_lists[0].reserve(scaled_grads.size()); - auto expected_device = scaled_grads[0].device(); - const auto expected_dtype = scaled_grads[0].scalar_type(); - for (const Tensor& t : scaled_grads) { - // Ensures GradScaler filtered scaled_grads by device. - TORCH_CHECK(t.is_privateuseone(), "one of scaled_grads was not a Zoom tensor."); - TORCH_CHECK(t.device() == expected_device, "scaled_grads must be on the same device."); - TORCH_CHECK(t.layout() == at::kStrided, "one of scaled_grads was not a strided tensor."); - if (!t.is_non_overlapping_and_dense() || t.scalar_type() != expected_dtype) { - // t is acceptable but not MTA-safe. Falls back to single-tensor TensorIterator kernel. - _amp_non_finite_check_and_unscale_zoom_(const_cast(t), - found_inf, - inv_scale); - } else { - tensor_lists[0].push_back(t); - } - } - if (tensor_lists[0].size() == 0) { - return; - } - } - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - tensor_lists[0][0].scalar_type(), - "_amp_foreach_non_finite_check_and_unscale_zoom", - [&tensor_lists, &found_inf, &inv_scale] { - auto* found_inf_ptr = found_inf.mutable_data_ptr(); - auto* inv_scale_ptr = inv_scale.const_data_ptr(); - - using opmath_t = at::opmath_type; - - // multi_tensor_apply guards onto tensor_lists[0][0], no need to guard explicitly. - multi_tensor_apply<1>(tensor_lists, - UnaryOpFunctor(), - [found_inf_ptr, inv_scale_ptr] GPU_LAMBDA (opmath_t val) -> opmath_t { - // There is a slight asymmetry here with the TensorIterator kernel above. - // MTA Functors ensure val comes in as opmath_t rather than scalar_t. - if (!isfinite_ensure_zoom_math(val)) { - *found_inf_ptr = 1.f; - } - // Every thread accesses inv_scale, but it will hit in cache. - const auto inv_scale_val = *inv_scale_ptr; - return static_cast(inv_scale_val == 1.f ? val : val * inv_scale_val); - }); - }); -} - - -// amp_update_scale_zoom_kernel is launched with a single thread to compute the new scale. -// The scale factor is maintained and updated on the GPU to avoid synchronization. -__global__ void amp_update_scale_zoom_kernel(float* current_scale, - int* growth_tracker, - const float* found_inf, - double growth_factor, - double backoff_factor, - int growth_interval) -{ - if (*found_inf) { - *current_scale = (*current_scale)*backoff_factor; - *growth_tracker = 0; - } else { - // Entering this branch means we just carried out a successful step, - // so growth_tracker is incremented before comparing to growth_interval. - auto successful = (*growth_tracker) + 1; - if (successful == growth_interval) { - auto new_scale = static_cast((*current_scale)*growth_factor); - // Do not grow the scale past fp32 bounds to inf. 
- if (isfinite_ensure_zoom_math(new_scale)) { - *current_scale = new_scale; - } - *growth_tracker = 0; - } else { - *growth_tracker = successful; - } - } -} - - -// _amp_update_scale_zoom asynchronously updates the scale tensor in place. -// -// Args: -// current_scale: A one-element zoom float tensor containing the scale value. -// growth_tracker: A one-element torch.zoom.IntTensor containing the number of recent consecutive unskipped steps. -// found_inf: A one-element zoom float tensor. If > 0, indicates that infs/nans were found by the relevant -// prior _amp_non_finite_check_and_unscale_zoom call, and 0 if no infs/nans were found. -// growth_factor: Multiplier if no infs/NaNs were found (typically slightly > 1). -// backoff_factor: Multiplier if infs/NaNs were found (typically 0.5). -// growth_interval: Number of consecutive unskipped steps that must occur for current_scale to be multiplied by -// growth_factor. -// -// Returns: -// current_scale -Tensor& _amp_update_scale_zoom_(Tensor& current_scale, - Tensor& growth_tracker, - const Tensor& found_inf, - double growth_factor, - double backoff_factor, - int64_t growth_interval) -{ - TORCH_CHECK(growth_tracker.is_privateuseone(), "growth_tracker must be a Zoom tensor."); - TORCH_CHECK(current_scale.is_privateuseone(), "current_scale must be a Zoom tensor."); - TORCH_CHECK(found_inf.is_privateuseone(), "found_inf must be a Zoom tensor."); - TORCH_CHECK(growth_tracker.numel() == 1, "growth_tracker must be a 1-element tensor."); - TORCH_CHECK(current_scale.numel() == 1, "current_scale must be a 1-element tensor."); - TORCH_CHECK(found_inf.numel() == 1, "found_inf must be a 1-element tensor."); - TORCH_CHECK(growth_tracker.scalar_type() == at::ScalarType::Int, "growth_tracker must be an int tensor."); - TORCH_CHECK(current_scale.scalar_type() == at::ScalarType::Float, "current_scale must be a float tensor."); - TORCH_CHECK(found_inf.scalar_type() == at::ScalarType::Float, "found_inf must be a float tensor."); - - amp_update_scale_zoom_kernel<<<1, 1, 0, c10::zoom::getCurrentZoomStream()>>>( - current_scale.mutable_data_ptr(), - growth_tracker.mutable_data_ptr(), - found_inf.const_data_ptr(), - growth_factor, - backoff_factor, - growth_interval); - C10_ZOOM_KERNEL_LAUNCH_CHECK(); - - return current_scale; -} - -} // namespace at::native diff --git a/aten/src/ATen/native/zoom/CompareKernels.cu b/aten/src/ATen/native/zoom/CompareKernels.cu deleted file mode 100644 index 21da608a35fc94..00000000000000 --- a/aten/src/ATen/native/zoom/CompareKernels.cu +++ /dev/null @@ -1,103 +0,0 @@ -#define TORCH_ASSERT_NO_OPERATORS -#include -#include -#include -#include -#include - - -// NOTE: CUDA on Windows requires that the enclosing function -// of a __device__ lambda not have internal linkage. 
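The docstring of the removed _amp_update_scale_zoom_ above spells out the dynamic loss-scale rule: back off when infs/NaNs were found, otherwise grow after growth_interval consecutive clean steps, and never grow the scale past fp32 range. A minimal host-side sketch of that rule (illustrative only; ScaleState and update_scale are hypothetical stand-ins for the one-element GPU tensors the kernel operates on):

    // Illustrative sketch of the dynamic loss-scale update described above.
    #include <cmath>

    struct ScaleState {
      float current_scale;
      int growth_tracker;   // consecutive unskipped steps so far
    };

    void update_scale(ScaleState& s, bool found_inf,
                      double growth_factor, double backoff_factor, int growth_interval) {
      if (found_inf) {
        s.current_scale = static_cast<float>(s.current_scale * backoff_factor);
        s.growth_tracker = 0;
      } else {
        const int successful = s.growth_tracker + 1;
        if (successful == growth_interval) {
          const float new_scale = static_cast<float>(s.current_scale * growth_factor);
          if (std::isfinite(new_scale)) {   // do not grow the scale to inf
            s.current_scale = new_scale;
          }
          s.growth_tracker = 0;
        } else {
          s.growth_tracker = successful;
        }
      }
    }

The real kernel runs this with a single thread so the scale can stay resident on the device and the update never forces a host synchronization.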
- -namespace at::native { namespace { - -enum class OpType {GE, GT, LE, LT}; - -template -struct CompareFunctor{ - constexpr CompareFunctor(OpType op): op_(op) {}; - OpType op_; - __device__ __forceinline__ bool operator() (scalar_t a, scalar_t b) const { - if (op_ == OpType::GE) { - return a >= b; - } else if (op_ == OpType::GT) { - return a > b; - } else if (op_ == OpType::LE) { - return a <= b; - } else { //LT - return a < b; - } - } -}; - -// Reflects the comparison operator, so reflect(op)(a, b) == op(b, a) -OpType reflect(OpType x) { - switch (x) { - case OpType::GE: return OpType::LE; - case OpType::GT: return OpType::LT; - case OpType::LE: return OpType::GE; - case OpType::LT: return OpType::GT; - } - TORCH_INTERNAL_ASSERT(false, "Invalid OpType"); -} - -} // namespace (anonymous) - -template -void compare_scalar_kernel(TensorIteratorBase &iter, OpType op, scalar_t rhs) { - CompareFunctor f(op); - gpu_kernel(iter, [=] GPU_LAMBDA (scalar_t lhs) -> bool { - return f(lhs, rhs); - }); -} - -template -void compare_kernel_impl(TensorIteratorBase &iter, OpType op) { - // If either input is a cpu scalar, perform the equivalent comparison - // where the scalar is on the right hand side. This saves us from - // generating two otherwise identical kernels with mirrored - // arguments. - if (iter.is_cpu_scalar(1)) { - const scalar_t lhs = iter.scalar_value(1); - iter.remove_operand(1); - const DeviceGuard device_guard(iter.device(1)); - compare_scalar_kernel(iter, reflect(op), lhs); - } else if (iter.is_cpu_scalar(2)) { - const scalar_t rhs = iter.scalar_value(2); - iter.remove_operand(2); - compare_scalar_kernel(iter, op, rhs); - } else { - CompareFunctor f(op); - gpu_kernel(iter, f); - } -} - -C10_NOINLINE void compare_kernel_with_scalars(TensorIteratorBase &iter, OpType op) { - AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "compare_zoom", [&]() { - compare_kernel_impl(iter, op); - }); -} - - -void ge_kernel_zoom(TensorIteratorBase& iter) { - compare_kernel_with_scalars(iter, OpType::GE); -} - -void gt_kernel_zoom(TensorIteratorBase& iter) { - compare_kernel_with_scalars(iter, OpType::GT); -} - -void le_kernel_zoom(TensorIteratorBase& iter) { - compare_kernel_with_scalars(iter, OpType::LE); -} - -void lt_kernel_zoom(TensorIteratorBase& iter) { - compare_kernel_with_scalars(iter, OpType::LT); -} - -REGISTER_PRIVATEUSE1_DISPATCH(ge_stub, &ge_kernel_zoom); -REGISTER_PRIVATEUSE1_DISPATCH(gt_stub, >_kernel_zoom); -REGISTER_PRIVATEUSE1_DISPATCH(le_stub, &le_kernel_zoom); -REGISTER_PRIVATEUSE1_DISPATCH(lt_stub, <_kernel_zoom); - -} // namespace at::native diff --git a/aten/src/ATen/native/zoom/Copy.cu b/aten/src/ATen/native/zoom/Copy.cu index 3415806851f9fd..57436f844beedc 100644 --- a/aten/src/ATen/native/zoom/Copy.cu +++ b/aten/src/ATen/native/zoom/Copy.cu @@ -11,6 +11,7 @@ #include #include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -23,8 +24,66 @@ namespace at::native { -void neg_kernel_zoom(TensorIteratorBase &iter); -void conj_kernel_zoom(TensorIteratorBase &iter); +// forward decl, defined below +void direct_copy_kernel_zoom(TensorIteratorBase &iter); + +// NB: Ignores the negative bit on tensors +CONSTEXPR_EXCEPT_WIN_CUDA char neg_name[] = "neg_kernel"; +void neg_kernel_zoom(TensorIteratorBase& iter) { + auto dtype = iter.dtype(); + if (at::isComplexType(dtype)) { + static const auto neg_string = jiterator_stringify( + template + T neg_kernel(T a) { + return -a; + } + ); // neg_string + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, 
"neg_zoom", [&]() { + jitted_gpu_kernel< + /*name=*/ neg_name, + /*return_dtype=*/ scalar_t, + /*common_dtype=*/ scalar_t, + /*arity=*/ 1>(iter, neg_string); + }); + + } else { + AT_DISPATCH_ALL_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, dtype, "neg_zoom", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return -a; + }); + }); + } +} + +// NB: Ignores the negative bit on tensors +CONSTEXPR_EXCEPT_WIN_CUDA char conj_name[] = "conj_kernel"; +void conj_kernel_zoom(TensorIteratorBase& iter) { + auto conj_chalf = [&] { + using scalar_t = c10::complex; + + static const auto conj_string = jiterator_stringify( + template + T conj_kernel(T z) { + return std::conj(z); + } + ); + jitted_gpu_kernel(iter, conj_string); + + }; + + AT_DISPATCH_SWITCH(iter.common_dtype(), "conj_zoom", + AT_DISPATCH_CASE_ALL_TYPES_AND3(kBool, kBFloat16, kHalf, [&] { + // Conj is a no-op for non-complex types + direct_copy_kernel_zoom(iter); + }) + AT_DISPATCH_CASE_COMPLEX_TYPES([&] { + gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { + return std::conj(a); + }); + }) + AT_DISPATCH_CASE(kComplexHalf, conj_chalf) + ); +} void float8_copy_kernel_zoom(TensorIteratorBase &iter) { ScalarType dtype = iter.dtype(0); diff --git a/aten/src/ATen/native/zoom/ForeachFunctors.cuh b/aten/src/ATen/native/zoom/ForeachFunctors.cuh deleted file mode 100644 index 869e6fa3fd4389..00000000000000 --- a/aten/src/ATen/native/zoom/ForeachFunctors.cuh +++ /dev/null @@ -1,681 +0,0 @@ -#pragma once -#include -#include -#include -#include - -namespace at::native { - -namespace { - -// TODO(crcrpar): Handle version bump in codegen. -// rel: -// https://github.com/pytorch/pytorch/blob/9cf84347767c8abb8feba18a9a1baba321eeb8b9/tools/autograd/gen_inplace_or_view_type.py#L481-L482 -inline void increment_version(TensorList tensors) { - for (const auto& t : tensors) { - t.unsafeGetTensorImpl()->bump_version(); - } -} - -// Initializes args and checks if all args are aligned -template -__device__ bool init_args( - T** args, - TensorListMetadata& tl, - const int64_t chunk_idx, - const int64_t chunk_size, - const int64_t tensor_loc) { - bool all_aligned = true; - for (int i = 0; i < depth; i++) { - args[i] = (T*)tl.addresses[i][tensor_loc]; - args[i] += chunk_idx * chunk_size; - - if (!is_aligned(args[i])) { - all_aligned = false; - } - } - return all_aligned; -} - -// Initializes args and checks if all args are aligned -template -__device__ bool init_args( - T** args, - TensorListScalarListMetadata& tl, - const int64_t chunk_idx, - const int64_t chunk_size, - const int64_t tensor_loc) { - bool all_aligned = true; - for (int i = 0; i < depth; i++) { - args[i] = (T*)tl.addresses[i][tensor_loc]; - args[i] += chunk_idx * chunk_size; - - if (!is_aligned(args[i])) { - all_aligned = false; - } - } - return all_aligned; -} - -template -__device__ bool init_args( - T** args, - FusedOptimizerTensorListMetadata& tl, - const int64_t chunk_idx, - const int64_t chunk_size, - const int64_t tensor_loc) { - bool all_aligned = true; - for (int i = 0; i < depth; i++) { - args[i] = (T*)tl.addresses[i][tensor_loc]; - args[i] += chunk_idx * chunk_size; - - if (!is_aligned(args[i])) { - all_aligned = false; - } - } - return all_aligned; -} - -template -__device__ void load_args( - T r_args[][kILP], - T** args, - const int64_t i_start, - const int64_t chunk_size, - const int64_t n) { -#pragma unroll - for (int ii = 0; ii < kILP; ii++) { - const auto i = i_start + threadIdx.x + ii * blockDim.x; - for (int r_index = 0; r_index < depth; r_index++) { - 
r_args[r_index][ii] = 0; - if (i < n && i < chunk_size) { - r_args[r_index][ii] = args[r_index][i]; - } - } - } -} - -template -__device__ void store_args( - T* dst, - T* src, - const int64_t i_start, - const int64_t chunk_size, - const int64_t n) { -#pragma unroll - for (int ii = 0; ii < kILP; ii++) { - const int64_t i = i_start + threadIdx.x + ii * blockDim.x; - if (i < n && i < chunk_size) - dst[i] = src[ii]; - } -} - -template -__device__ __forceinline__ void binary_op_scalar( - T r_args[][kILP], - T** args, - opmath_t scalar, - const int64_t n, - const int64_t chunk_size, - const bool all_aligned, - Op op) { - // to make things simple, we put aligned case in a different code path - if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { - for (int64_t i_start = threadIdx.x; - i_start * kILP < n && i_start * kILP < chunk_size; - i_start += blockDim.x) { - // load - load_store(r_args[0], args[0], 0, i_start); -#pragma unroll - for (int ii = 0; ii < kILP; ii++) { - r_args[0][ii] = static_cast( - op(static_cast(r_args[0][ii]), - static_cast(scalar))); - } - // store - load_store(args[res_arg_index], r_args[0], i_start, 0); - } - } else { - for (int64_t i_start = 0; i_start < n && i_start < chunk_size; - i_start += blockDim.x * kILP) { - // Regardless if depth is 1 (for inplace) or 2 (for out of place), r_args - // has depth 1 - load_args<1>(r_args, args, i_start, chunk_size, n); -#pragma unroll - for (int ii = 0; ii < kILP; ii++) { - r_args[0][ii] = static_cast( - op(static_cast(r_args[0][ii]), - static_cast(scalar))); - } - store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); - } - } -} - -template -__device__ __forceinline__ void pointwise_op_scalar( - T r_args[][kILP], - T** args, - opmath_t scalar, - const int64_t n, - const int64_t chunk_size, - const bool all_aligned, - Op op) { - // to make things simple, we put aligned case in a different code path - if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { - for (int64_t i_start = threadIdx.x; - i_start * kILP < n && i_start * kILP < chunk_size; - i_start += blockDim.x) { - // load - load_store(r_args[0], args[0], 0, i_start); - load_store(r_args[1], args[1], 0, i_start); - load_store(r_args[2], args[2], 0, i_start); -#pragma unroll - for (int ii = 0; ii < kILP; ii++) { - r_args[0][ii] = static_cast( - static_cast(r_args[0][ii]) + - scalar * - op(static_cast(r_args[1][ii]), - static_cast(r_args[2][ii]))); - } - // store - load_store(args[res_arg_index], r_args[0], i_start, 0); - } - } else { - for (int64_t i_start = 0; i_start < n && i_start < chunk_size; - i_start += blockDim.x * kILP) { - // Regardless if depth is 3 (for inplace) or 4 (for out of place), r_args - // has depth 3 - load_args<3>(r_args, args, i_start, chunk_size, n); -#pragma unroll - for (int ii = 0; ii < kILP; ii++) { - r_args[0][ii] = static_cast( - static_cast(r_args[0][ii]) + - scalar * - op(static_cast(r_args[1][ii]), - static_cast(r_args[2][ii]))); - } - store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); - } - } -} - -// -// Binary Functors -// -template -struct BinaryOpScalarFunctor { - using opmath_t = at::opmath_type; - template - __device__ __forceinline__ void operator()( - int chunk_size, - TensorListMetadata& tl, - Op op, - opmath_t scalar) { - const int tensor_loc = tl.block_to_tensor[blockIdx.x]; - const int chunk_idx = tl.block_to_chunk[blockIdx.x]; - auto n = tl.numel_for_tensor[tensor_loc]; - - T* args[depth]; - const bool all_aligned = - init_args(args, tl, chunk_idx, chunk_size, tensor_loc); - n -= 
chunk_idx * chunk_size; - T r_args[r_args_depth][kILP]; - - binary_op_scalar( - r_args, args, scalar, n, chunk_size, all_aligned, op); - } -}; - -template -struct BinaryOpScalarListFunctor { - using opmath_t = at::opmath_type; - template - __device__ __forceinline__ void operator()( - int chunk_size, - TensorListScalarListMetadata& tl, - Op op) { - const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; - const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; - auto n = tl.numel_for_tensor[tensor_loc]; - - T* args[depth]; - const bool all_aligned = - init_args(args, tl, chunk_idx, chunk_size, tensor_loc); - opmath_t scalar = tl.scalar_vals[tensor_loc]; - n -= chunk_idx * chunk_size; - T r_args[r_args_depth][kILP]; - - binary_op_scalar( - r_args, args, scalar, n, chunk_size, all_aligned, op); - } -}; - -template -struct BinaryOpListAlphaFunctor { - using opmath_t = at::opmath_type; - template - __device__ __forceinline__ void operator()( - int chunk_size, - TensorListMetadata& tl, - Op op, - opmath_t alpha) { - const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; - const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; - auto n = tl.numel_for_tensor[tensor_loc]; - - T* args[depth]; - const bool all_aligned = - init_args(args, tl, chunk_idx, chunk_size, tensor_loc); - n -= chunk_idx * chunk_size; - T r_args[r_args_depth][kILP]; - - // to make things simple, we put aligned case in a different code path - if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { - for (int64_t i_start = threadIdx.x; - i_start * kILP < n && i_start * kILP < chunk_size; - i_start += blockDim.x) { - // load - load_store(r_args[0], args[0], 0, i_start); - load_store(r_args[1], args[1], 0, i_start); -#pragma unroll - for (int ii = 0; ii < kILP; ii++) { - r_args[0][ii] = static_cast( - op(static_cast(r_args[0][ii]), - alpha * static_cast(r_args[1][ii]))); - } - // store - load_store(args[res_arg_index], r_args[0], i_start, 0); - } - } else { - for (int64_t i_start = 0; i_start < n && i_start < chunk_size; - i_start += blockDim.x * kILP) { - load_args(r_args, args, i_start, chunk_size, n); -#pragma unroll - for (int ii = 0; ii < kILP; ii++) { - r_args[0][ii] = static_cast( - op(static_cast(r_args[0][ii]), - alpha * static_cast(r_args[1][ii]))); - } - store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); - } - } - } -}; - -template -struct BinaryOpScalarTensorFunctor { - using opmath_t = at::opmath_type; - template - __device__ __forceinline__ void operator()( - int chunk_size, - TensorListMetadata& tl, - Op op, - T* scalar, - opmath_t alpha) { - const int tensor_loc = tl.block_to_tensor[blockIdx.x]; - const int chunk_idx = tl.block_to_chunk[blockIdx.x]; - auto n = tl.numel_for_tensor[tensor_loc]; - - T* args[depth]; - const bool all_aligned = - init_args(args, tl, chunk_idx, chunk_size, tensor_loc); - n -= chunk_idx * chunk_size; - T r_args[r_args_depth][kILP]; - - // to make things simple, we put aligned case in a different code path - if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { - for (int64_t i_start = threadIdx.x; - i_start * kILP < n && i_start * kILP < chunk_size; - i_start += blockDim.x) { - // load - load_store(r_args[0], args[0], 0, i_start); -#pragma unroll - for (int ii = 0; ii < kILP; ii++) { - r_args[0][ii] = static_cast(op( - static_cast(r_args[0][ii]), - static_cast(alpha) * static_cast(*scalar))); - } - // store - load_store(args[res_arg_index], r_args[0], i_start, 0); - } - } else { - for (int64_t i_start = 0; i_start < n && i_start < chunk_size; - i_start += 
blockDim.x * kILP) { - // Regardless if depth is 1 (for inplace) or 2 (for out of place), - // r_args has depth 1 - load_args<1>(r_args, args, i_start, chunk_size, n); -#pragma unroll - for (int ii = 0; ii < kILP; ii++) { - r_args[0][ii] = static_cast(op( - static_cast(r_args[0][ii]), - static_cast(alpha) * static_cast(*scalar))); - } - store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); - } - } - } -}; - -// -// Unary Functors -// - -template -struct ZeroFunctor { - __device__ __forceinline__ void operator()( - int chunk_size, - TensorListMetadata<1>& tl) { - const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; - const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; - auto n = tl.numel_for_tensor[tensor_loc]; - - T* args[depth]; - const auto all_aligned = - init_args(args, tl, chunk_idx, chunk_size, tensor_loc); - n -= chunk_idx * chunk_size; - T r_args[r_args_depth][kILP]; - - // to make things simple, we put aligned case in a different code path - if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { - for (int64_t i_start = threadIdx.x; - i_start * kILP < n && i_start * kILP < chunk_size; - i_start += blockDim.x) { -#pragma unroll - for (int ii = 0; ii < kILP; ii++) { - r_args[0][ii] = 0; - } - // store - load_store(args[0], r_args[0], i_start, 0); - } - } else { - for (int64_t i_start = 0; i_start < n && i_start < chunk_size; - i_start += blockDim.x * kILP) { -#pragma unroll - for (int ii = 0; ii < kILP; ii++) { - r_args[0][ii] = 0; - } - store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); - } - } - } -}; - -template -struct UnaryOpFunctor { - using opmath_t = at::opmath_type; - template - __device__ __forceinline__ void operator()( - int chunk_size, - TensorListMetadata& tl, - Op op) { - const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; - const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; - auto n = tl.numel_for_tensor[tensor_loc]; - - T* args[depth]; - bool all_aligned = - init_args(args, tl, chunk_idx, chunk_size, tensor_loc); - n -= chunk_idx * chunk_size; - T r_args[r_args_depth][kILP]; - - // to make things simple, we put aligned case in a different code path - if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { - for (int64_t i_start = threadIdx.x; - i_start * kILP < n && i_start * kILP < chunk_size; - i_start += blockDim.x) { - // load - load_store(r_args[0], args[0], 0, i_start); -#pragma unroll - for (int ii = 0; ii < kILP; ii++) { - r_args[0][ii] = - static_cast(op(static_cast(r_args[0][ii]))); - } - // store - load_store(args[res_arg_index], r_args[0], i_start, 0); - } - } else { - for (int64_t i_start = 0; i_start < n && i_start < chunk_size; - i_start += blockDim.x * kILP) { - load_args(r_args, args, i_start, chunk_size, n); -#pragma unroll - for (int ii = 0; ii < kILP; ii++) { - r_args[0][ii] = - static_cast(op(static_cast(r_args[0][ii]))); - } - store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); - } - } - } -}; - -// -// Pointwise Functors -// - -template -struct PointwiseOpScalarFunctor { - using opmath_t = at::opmath_type; - template - __device__ __forceinline__ void operator()( - int chunk_size, - TensorListMetadata& tl, - Op op, - opmath_t scalar) { - const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; - const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; - auto n = tl.numel_for_tensor[tensor_loc]; - - T* args[depth]; - const bool all_aligned = - init_args(args, tl, chunk_idx, chunk_size, tensor_loc); - n -= chunk_idx * chunk_size; - T r_args[r_args_depth][kILP]; - - 
pointwise_op_scalar( - r_args, args, scalar, n, chunk_size, all_aligned, op); - } -}; - -template -struct PointwiseOpScalarListFunctor { - using opmath_t = at::opmath_type; - template - __device__ __forceinline__ void operator()( - int chunk_size, - TensorListScalarListMetadata& tl, - Op op) { - const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; - const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; - auto n = tl.numel_for_tensor[tensor_loc]; - - T* args[depth]; - const bool all_aligned = - init_args(args, tl, chunk_idx, chunk_size, tensor_loc); - opmath_t scalar = tl.scalar_vals[tensor_loc]; - n -= chunk_idx * chunk_size; - T r_args[r_args_depth][kILP]; - - pointwise_op_scalar( - r_args, args, scalar, n, chunk_size, all_aligned, op); - } -}; - -template -struct PointwiseOpListFunctor { - using opmath_t = at::opmath_type; - template - __device__ __forceinline__ void operator()( - int chunk_size, - TensorListMetadata& tl, - Op op) { - const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; - const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; - auto n = tl.numel_for_tensor[tensor_loc]; - - T* args[depth]; - const bool all_aligned = - init_args(args, tl, chunk_idx, chunk_size, tensor_loc); - n -= chunk_idx * chunk_size; - T r_args[depth - 1][kILP]; - - // to make things simple, we put aligned case in a different code path - if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { - for (int64_t i_start = threadIdx.x; - i_start * kILP < n && i_start * kILP < chunk_size; - i_start += blockDim.x) { - // load - load_store(r_args[0], args[0], 0, i_start); - load_store(r_args[1], args[1], 0, i_start); -#pragma unroll - for (int ii = 0; ii < kILP; ii++) { - r_args[0][ii] = static_cast( - op(static_cast(r_args[0][ii]), - static_cast(r_args[1][ii]))); - } - // store - load_store(args[2], r_args[0], i_start, 0); - } - } else { - for (int64_t i_start = 0; i_start < n && i_start < chunk_size; - i_start += blockDim.x * kILP) { - load_args(r_args, args, i_start, chunk_size, n); -#pragma unroll - for (int ii = 0; ii < kILP; ii++) { - r_args[0][ii] = static_cast( - op(static_cast(r_args[0][ii]), - static_cast(r_args[1][ii]))); - } - store_args(args[2], r_args[0], i_start, chunk_size, n); - } - } - } -}; - -template -struct TernaryOpListFunctor { - using opmath_t = at::opmath_type; - template - __device__ __forceinline__ void operator()( - int chunk_size, - TensorListMetadata& tl, - Op op) { - static_assert(depth == 3 || depth == 4, ""); - static_assert(depth >= r_args_depth, ""); - static_assert(res_arg_index == depth - 1 || res_arg_index == 0, ""); - const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; - const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; - auto n = tl.numel_for_tensor[tensor_loc]; - - T* args[depth]; - const bool all_aligned = - init_args(args, tl, chunk_idx, chunk_size, tensor_loc); - n -= chunk_idx * chunk_size; - T r_args[r_args_depth][kILP]; - - if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { - for (int64_t i_start = threadIdx.x; - i_start * kILP < n && i_start * kILP < chunk_size; - i_start += blockDim.x) { - load_store(r_args[0], args[0], 0, i_start); - load_store(r_args[1], args[1], 0, i_start); - load_store(r_args[2], args[2], 0, i_start); -#pragma unroll - for (int ii = 0; ii < kILP; ii++) { - r_args[0][ii] = - op(static_cast(r_args[0][ii]), - static_cast(r_args[1][ii]), - static_cast(r_args[2][ii])); - } - load_store(args[res_arg_index], r_args[0], i_start, 0); - } - } else { - for (int64_t i_start = 0; i_start < n && i_start < chunk_size; - 
i_start += blockDim.x * kILP) { - load_args(r_args, args, i_start, chunk_size, n); -#pragma unroll - for (int ii = 0; ii < kILP; ii++) { - r_args[0][ii] = - op(static_cast(r_args[0][ii]), - static_cast(r_args[1][ii]), - static_cast(r_args[2][ii])); - } - store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); - } - } - } -}; - -template -struct TernaryOpScalarFunctor { - using opmath_t = at::opmath_type; - template - __device__ __forceinline__ void operator()( - int chunk_size, - TensorListMetadata& tl, - Op op, - opmath_t alpha) { - static_assert(depth == 2 || depth == 3, ""); - static_assert(depth >= r_args_depth, ""); - static_assert(res_arg_index == depth - 1 || res_arg_index == 0, ""); - const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; - const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; - auto n = tl.numel_for_tensor[tensor_loc]; - - T* args[depth]; - const bool all_aligned = - init_args(args, tl, chunk_idx, chunk_size, tensor_loc); - n -= chunk_idx * chunk_size; - T r_args[r_args_depth][kILP]; - - // to make things simple, we put aligned case in a different code path - if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { - for (int64_t i_start = threadIdx.x; - i_start * kILP < n && i_start * kILP < chunk_size; - i_start += blockDim.x) { - // load - load_store(r_args[0], args[0], 0, i_start); - load_store(r_args[1], args[1], 0, i_start); -#pragma unroll - for (int ii = 0; ii < kILP; ii++) { - r_args[0][ii] = - op(static_cast(r_args[0][ii]), - static_cast(r_args[1][ii]), - alpha); - } - // store - load_store(args[res_arg_index], r_args[0], i_start, 0); - } - } else { - for (int64_t i_start = 0; i_start < n && i_start < chunk_size; - i_start += blockDim.x * kILP) { - load_args(r_args, args, i_start, chunk_size, n); -#pragma unroll - for (int ii = 0; ii < kILP; ii++) { - r_args[0][ii] = - op(static_cast(r_args[0][ii]), - static_cast(r_args[1][ii]), - alpha); - } - store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); - } - } - } -}; - -template -struct power_functor { - C10_DEVICE T operator()(const T& a, const T& b) const { - return at::native::pow_(a, b); - } -}; - -template -struct reverse_power_functor { - C10_DEVICE T operator()(const T& a, const T& b) const { - return at::native::pow_(b, a); - } -}; - -} // namespace -} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/MiscUtils.h b/aten/src/ATen/native/zoom/MiscUtils.h deleted file mode 100644 index 257c488bd7e98e..00000000000000 --- a/aten/src/ATen/native/zoom/MiscUtils.h +++ /dev/null @@ -1,32 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
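The foreach functors deleted above all follow the same pattern: each thread handles kILP elements at a time, and the 128-bit load/store fast path is taken only when every pointer is suitably aligned; otherwise a scalar tail path runs. A standalone host-side sketch of that pattern in plain C++, not HIP (the names Vec, is_aligned_for_vec and copy_ilp are illustrative, not from the patch):

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

constexpr int kILP = 4;                 // elements handled per thread per step

template <typename T>
struct alignas(kILP * sizeof(T)) Vec {  // 16 bytes for T = float
  T val[kILP];
};

template <typename T>
bool is_aligned_for_vec(const T* p) {
  // Mirrors is_aligned(): the pointer must be a multiple of kILP * sizeof(T).
  return reinterpret_cast<std::uintptr_t>(p) % (kILP * sizeof(T)) == 0;
}

template <typename T>
void copy_ilp(T* dst, const T* src, std::size_t n) {
  std::size_t i = 0;
  if (is_aligned_for_vec(dst) && is_aligned_for_vec(src) && n % kILP == 0) {
    // Fast path: move kILP elements at a time, as load_store() does with
    // aligned_vector on the GPU.
    for (; i < n; i += kILP) {
      Vec<T> v;
      std::memcpy(&v, src + i, sizeof(v));
      std::memcpy(dst + i, &v, sizeof(v));
    }
  } else {
    for (; i < n; ++i) dst[i] = src[i];  // scalar fallback
  }
}

int main() {
  std::vector<float> a(16, 1.5f), b(16, 0.f);
  copy_ilp(b.data(), a.data(), a.size());
  std::cout << b[7] << "\n";  // prints 1.5
}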
-#pragma once -#include -#include -#include - -namespace at { -namespace native { - -static inline int zoom_int_cast(int64_t value, const char* varname) { - auto result = static_cast(value); - TORCH_CHECK(static_cast(result) == value, - "zoom_int_cast: The value of ", varname, "(", (long long)value, - ") is too large to fit into a int (", sizeof(int), " bytes)"); - return result; -} - -// Creates an array of size elements of type T, backed by pinned memory -// wrapped in a Storage -template -static inline Storage pin_memory(int64_t size) { - auto* allocator = zoom::getPinnedMemoryAllocator(); - int64_t adjusted_size = size * sizeof(T); - return Storage( - Storage::use_byte_size_t(), - adjusted_size, - allocator, - /*resizable=*/false); -} - -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/zoom/MultiTensorApply.cuh b/aten/src/ATen/native/zoom/MultiTensorApply.cuh deleted file mode 100644 index 9efa863f49ceaf..00000000000000 --- a/aten/src/ATen/native/zoom/MultiTensorApply.cuh +++ /dev/null @@ -1,379 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include -#include - -namespace at::native { - -namespace { - -static constexpr int64_t kILP = 4; -static constexpr int64_t kChunkSize = 65536; -static constexpr int64_t kBlockSize = 512; - -// TODO(crcrpar): Add `n>5` for `low prec params & their higher prec copy` -// TensorListMetadata has to be < 4KB - the limit for kernel launch argument -static constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30}; -static constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320}; -static constexpr int depth_to_max_tensors_scalarlist[5] = {96, 64, 48, 36, 30}; -static constexpr int depth_to_max_tensors_scalarlist_of_complex_double[2] = { - 72, - 60}; - -template -__device__ __forceinline__ bool is_aligned(T* p) { - return ((uint64_t)p) % (kILP * sizeof(T)) == 0; -} - -template -__device__ __forceinline__ void load_store( - T* dst, - T* src, - int64_t dst_offset, - int64_t src_offset) { - using LT = at::native::memory::aligned_vector; - ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset]; -} - -template -struct TensorListMetadata { - const void* addresses[n][depth_to_max_tensors[n - 1]]; - int64_t numel_for_tensor[depth_to_max_tensors[n - 1]]; - unsigned char block_to_tensor[depth_to_max_blocks[n - 1]]; - int block_to_chunk[depth_to_max_blocks[n - 1]]; - int start_tensor_this_launch; -}; - -template -struct TensorListScalarListMetadata { - const void* addresses[n][depth_to_max_tensors_scalarlist[n - 1]]; - int64_t numel_for_tensor[depth_to_max_tensors_scalarlist[n - 1]]; - scalar_vals_t scalar_vals[depth_to_max_tensors_scalarlist[n - 1]]; - unsigned char block_to_tensor[depth_to_max_blocks[n - 1]]; - int block_to_chunk[depth_to_max_blocks[n - 1]]; -}; - -// note(mkozuki): `n` of 1&2 violate the limit of cuda kernel argument size of -// 4kb with `c10::complex` -template <> -struct TensorListScalarListMetadata, 1> { - const void* addresses[1] - [depth_to_max_tensors_scalarlist_of_complex_double[0]]; - int64_t - numel_for_tensor[depth_to_max_tensors_scalarlist_of_complex_double[0]]; - c10::complex - scalar_vals[depth_to_max_tensors_scalarlist_of_complex_double[0]]; - unsigned char block_to_tensor[depth_to_max_blocks[1 - 1]]; - int block_to_chunk[depth_to_max_blocks[1 - 1]]; -}; - -template <> -struct TensorListScalarListMetadata, 2> { - const void* addresses[2] - [depth_to_max_tensors_scalarlist_of_complex_double[1]]; - int64_t - numel_for_tensor[depth_to_max_tensors_scalarlist_of_complex_double[1]]; - 
c10::complex - scalar_vals[depth_to_max_tensors_scalarlist_of_complex_double[1]]; - unsigned char block_to_tensor[depth_to_max_blocks[2 - 1]]; - int block_to_chunk[depth_to_max_blocks[2 - 1]]; -}; - -// NOTE(crcrpar): This is a conservative resolution to handle `state_steps` -// whose each element is `at::Tensor` of 1 element representing the number of -// `step`s called so far. -template -struct FusedOptimizerTensorListMetadata { - const void* addresses[n][depth_to_max_tensors[n - 1]]; - int64_t numel_for_tensor[depth_to_max_tensors[n - 1]]; - const void* state_steps_addresses[depth_to_max_tensors_scalarlist[n - 1]]; - unsigned char block_to_tensor[depth_to_max_blocks[n - 1]]; - int block_to_chunk[depth_to_max_blocks[n - 1]]; - int start_tensor_this_launch; -}; - -template -C10_LAUNCH_BOUNDS_1(kBlockSize) -__global__ void multi_tensor_apply_kernel( - T tensorListMeta, - U callable, - ArgTypes... args) { - // Hand the chunk information to the user-supplied functor to process however - // it likes. - callable(kChunkSize, tensorListMeta, args...); -} - -} // namespace - -// multi_tensor_apply enables horizontal fusion across lists of tensors. -// For example, whereas you once had a for-loop of a + b = c, where a, b, -// and c are individual tensors in lists as, bs, and cs, you can now with -// fewer kernel launches compute as + bs = cs. -// -// You can also imagine bs to be a scalar list vs a tensor list. -// -// The function below takes in tensor lists, scalars, and a callable and -// chunks up the computation to launch as few kernels as possible by iterating -// through every "chunk" in every tensor (thus the nested for loops). In the -// simplest case, everything gets bundled into just one kernel launch, but -// due to blocksize constraints, we may need to launch multiple kernels. -// Each kernel launch is defined by one tensorListMeta construct, which we -// use to track and reset the necessary metadata for each launch. -template -void multi_tensor_apply( - std::vector>& tensor_lists, - at::ArrayRef scalars, - T callable, - ArgTypes... args) { - TORCH_CHECK( - tensor_lists.size() == depth, - "Number of tensor lists has to match the depth."); - const size_t n_tensors = tensor_lists[0].size(); - using scalar_vals_t = typename T::opmath_t; - TensorListScalarListMetadata tensorListMeta; - - int loc_block_info = 0; - int loc_tensor_info = 0; - for (size_t t = 0; t < n_tensors; t++) { - // short-circuit to avoid adding empty tensors to tensorListMeta - if (tensor_lists[0][t].numel() == 0) { - continue; - } - tensorListMeta.scalar_vals[loc_tensor_info] = scalars[t].to(); - tensorListMeta.numel_for_tensor[loc_tensor_info] = - tensor_lists[0][t].numel(); - for (int d = 0; d < depth; d++) { - tensorListMeta.addresses[d][loc_tensor_info] = - tensor_lists[d][t].const_data_ptr(); - } - loc_tensor_info++; - - // now we enter [chunking territory]. - // we will launch a kernel when EITHER the blocks get filled up OR - // the tensors get filled up. There will always be at least one block - // per tensor since the zero-sized ones will not enter the loop, so - // the nested forloop within represents iterating through the chunks - // of a single tensor. 
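A host-side sketch of the chunking bookkeeping described in the comment above, in plain C++: every non-empty tensor is cut into kChunkSize-element chunks, each chunk becomes one block's worth of work, and a launch is flushed whenever the block table or the tensor table fills up. kMaxTensors, kMaxBlocks, Meta and flush() are illustrative stand-ins for the metadata struct and kernel launch in the deleted header:

#include <cstdint>
#include <iostream>
#include <vector>

constexpr int64_t kChunkSize  = 65536;
constexpr int     kMaxTensors = 110;
constexpr int     kMaxBlocks  = 320;

struct Meta {
  std::vector<int> block_to_tensor;  // which tensor a block works on
  std::vector<int> block_to_chunk;   // which chunk of that tensor
};

void flush(Meta& m, int& launches) {
  ++launches;                        // a real kernel launch would happen here
  m.block_to_tensor.clear();
  m.block_to_chunk.clear();
}

int main() {
  std::vector<int64_t> numels = {1 << 20, 300, 0, 5 * kChunkSize + 7};
  Meta meta;
  int launches = 0, tensors_in_meta = 0;

  for (int64_t n : numels) {
    if (n == 0) continue;            // empty tensors never enter the tables
    ++tensors_in_meta;
    const int64_t chunks = (n + kChunkSize - 1) / kChunkSize;
    for (int64_t c = 0; c < chunks; ++c) {
      meta.block_to_tensor.push_back(tensors_in_meta - 1);
      meta.block_to_chunk.push_back(static_cast<int>(c));
      const bool tensors_full =
          tensors_in_meta == kMaxTensors && c == chunks - 1;
      const bool blocks_full =
          static_cast<int>(meta.block_to_chunk.size()) == kMaxBlocks;
      if (tensors_full || blocks_full) {
        flush(meta, launches);
        // If the tensor still has chunks left, carry it into the next launch.
        tensors_in_meta = (c == chunks - 1) ? 0 : 1;
      }
    }
  }
  if (!meta.block_to_chunk.empty()) flush(meta, launches);  // finish leftovers
  std::cout << "launches: " << launches << "\n";
}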
- const auto numel = tensor_lists[0][t].numel(); - const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0); - for (auto chunk = 0; chunk < chunks; chunk++) { - tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1; - tensorListMeta.block_to_chunk[loc_block_info] = chunk; - loc_block_info++; - - // a tensor is not considered full unless all its chunks have been - // processed - const bool tensors_full = - (loc_tensor_info == depth_to_max_tensors_scalarlist[depth - 1] && - chunk == chunks - 1); - const bool blocks_full = - (loc_block_info == depth_to_max_blocks[depth - 1]); - - if (tensors_full || blocks_full) { - multi_tensor_apply_kernel<<< - loc_block_info, - kBlockSize, - 0, - c10::zoom::getCurrentZoomStream()>>>( - tensorListMeta, callable, args...); - C10_ZOOM_KERNEL_LAUNCH_CHECK(); - - // Reset. - loc_block_info = 0; - // all chunks have already been handled in the kernel - if (chunk == chunks - 1) { - loc_tensor_info = 0; - } else { // blocks were full and tensor chunks remain - tensorListMeta.numel_for_tensor[0] = - tensorListMeta.numel_for_tensor[loc_tensor_info - 1]; - tensorListMeta.scalar_vals[0] = - tensorListMeta.scalar_vals[loc_tensor_info - 1]; - for (int d = 0; d < depth; d++) { - tensorListMeta.addresses[d][0] = - tensorListMeta.addresses[d][loc_tensor_info - 1]; - } - loc_tensor_info = 1; - } - } - } - } - - // note: [finishing what we started] - // if there's remaining work to be done but the tensors/blocks aren't full - // yet we are at the end, submit the kernel to do the work! - if (loc_block_info != 0) { - multi_tensor_apply_kernel<<< - loc_block_info, - kBlockSize, - 0, - c10::zoom::getCurrentZoomStream()>>>(tensorListMeta, callable, args...); - C10_ZOOM_KERNEL_LAUNCH_CHECK(); - } -} - -template -void multi_tensor_apply( - std::vector>& tensor_lists, - T callable, - ArgTypes... args) { - TORCH_CHECK( - tensor_lists.size() == depth, - "Number of tensor lists has to match the depth."); - const size_t n_tensors = tensor_lists[0].size(); - TensorListMetadata tensorListMeta; - tensorListMeta.start_tensor_this_launch = 0; - - int loc_block_info = 0; - int loc_tensor_info = 0; - for (size_t t = 0; t < n_tensors; t++) { - // short-circuit to avoid adding empty tensors to tensorListMeta - if (tensor_lists[0][t].numel() == 0) { - continue; - } - tensorListMeta.numel_for_tensor[loc_tensor_info] = - tensor_lists[0][t].numel(); - for (int d = 0; d < depth; d++) { - tensorListMeta.addresses[d][loc_tensor_info] = - tensor_lists[d][t].const_data_ptr(); - } - loc_tensor_info++; - - // see note: [chunking territory]. - const auto numel = tensor_lists[0][t].numel(); - const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0); - for (auto chunk = 0; chunk < chunks; chunk++) { - tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1; - tensorListMeta.block_to_chunk[loc_block_info] = chunk; - loc_block_info++; - - const bool tensors_full = - (loc_tensor_info == depth_to_max_tensors[depth - 1] && - chunk == chunks - 1); - const bool blocks_full = - (loc_block_info == depth_to_max_blocks[depth - 1]); - - if (tensors_full || blocks_full) { - multi_tensor_apply_kernel<<< - loc_block_info, - kBlockSize, - 0, - c10::zoom::getCurrentZoomStream()>>>( - tensorListMeta, callable, args...); - C10_ZOOM_KERNEL_LAUNCH_CHECK(); - - // Reset. 
- loc_block_info = 0; - if (chunk == chunks - 1) { - loc_tensor_info = 0; - tensorListMeta.start_tensor_this_launch = t + 1; - } else { - tensorListMeta.numel_for_tensor[0] = - tensorListMeta.numel_for_tensor[loc_tensor_info - 1]; - for (int d = 0; d < depth; d++) { - tensorListMeta.addresses[d][0] = - tensorListMeta.addresses[d][loc_tensor_info - 1]; - } - loc_tensor_info = 1; - tensorListMeta.start_tensor_this_launch = t; - } - } - } - } - - // see note: [finishing what we started] - if (loc_block_info != 0) { - multi_tensor_apply_kernel<<< - loc_block_info, - kBlockSize, - 0, - c10::zoom::getCurrentZoomStream()>>>(tensorListMeta, callable, args...); - C10_ZOOM_KERNEL_LAUNCH_CHECK(); - } -} - -template -void multi_tensor_apply_for_fused_optimizer( - std::vector>& tensor_lists, - at::TensorList state_steps, - T callable, - ArgTypes... args) { - TORCH_CHECK( - tensor_lists.size() == depth, - "Number of tensor lists has to match the depth"); - const auto num_tensors = tensor_lists[0].size(); - FusedOptimizerTensorListMetadata tensorListMeta; - - int loc_block_info = 0; - int loc_tensor_info = 0; - for (const auto& tensor_index : c10::irange(num_tensors)) { - // short-circuit to avoid adding empty tensors to tensorListMeta - if (tensor_lists[0][tensor_index].numel() == 0) { - continue; - } - tensorListMeta.state_steps_addresses[loc_tensor_info] = - state_steps[tensor_index].const_data_ptr(); - tensorListMeta.numel_for_tensor[loc_tensor_info] = - tensor_lists[0][tensor_index].numel(); - for (const auto& d : c10::irange(depth)) { - tensorListMeta.addresses[d][loc_tensor_info] = - tensor_lists[d][tensor_index].const_data_ptr(); - } - loc_tensor_info++; - - // see above note: [chunking territory] - const auto numel = tensor_lists[0][tensor_index].numel(); - const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0); - TORCH_CHECK(chunks > -1); - for (const auto& chunk : c10::irange(chunks)) { - tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1; - tensorListMeta.block_to_chunk[loc_block_info] = chunk; - loc_block_info++; - - const auto tensor_full = - (loc_tensor_info == depth_to_max_tensors[depth - 1] && - chunk == chunks - 1); - const auto blocks_full = loc_block_info == depth_to_max_blocks[depth - 1]; - - if (tensor_full || blocks_full) { - multi_tensor_apply_kernel<<< - loc_block_info, - kBlockSize, - 0, - c10::zoom::getCurrentZoomStream()>>>( - tensorListMeta, callable, args...); - C10_ZOOM_KERNEL_LAUNCH_CHECK(); - - // Reset. 
- loc_block_info = 0; - if (chunk == chunks - 1) { - loc_tensor_info = 0; - } else { - tensorListMeta.numel_for_tensor[0] = - tensorListMeta.numel_for_tensor[loc_tensor_info - 1]; - tensorListMeta.state_steps_addresses[0] = - tensorListMeta.state_steps_addresses[loc_tensor_info - 1]; - for (const auto& d : c10::irange(depth)) { - tensorListMeta.addresses[d][0] = - tensorListMeta.addresses[d][loc_tensor_info - 1]; - } - loc_tensor_info = 1; - } - } - } - } - - // see above note: [finishing what we've started] - if (loc_block_info != 0) { - multi_tensor_apply_kernel<<< - loc_block_info, - kBlockSize, - 0, - c10::zoom::getCurrentZoomStream()>>>(tensorListMeta, callable, args...); - C10_ZOOM_KERNEL_LAUNCH_CHECK(); - } -} - -} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/Nonzero.cu b/aten/src/ATen/native/zoom/Nonzero.cu deleted file mode 100644 index d735795bcc1720..00000000000000 --- a/aten/src/ATen/native/zoom/Nonzero.cu +++ /dev/null @@ -1,130 +0,0 @@ -#define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include -#include -#include -#include -#include -#include -#include //for MAX_DIMS -#include - -#ifndef AT_PER_OPERATOR_HEADERS -#include -#else -#include -#include -#endif - - -namespace at::native { - -namespace{ -template -struct NonZeroOp -{ - __host__ __device__ __forceinline__ bool operator()(const T& a) const { - return (a!=T(0)); - } -}; - -//TODO: actually support int64_t index_t -template -struct TensorDims { - index_t sizes[MAX_DIMS]; -}; - -template -__global__ void write_indices( - int64_t* inp, - TensorDims dims, - int ndim, - index_t n) { - auto index = threadIdx.x + blockIdx.x * blockDim.x; - if (index < n) { - index_t div = 1; - int64_t idx_flat = inp[index]; -#pragma unroll - for (int dim = MAX_DIMS; dim >= 0; dim--) { - if (dim > ndim - 1) - continue; - auto dim_size = dims.sizes[dim]; - inp[index + dim * n] = (idx_flat / div) % dim_size; - div *= dim_size; - } - } -} - -} //anonymous namespace - -template -void nonzero_zoom_out_impl(const Tensor& self, Tensor& out){ - Tensor self_ = self.contiguous(); - int N = self_.numel(); - const hipStream_t stream = c10::zoom::getCurrentZoomStream(); -// compute number of nonzero elements - size_t temp_storage_bytes=0; - auto& allocator = *c10::zoom::ZoomCachingAllocator::get(); - auto num_nonzeros = allocator.allocate(sizeof(int)); - hipcub::TransformInputIterator, const scalar_t*> itr(self_.const_data_ptr(), NonZeroOp()); - hipcub::DeviceReduce::Sum(nullptr, temp_storage_bytes, itr, (int*)num_nonzeros.get(), N, stream); - auto temp_storage = allocator.allocate(temp_storage_bytes); - hipcub::DeviceReduce::Sum(temp_storage.get(), temp_storage_bytes, itr, (int*)num_nonzeros.get(), N, stream); - int num_nonzeros_h; - c10::zoom::memcpy_and_sync(&num_nonzeros_h, num_nonzeros.get(), sizeof(int), hipMemcpyDeviceToHost, stream); - //expected output size is num_nonzeros x ndim - //we are producing output with size {num_nonzeros, ndim} and strides {1, num_nonzeros} (that is, transposed ndim x num_nonzeros output) - //we are able to directly use passed output with this size and strides, and we can also (per contract) - //resize passed output with incorrect sizes anyway we want. - //However, out with correct sizes and incorrect strides will have to be copied to from the intermediate we've produced. - bool need_to_copy = out.dim() == 2 && out.sizes()[0] == num_nonzeros_h && out.sizes()[1] == self.dim() && !out.t().is_contiguous(); - at::Tensor out_temp = need_to_copy ? 
- Tensor(at::detail::empty_zoom({self.dim(), num_nonzeros_h}, out.options())) : - out.resize_({self.dim(), num_nonzeros_h}); - //Scalars are expected to produce output of size (1,0), so we can't write to it - if (self.dim() > 0) { - hipcub::CountingInputIterator counting_itr(0); - temp_storage_bytes = 0; - hipcub::DeviceSelect::Flagged(nullptr, temp_storage_bytes, counting_itr, itr, - out_temp.mutable_data_ptr(), (int*)num_nonzeros.get(), N, stream); - temp_storage = allocator.allocate(temp_storage_bytes); - hipcub::DeviceSelect::Flagged(temp_storage.get(), temp_storage_bytes, counting_itr, itr, - out_temp.mutable_data_ptr(), (int*)num_nonzeros.get(), N, stream); - if (num_nonzeros_h > 0 && self.dim() > 1){ - TensorDims dims; - for (int i=0; i>>(out_temp.mutable_data_ptr(), - dims, self.dim(), num_nonzeros_h); - C10_ZOOM_KERNEL_LAUNCH_CHECK(); - } - } - if (need_to_copy) { - out.copy_(out_temp.t()); - } else { - //transpose out so it is correct size - Tensor out_ = out_temp.t(); - out.set_(out_); - } -} - -Tensor& nonzero_out_zoom(const Tensor& self, Tensor& out){ - TORCH_CHECK(self.numel() < std::numeric_limits::max(), "nonzero is not supported for tensors with more than INT_MAX elements, \ - See https://github.com/pytorch/pytorch/issues/51871"); - TORCH_CHECK(out.dtype() == at::kLong, "Expected object of scalar type ", at::kLong, " as out, but got ", out.dtype()); - TORCH_CHECK(self.device() == out.device(), "expected self and out to be on the same device, but got out on ", - out.device(), " and self on ", self.device()); - TORCH_CHECK(self.dim() <= MAX_DIMS, "nonzero is not supported for tensor with more than ", MAX_DIMS, " dimensions"); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(at::ScalarType::ComplexHalf, at::ScalarType::Bool, at::ScalarType::BFloat16, at::ScalarType::Half, - self.scalar_type(), "nonzero_zoom", - [&] {nonzero_zoom_out_impl(self, out);}); - return out; -} - -Tensor nonzero_zoom(const Tensor& self){ - Tensor out = at::detail::empty_zoom({0}, self.options().dtype(kLong)); - return at::native::nonzero_out_zoom(self, out); -} -} //namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/Pow.cuh b/aten/src/ATen/native/zoom/Pow.cuh deleted file mode 100644 index eee86031f8d932..00000000000000 --- a/aten/src/ATen/native/zoom/Pow.cuh +++ /dev/null @@ -1,58 +0,0 @@ -#pragma once -#include -#include - -namespace at { namespace native { - -namespace { - - -// SFINAE doesn't work well with NVCC under Windows for math functions like pow and sqrt. -// So we need to define the functions with the explicit function signatures. 
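The write_indices kernel in the Nonzero.cu hunk above converts each flat index into per-dimension coordinates with repeated division and modulo, innermost dimension first. A standalone C++ sketch of that conversion (the unravel helper is illustrative, not from the patch):

#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> unravel(int64_t flat, const std::vector<int64_t>& sizes) {
  std::vector<int64_t> coords(sizes.size());
  int64_t div = 1;
  for (int dim = static_cast<int>(sizes.size()) - 1; dim >= 0; --dim) {
    coords[dim] = (flat / div) % sizes[dim];  // peel off the innermost dim first
    div *= sizes[dim];
  }
  return coords;
}

int main() {
  // For a contiguous 2x3x4 tensor, flat index 17 maps to (1, 1, 1).
  for (int64_t c : unravel(17, {2, 3, 4})) std::cout << c << ' ';
  std::cout << '\n';
}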
-// As for pow, the following signatures are defined as the device function: -// pow(float, int) -// pow(double, int) -// pow(float, float) -// pow(double, double) -#ifdef _MSC_VER -// Functions for pow -// pow for at::Half -static inline __host__ __device__ at::Half pow_(at::Half base, at::Half exp) { - return static_cast(std::pow(static_cast(base), static_cast(exp))); -} -// pow for at::BFloat16 -static inline __host__ __device__ at::BFloat16 pow_(at::BFloat16 base, at::BFloat16 exp) { - return static_cast(std::pow(static_cast(base), static_cast(exp))); -} -// pow (floating, floating/int) -template -static inline __host__ __device__ typename std::enable_if::value && (std::is_same::value || std::is_same::value), Base_type>::type - pow_(Base_type base, Exp_type exp) { - return std::pow(base, exp); -} -// pow (Otherwise) -template -static inline __host__ __device__ typename std::enable_if::value && !std::is_same::value, Base_type>::type - pow_(Base_type base, Exp_type exp) { - return static_cast(std::pow(static_cast(base), static_cast(exp))); -} -#else -template -static inline __host__ __device__ Base_type pow_(Base_type base, Exp_type exp) { - return ::pow(base, exp); -} -#endif - -template -static inline __host__ __device__ std::enable_if_t::value, T> pow_( - T base, T exp) { - return at::native::powi(base, exp); -} - -template -static inline __host__ __device__ c10::complex pow_(c10::complex base, c10::complex exp) { - return c10_complex_math::pow(base, exp); -} - -} // namespace -}} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/PowKernel.cu b/aten/src/ATen/native/zoom/PowKernel.cu deleted file mode 100644 index e67e47201687ad..00000000000000 --- a/aten/src/ATen/native/zoom/PowKernel.cu +++ /dev/null @@ -1,209 +0,0 @@ -#define TORCH_ASSERT_NO_OPERATORS -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace at::native { - -// Forward declare some unary kernels -void rsqrt_kernel_zoom(TensorIteratorBase& iter); -void sqrt_kernel_zoom(TensorIteratorBase& iter); -void reciprocal_kernel_zoom(TensorIteratorBase& iter); - -namespace { - -void pow_tensor_scalar_kernel(TensorIteratorBase& iter, const Scalar& exp_scalar); - -template -void pow_scalar_tensor_impl(TensorIteratorBase& iter, scalar_t base) { - gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t exp) -> scalar_t { - return pow_(base, exp); - }); -} - -template -void pow_scalar_tensor_impl(TensorIteratorBase& iter, c10::complex base) { - // For complex, thrust::pow uses the identity - // pow(a, b) = exp(log(a) * b) - const auto fct = std::log(base); - gpu_kernel(iter, [=]GPU_LAMBDA(c10::complex exp) -> c10::complex { - return std::exp(fct * exp); - }); -} - -/* complex support impl */ -CONSTEXPR_EXCEPT_WIN_CUDA char pow_scalar_base_name[] = "pow_scalar_base_kernel"; -template <> -void pow_scalar_tensor_impl(TensorIteratorBase& iter, c10::complex base) { - using scalar_t = c10::complex; - using opmath_t = at::opmath_type; - // For complex, thrust::pow uses the identity - // pow(a, b) = exp(log(a) * b) - const auto fct = std::log(opmath_t{base}); -#if AT_USE_JITERATOR() - static const auto pow_kernel_string = - jiterator_stringify(template T pow_scalar_base_kernel(T exp, T fct) { - return std::exp(fct * exp); - }); - jitted_gpu_kernel( - iter, - pow_kernel_string, - /*scalar_pos=*/at::zoom::jit::BinaryFuncVariant::NoScalar, - /*scalar_val=*/0, - /*extra_args=*/std::make_tuple(fct)); -#else - gpu_kernel(iter, [=] GPU_LAMBDA(scalar_t exp) -> scalar_t { - return 
std::exp(fct * opmath_t{exp}); - }); -#endif -} - -namespace { - -#if AT_USE_JITERATOR() -/* complex support impl */ -CONSTEXPR_EXCEPT_WIN_CUDA char pow_name[] = "pow_kernel"; -static const auto pow_kernel_string = - jiterator_stringify(template T pow_kernel(T base, T exp) { - return std::pow(base, exp); - }); -#endif - -/* complex support impl */ -void pow_chalf_tensor_scalar_impl(TensorIteratorBase& iter, const Scalar& exp_scalar) { - using scalar_t = c10::complex; - using opmath_t = at::opmath_type; - auto exp = exp_scalar.to(); -#if AT_USE_JITERATOR() - jitted_gpu_kernel( - iter, - pow_kernel_string, - /*scalar_pos=*/at::zoom::jit::BinaryFuncVariant::NoScalar, - /*scalar_val=*/0, - /*extra_args=*/std::make_tuple(exp)); -#else - gpu_kernel(iter, [=] GPU_LAMBDA(scalar_t base) -> scalar_t { - return std::pow(opmath_t{base}, exp); - }); -#endif -} - -} // anonymous namespace - -void pow_tensor_tensor_kernel(TensorIteratorBase& iter) { - auto common_dtype = iter.common_dtype(); - if (common_dtype == kComplexHalf) { - using scalar_t = c10::complex; - if (iter.is_cpu_scalar(1)) { - const auto base = iter.scalar_value(1); - iter.remove_operand(1); - pow_scalar_tensor_impl(iter, base); - } else if (iter.is_cpu_scalar(2)) { - const auto exp = iter.scalar_value(2); - iter.remove_operand(2); - pow_chalf_tensor_scalar_impl(iter, exp); - } else { - using opmath_t = at::opmath_type; - TORCH_INTERNAL_ASSERT(!iter.is_cpu_scalar(1) && !iter.is_cpu_scalar(2)); -#if AT_USE_JITERATOR() - jitted_gpu_kernel( - iter, pow_kernel_string); -#else - gpu_kernel(iter, [=] GPU_LAMBDA(scalar_t base, scalar_t exp) -> scalar_t { - using opmath_t = at::opmath_type; - return pow_(opmath_t{base}, opmath_t{exp}); - }); -#endif - } - } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( - kHalf, kBFloat16, iter.common_dtype(), "pow_zoom", [&] { - if (iter.is_cpu_scalar(1)) { - const auto base = iter.scalar_value(1); - iter.remove_operand(1); - pow_scalar_tensor_impl(iter, base); - } else if (iter.is_cpu_scalar(2)) { - const auto exp = iter.scalar_value(2); - iter.remove_operand(2); - pow_tensor_scalar_kernel(iter, exp); - } else { - gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t base, scalar_t exp) -> scalar_t { - return pow_(base, exp); - }); - } - }); - } -} - - -template -void pow_tensor_scalar_kernel_impl(TensorIteratorBase& iter, - Exp_type exp) { - const auto d_exp = static_cast(exp); - // .5 (sqrt), -.5 (rsqrt) and -1 (reciprocal) specializations are handled - // in pow_tensor_scalar_kernel - if (d_exp == 2) { - gpu_kernel(iter, [=]GPU_LAMBDA(Base_type base) -> Base_type { - return base * base; - }); - } else if (d_exp == 3) { - gpu_kernel(iter, [=]GPU_LAMBDA(Base_type base) -> Base_type { - return base * base * base; - }); - } else if (d_exp == -2) { - gpu_kernel(iter, [=]GPU_LAMBDA(Base_type base) -> Base_type { - return 1.0 / (base * base); - }); - } else { - gpu_kernel(iter, [=]GPU_LAMBDA(Base_type base) -> Base_type { - return pow_(base, exp); - }); - } -} - -void pow_tensor_scalar_kernel(TensorIteratorBase& iter, const Scalar& exp_scalar) { - // Dispatch to fast specialization for sqrt, rsqrt and reciprocal - if (!exp_scalar.isComplex()) { - if (exp_scalar.equal(.5)) { - return sqrt_kernel_zoom(iter); - } else if (exp_scalar.equal(-0.5)) { - return rsqrt_kernel_zoom(iter); - } else if (exp_scalar.equal(-1.0)) { - return reciprocal_kernel_zoom(iter); - } - } - if (isComplexType(iter.common_dtype()) || exp_scalar.isComplex()) { - if (iter.common_dtype() == kComplexHalf) { - using scalar_t = c10::complex; - 
pow_chalf_tensor_scalar_impl(iter, exp_scalar); - return; - } - AT_DISPATCH_COMPLEX_TYPES(iter.common_dtype(), "pow_zoom", [&]() { - const auto exp = exp_scalar.to(); - gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t base) -> scalar_t { - return pow_(base, exp); - }); - }); - } else if (isFloatingType(iter.common_dtype()) || exp_scalar.isIntegral(false)) { - AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "pow_zoom", [&]() { - const auto exp = exp_scalar.to(); - pow_tensor_scalar_kernel_impl(iter, exp); - }); - } else { - TORCH_INTERNAL_ASSERT(false, "invalid combination of type in Pow function, common dtype:", iter.common_dtype(), - "exp is integral?", exp_scalar.isIntegral(false)); - } -} - -} // anonymous namespace - -REGISTER_PRIVATEUSE1_DISPATCH(pow_tensor_tensor_stub, &pow_tensor_tensor_kernel); -REGISTER_PRIVATEUSE1_DISPATCH(pow_tensor_scalar_stub, &pow_tensor_scalar_kernel); - -} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/TensorCompare.cu b/aten/src/ATen/native/zoom/TensorCompare.cu deleted file mode 100644 index e92d058c9b7222..00000000000000 --- a/aten/src/ATen/native/zoom/TensorCompare.cu +++ /dev/null @@ -1,133 +0,0 @@ -#define TORCH_ASSERT_NO_OPERATORS -#include -#include -#include -#include -#include -#include - - -namespace at::native { - -namespace { - -void where_kernel_impl(TensorIterator &iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kHalf, kBFloat16, kBool, iter.dtype(), "where_zoom", [&] { - gpu_kernel( - iter, - [=] GPU_LAMBDA (bool cond_val, scalar_t self_val, scalar_t other_val) -> scalar_t { - return cond_val ? self_val : other_val; - }); - }); -} - -void isposinf_kernel_impl(TensorIteratorBase &iter) { - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.input_dtype(), "isposinf_zoom", [&]() { - gpu_kernel( - iter, - [] GPU_LAMBDA (scalar_t a) -> bool { return a == std::numeric_limits::infinity(); } - ); - }); -} - -void isneginf_kernel_impl(TensorIteratorBase &iter) { - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.input_dtype(), "isneginf_zoom", [&]() { - gpu_kernel( - iter, - [] GPU_LAMBDA (scalar_t a) -> bool { return a == -std::numeric_limits::infinity(); } - ); - }); -} - -void clamp_kernel_impl(TensorIteratorBase& iter) { - AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "clamp_zoom", [&] { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t v, scalar_t lower, scalar_t upper) -> scalar_t { - // Propagate nan, which doesn't propagate automatically for ROCm - if (at::_isnan(v)) { - return v; - } if (at::_isnan(lower)) { - return lower; - } if (at::_isnan(upper)) { - return upper; - } else { - return ::min(::max(v, lower), upper); - } - }); - }); -} - -void inline launch_clamp_scalar(TensorIteratorBase& iter, Scalar lim0, Scalar lim1, at::native::detail::ClampLimits minmax){ - AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "clamp_scalar_zoom", [&] { - using opmath_t = at::opmath_type; - auto lim0_val = lim0.to(); - auto lim1_val = lim1.to(); - - gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t v) -> scalar_t { - // Propagate nan, which doesn't propagate automatically for ROCm - if (_isnan(static_cast(v))) { - return v; - } else if (minmax==at::native::detail::ClampLimits::Min){ - return ::max(static_cast(v), lim0_val); - } else if (minmax==at::native::detail::ClampLimits::Max){ - return ::min(static_cast(v), lim0_val); - } else { - return ::min(::max(static_cast(v), lim0_val), lim1_val); - } - }); 
- }); -} - - -void clamp_scalar_kernel_impl(TensorIteratorBase& iter, const Scalar& min, const Scalar& max) { - launch_clamp_scalar(iter, min, max, at::native::detail::ClampLimits::MinMax); -} - -void clamp_min_scalar_kernel_impl(TensorIteratorBase& iter, Scalar min) { - launch_clamp_scalar(iter, min, min, at::native::detail::ClampLimits::Min); -} - -void clamp_max_scalar_kernel_impl(TensorIteratorBase& iter, Scalar max) { - launch_clamp_scalar(iter, max, max, at::native::detail::ClampLimits::Max); -} - -} // anonymous namespace - - -REGISTER_PRIVATEUSE1_DISPATCH(where_kernel, &where_kernel_impl); -REGISTER_PRIVATEUSE1_DISPATCH(isposinf_stub, &isposinf_kernel_impl); -REGISTER_PRIVATEUSE1_DISPATCH(isneginf_stub, &isneginf_kernel_impl); -REGISTER_PRIVATEUSE1_DISPATCH(clamp_stub, &clamp_kernel_impl); -REGISTER_PRIVATEUSE1_DISPATCH(clamp_scalar_stub, &clamp_scalar_kernel_impl); -REGISTER_PRIVATEUSE1_DISPATCH(clamp_min_scalar_stub, &clamp_min_scalar_kernel_impl); -REGISTER_PRIVATEUSE1_DISPATCH(clamp_max_scalar_stub, &clamp_max_scalar_kernel_impl); - -template -__global__ void _assert_async_zoom_kernel(const scalar_t* input) { - ZOOM_KERNEL_ASSERT(input[0] != 0); -} - -__global__ void _assert_async_zoom_kernel(const c10::complex* input) { - ZOOM_KERNEL_ASSERT(input[0] != c10::complex(0, 0)); -} -__global__ void _assert_async_zoom_kernel(const c10::complex* input) { - ZOOM_KERNEL_ASSERT(input[0] != c10::complex(0, 0)); -} - -void _assert_async_zoom(const Tensor& self_tensor) { - const TensorBase &self = get_tensor_base(self_tensor); - auto n = self.numel(); - TORCH_CHECK(n != 0, "Boolean value of Tensor with no values is ambiguous"); - TORCH_CHECK(n < 2, "Boolean value of Tensor with more than one value is ambiguous"); - auto stream = c10::zoom::getCurrentZoomStream(); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "_assert_async_zoom", [&] { - _assert_async_zoom_kernel<<<1, 1, 0, stream>>>(self.const_data_ptr()); - C10_ZOOM_KERNEL_LAUNCH_CHECK(); - }); -} - -// TODO (tmanlaibaatar) Ignore assert msg for now -void _assert_async_msg_zoom(const Tensor& self_tensor, c10::string_view assert_msg) { - _assert_async_zoom(self_tensor); -} - -} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/TensorShape.cu b/aten/src/ATen/native/zoom/TensorShape.cu deleted file mode 100644 index 5fad25d8a76179..00000000000000 --- a/aten/src/ATen/native/zoom/TensorShape.cu +++ /dev/null @@ -1,833 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -#include -#define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include -#include -#include -#include -#include -#include -#include - -#ifndef AT_PER_OPERATOR_HEADERS -#include -#include -#else -#include -#include -#include -#endif - -namespace at::native { - -namespace detail { - -// NOTE [CUDA fast path for split_with_sizes_copy.out] -// split_with_sizes_copy.out for contiguous operands has the following -// properties: -// - Each src split consists of multiple chunks that are separated by a fixed -// stride. The number of chunks and the strides are the same across all src -// splits. -// - Each dst split is the concatenation of the chunks in its corresponding src -// splits. -// - The sizes of chunks vary across splits. -// - A (src, dst) chunk pair is not guaranteed to have the -// same alignment. 
-// -// The following strategies are employed to optimize for this workload: -// - The entire workload is fused into a single kernel to maximize I/O -// throughput and minimize wave quantization. -// - To account for both small and large chunk sizes, a "jagged grid" is used. -// Each chunk is processed by one or more blocks depending on its size. -// - Within each chunk, the region in which writes can be vectorized is -// identified. Within this region, writes are always vectorized and reads are -// oppurtunistically vectorized. -static constexpr int64_t BLOCK_SIZE = 128; -static constexpr int64_t BYTES_PER_THREAD = 16; -static constexpr int64_t BYTES_PER_BLOCK = BYTES_PER_THREAD * BLOCK_SIZE; - -static __host__ __device__ inline int64_t div_up(int64_t a, int64_t b) { - return (a + b - 1) / b; -} - -template -__device__ inline void stream_load128(uint4& val, const T* addr) { - uint64_t low, high; - low = reinterpret_cast(addr)[0]; - high = reinterpret_cast(addr)[1]; - reinterpret_cast(&val)[0] = low; - reinterpret_cast(&val)[1] = high; -} - -template -__device__ inline void stream_store128(T* addr, const uint4& val) { - uint64_t low, high; - low = reinterpret_cast(&val)[0]; - high = reinterpret_cast(&val)[1]; - reinterpret_cast(addr)[0] = low; - reinterpret_cast(addr)[1] = high; -} - -template -static __device__ inline bool is_aligned(const void* addr) { - return reinterpret_cast(addr) % sizeof(T) == 0; -} - -template -static __device__ inline void load128(uint4& val, const char* addr) { - for (size_t i = 0; i < detail::BYTES_PER_THREAD / sizeof(T); ++i) { - reinterpret_cast(&val)[i] = reinterpret_cast(addr)[i]; - } -} - -template <> -__device__ inline void load128(uint4& val, const char* addr) { - stream_load128(val, addr); -} - -static __device__ inline void load128(uint4& val, const char* addr) { - if (is_aligned(addr)) { - load128(val, addr); - } else if (is_aligned(addr)) { - load128(val, addr); - } else if (is_aligned(addr)) { - load128(val, addr); - } else { - load128(val, addr); - } -} - -static __device__ __inline__ void get_aligned_region( - char* ptr, - const int64_t chunk_size, - const int64_t alignment, - int64_t& align_off, - int64_t& aligned_size) { - const int64_t ptr_val = reinterpret_cast(ptr); - align_off = detail::div_up(ptr_val, alignment) * alignment - ptr_val; - aligned_size = (chunk_size - align_off) / alignment * alignment; -} - -static __device__ __inline__ void copy_chunk( - char* dst, - const char* src, - int64_t chunk_size, - int64_t thread_idx, - int64_t num_threads) { - if (chunk_size < num_threads) { - if (thread_idx < chunk_size) { - dst[thread_idx] = src[thread_idx]; - } - return; - } - - // Identify the region in which writes are guaranteed to be 128-bit aligned - int64_t align_off, aligned_size; - get_aligned_region( - dst, chunk_size, detail::BYTES_PER_THREAD, align_off, aligned_size); - - for (int64_t off = align_off + thread_idx * detail::BYTES_PER_THREAD; - off < align_off + aligned_size; - off += num_threads * detail::BYTES_PER_THREAD) { - uint4 val; - // Oppurtunistically vectorize reads - load128(val, &src[off]); - stream_store128(&dst[off], val); - } - - // Handle unaligned regions - if (thread_idx < align_off && thread_idx < chunk_size) { - dst[thread_idx] = src[thread_idx]; - } - if (align_off + aligned_size + thread_idx < chunk_size) { - dst[align_off + aligned_size + thread_idx] = - src[align_off + aligned_size + thread_idx]; - } -} - -static __global__ void split_with_sizes_copy_out_contiguous_no_cast_kernel( - char** dst_base_addrs, - 
char** src_base_addrs, - int64_t* split_chunk_sizes, - int64_t* block_idx_to_split_idx, - int64_t* blocks_cumsums, - int64_t src_stride, - int64_t num_chunks) { - const int64_t split_idx = block_idx_to_split_idx[blockIdx.x]; - const int64_t split_blocks = - blocks_cumsums[split_idx + 1] - blocks_cumsums[split_idx]; - const int64_t split_threads = split_blocks * blockDim.x; - const int64_t split_thread_idx = - (blockIdx.x - blocks_cumsums[split_idx]) * blockDim.x + threadIdx.x; - const int64_t split_chunk_size = split_chunk_sizes[split_idx]; - - char* dst_base_addr = dst_base_addrs[split_idx]; - char* src_base_addr = src_base_addrs[split_idx]; - - for (int64_t i = blockIdx.y; i < num_chunks; i += gridDim.y) { - copy_chunk( - dst_base_addr + i * split_chunk_size, - src_base_addr + i * src_stride, - split_chunk_size, - split_thread_idx, - split_threads); - } -} - -// Calculate the base addr for each split. -static inline std::vector get_split_base_addrs( - const at::Tensor& tensor, - at::IntArrayRef split_sizes, - int64_t dim) { - const auto* data_ptr = static_cast(tensor.const_data_ptr()); - const auto strides = tensor.strides(); - const auto element_sz = tensor.element_size(); - int64_t off = 0; - std::vector split_base_addrs; - split_base_addrs.reserve(split_sizes.size()); - for (const auto& split_size : split_sizes) { - split_base_addrs.push_back(reinterpret_cast(data_ptr + off)); - off += split_size * strides[dim] * element_sz; - } - return split_base_addrs; -} - -static inline std::vector get_dst_addrs(at::TensorList out) { - std::vector addrs; - addrs.reserve(out.size()); - for (const auto& tensor : out) { - addrs.push_back(reinterpret_cast(tensor.data_ptr())); - } - return addrs; -} - -// Calculate the chunk size for each split in bytes. -static inline std::vector get_split_chunk_sizes( - const at::Tensor& tensor, - at::IntArrayRef split_sizes, - int64_t dim) { - const auto stride = tensor.stride(dim); - const auto element_sz = tensor.element_size(); - std::vector split_chunk_sizes; - split_chunk_sizes.reserve(split_sizes.size()); - for (const auto& split_size : split_sizes) { - split_chunk_sizes.push_back(split_size * stride * element_sz); - } - return split_chunk_sizes; -} - -// Calculate the chunk stride in bytes. This is the same for all splits. -static inline int64_t get_chunk_stride(const at::Tensor& tensor, int64_t dim) { - int64_t stride = 1; - for (int64_t d = dim; d < tensor.dim(); ++d) { - stride *= tensor.sizes()[d]; - } - return stride * tensor.element_size(); -} - -// Calculate the number of chunks. This is the same for all splits. -static inline int64_t get_num_chunks(const at::Tensor& tensor, int64_t dim) { - int64_t num_chunks = tensor.numel(); - for (int64_t d = dim; d < tensor.dim(); ++d) { - num_chunks /= tensor.sizes()[d]; - } - return num_chunks; -} - -// Pack multiple std::vector into a single zoom tensor. 
-std::pair> pack_vecs( - std::vector*> vecs, - const at::Device& device) { - int64_t numel = 0; - for (const auto* vec : vecs) { - numel += vec->size(); - } - - auto packed = at::empty( - {numel}, at::TensorOptions().dtype(at::kLong).pinned_memory(true)); - size_t offset = 0; - for (const auto* vec : vecs) { - memcpy( - packed.data_ptr() + offset, - vec->data(), - sizeof(int64_t) * vec->size()); - offset += vec->size(); - } - packed = packed.to(device, /*non_blocking=*/true); - - std::vector ptrs; - ptrs.reserve(vecs.size()); - offset = 0; - for (const auto* vec : vecs) { - ptrs.push_back(packed.data_ptr() + offset); - offset += vec->size(); - } - return std::make_pair(std::move(packed), std::move(ptrs)); -} - -static inline std::vector get_chunk_cat_out_sizes( - IntArrayRef input_tensor_sizes, - int64_t dim, - int64_t num_chunks, - int64_t chunk_size, - int64_t out_element_size) { - std::vector view_sizes = std::vector( - input_tensor_sizes.begin(), input_tensor_sizes.begin() + dim); - view_sizes.insert( - view_sizes.end(), {num_chunks, chunk_size / out_element_size}); - return view_sizes; -} - -// Copy `max_chunk_size` bytes from `src` to `dst` by `num_threads`, and pad -// zero when `src` size (i.e., actual_chunk_size) is less than `max_chunk_size`. -// Assume elements of src and dst have the same data type. -template -__device__ __inline__ void copy_chunk_with_pad( - dst_t* dst_ptr, - src_t* src_ptr, - int64_t max_chunk_size, - int64_t actual_chunk_size, - int64_t thread_idx, - int64_t num_threads) { - // Supports type cast - if (!std::is_same_v) { - const int64_t max_num_elems = max_chunk_size / sizeof(dst_t); - const int64_t actual_num_elems = actual_chunk_size / sizeof(src_t); - int64_t elem_index = thread_idx; - while (elem_index < actual_num_elems) { - dst_ptr[elem_index] = - static_cast_with_inter_type::apply(src_ptr[elem_index]); - elem_index += num_threads; - } - while (elem_index < max_num_elems) { - dst_ptr[elem_index] = static_cast_with_inter_type::apply(0); - elem_index += num_threads; - } - return; - } - char* dst = reinterpret_cast(dst_ptr); - char* src = reinterpret_cast(src_ptr); - // Fast path when the number of threads is larger than the number of bytes to - // be copied (i.e., max_chunk_size). In this case, each thread only copies 1 - // byte. For 0 <= thread_idx < actual_chunk_size, the thread copies data from - // `src`. For actual_chunk_size <= thread_idx < max_chunk_size, the thread set - // the val=0 for padding. - if (max_chunk_size < num_threads) { - char val = static_cast(0); - if (thread_idx < actual_chunk_size) { - val = src[thread_idx]; - } - if (thread_idx < max_chunk_size) { - dst[thread_idx] = val; - } - return; - } - // Split dst array into three parts: - // [dst, dst+align_off), [dst+align_off, dst+align_end), [dst+align_end, - // dst+max_chunk_size) The second part is aligned with BYTES_PER_THREAD(=16 - // bytes) to enable `stream_store128`. - int64_t align_off, aligned_size; - get_aligned_region( - dst, actual_chunk_size, BYTES_PER_THREAD, align_off, aligned_size); - int64_t align_end = align_off + aligned_size; - for (int64_t i = align_off + thread_idx * BYTES_PER_THREAD; i < align_end; - i += num_threads * BYTES_PER_THREAD) { - uint4 val; - if (is_aligned(src + i)) { - stream_load128(val, src + i); - } else { - for (size_t j = 0; j < BYTES_PER_THREAD; ++j) { - reinterpret_cast(&val)[j] = src[i + j]; - } - } - stream_store128(&dst[i], val); - } - // Copy data for the first part of dst array [dst, dst+align_off). 
- // Check `thread_idx -static __global__ void chunk_cat_zoom_kernel( - src_t** src, - dst_t* dst, - int64_t* block_idx_to_tensor_idx, - int64_t* tensor_idx_to_start_tensor_bytes, - int64_t* start_block_idx_per_tensor_chunk, - int64_t* actual_tensor_sizes, - int64_t* pad_tensor_chunk_sizes, - int64_t* num_blocks_per_tensor_chunk, - int64_t slice_size, - int64_t chunk_size, - int64_t dst_to_src_ratio) { - const int64_t slice_idx = blockIdx.z; - const int64_t chunk_idx = blockIdx.y; - const int64_t tensor_idx = block_idx_to_tensor_idx[blockIdx.x]; - const int64_t tile_idx = - blockIdx.x - start_block_idx_per_tensor_chunk[tensor_idx]; - // Number of threads for the `tensor_idx`-th tensor chunk. - const int64_t num_threads = - num_blocks_per_tensor_chunk[tensor_idx] * BLOCK_SIZE; - const int64_t thread_idx = tile_idx * BLOCK_SIZE + threadIdx.x; - char* src_addr = reinterpret_cast(src)[tensor_idx] + - slice_idx * actual_tensor_sizes[tensor_idx] + - chunk_idx * pad_tensor_chunk_sizes[tensor_idx] / dst_to_src_ratio; - char* dst_addr = reinterpret_cast(dst) + slice_idx * slice_size + - chunk_idx * chunk_size + tensor_idx_to_start_tensor_bytes[tensor_idx]; - // Compute the actual number of bytes to copy from src. - const int64_t actual_copy_size = ::min( - pad_tensor_chunk_sizes[tensor_idx] / dst_to_src_ratio, - ::max( - (int64_t)0, - actual_tensor_sizes[tensor_idx] - - chunk_idx * pad_tensor_chunk_sizes[tensor_idx] / - dst_to_src_ratio)); - copy_chunk_with_pad( - reinterpret_cast(dst_addr), - reinterpret_cast(src_addr), - pad_tensor_chunk_sizes[tensor_idx], - actual_copy_size, - thread_idx, - num_threads); -} - -bool all_contiguous(TensorList tensors) { - bool contiguous = true; - for (const auto& t : tensors) { - contiguous &= t.is_non_overlapping_and_dense(); - } - return contiguous; -} - -// Get leading dimensions before `dim`-th dimension. -static inline int64_t get_leading_dim(at::IntArrayRef sizes, int64_t dim) { - int64_t leading_dim = 1; - if (dim > 0) { - leading_dim = c10::multiply_integers(sizes.slice(0, dim)); - } - return leading_dim; -} - -// Get trailing dimensions after `dim`-th dimension and padded size along -// `dim`-th dimension. -static inline std::pair get_pad_size( - at::IntArrayRef sizes, - int64_t dim, - int64_t num_chunks) { - int64_t trailing_numel = 1; - if (sizes.size() > (uint64_t)dim + 1) { - trailing_numel = - c10::multiply_integers(sizes.slice(dim + 1, sizes.size() - dim - 1)); - } - int64_t pad_size_along_dim = - detail::div_up(sizes[dim], num_chunks) * num_chunks; - return std::make_pair(pad_size_along_dim, trailing_numel); -} - -// Get the padded chunk size. -static inline int64_t get_chunk_size( - TensorList tensors, - int64_t dim, - int64_t num_chunks, - int64_t elem_size) { - auto num_tensors = tensors.size(); - int64_t chunk_size = 0; - for (const auto i : c10::irange(num_tensors)) { - auto [pad_size_along_dim, trailing_numel] = - get_pad_size(tensors[i].sizes(), dim, num_chunks); - const int64_t pad_tensor_chunk_size = - pad_size_along_dim * trailing_numel * elem_size / num_chunks; - chunk_size += pad_tensor_chunk_size; - } - return chunk_size; -} - -// Get metadata for chunk_cat. 
-std::tuple< - int64_t, - int64_t, - int64_t, - int64_t, - std::vector, - std::vector, - std::vector, - std::vector, - std::vector, - std::vector, - std::vector> -get_chunk_cat_metadata( - TensorList tensors, - int64_t dim, - int64_t num_chunks, - int64_t dst_elem_size, - int64_t src_elem_size) { - TORCH_CHECK( - dst_elem_size % src_elem_size == 0, - "get_chunk_cat_metadata error: only support dst_elem_size % src_elem_size == 0"); - auto num_tensors = tensors.size(); - int64_t leading_dim = get_leading_dim(tensors[0].sizes(), dim); - std::vector pad_tensor_chunk_sizes; - std::vector num_blocks_per_tensor_chunk; - std::vector start_block_idx_per_tensor_chunk{0}; - std::vector actual_tensor_sizes; - std::vector tensor_idx_to_start_tensor_bytes{0}; - std::vector srcs; - pad_tensor_chunk_sizes.reserve(num_tensors); - num_blocks_per_tensor_chunk.reserve(num_tensors); - start_block_idx_per_tensor_chunk.reserve(num_tensors + 1); - actual_tensor_sizes.reserve(num_tensors); - tensor_idx_to_start_tensor_bytes.reserve(num_tensors + 1); - srcs.reserve(num_tensors); - // block_idx_to_tensor_idx cannot be reserved since the number of blocks is - // data dependent - std::vector block_idx_to_tensor_idx; - // Inline computing `chunk_size` to avoid redundant computation - int64_t chunk_size = 0; - for (const auto i : c10::irange(num_tensors)) { - at::Tensor tensor = tensors[i]; - srcs.push_back(reinterpret_cast(tensor.data_ptr())); - auto sizes = tensor.sizes(); - auto [pad_size_along_dim, trailing_numel] = - get_pad_size(sizes, dim, num_chunks); - const int64_t pad_tensor_chunk_size = - pad_size_along_dim * trailing_numel * dst_elem_size / num_chunks; - pad_tensor_chunk_sizes.push_back(pad_tensor_chunk_size); - chunk_size += pad_tensor_chunk_size; - // Number of blocks required to process this tensor chunk. - const int64_t num_blocks = - detail::div_up(pad_tensor_chunk_size, detail::BYTES_PER_BLOCK); - num_blocks_per_tensor_chunk.push_back(num_blocks); - start_block_idx_per_tensor_chunk.push_back( - start_block_idx_per_tensor_chunk.back() + num_blocks); - block_idx_to_tensor_idx.insert( - block_idx_to_tensor_idx.end(), num_blocks, i); - tensor_idx_to_start_tensor_bytes.push_back( - tensor_idx_to_start_tensor_bytes.back() + pad_tensor_chunk_size); - actual_tensor_sizes.push_back(sizes[dim] * trailing_numel * src_elem_size); - } - const int64_t num_blocks_per_chunk = start_block_idx_per_tensor_chunk.back(); - const int64_t slice_size = num_chunks * chunk_size; - return std::make_tuple( - chunk_size, - leading_dim, - num_blocks_per_chunk, - slice_size, - srcs, - block_idx_to_tensor_idx, - tensor_idx_to_start_tensor_bytes, - start_block_idx_per_tensor_chunk, - actual_tensor_sizes, - pad_tensor_chunk_sizes, - num_blocks_per_tensor_chunk); -} - -// See [CUDA kernel for chunk_cat_cuda] -template -void _chunk_cat_out_zoom_contiguous( - TensorList tensors, - int64_t dim, - int64_t num_chunks, - Tensor& out, - int64_t dst_elem_size, - int64_t src_elem_size) { - const auto device = tensors[0].device(); - // `get_chunk_cat_metadata` must return vectors and `pack_vecs` cannot be - // moved into `get_chunk_cat_metadata`. Otherwise `packed` would point to - // vectors allocated inside `get_chunk_cat_metadata` which become out of local - // scope. 
- auto - [chunk_size, - leading_dim, - num_blocks_per_chunk, - slice_size, - srcs, - block_idx_to_tensor_idx, - tensor_idx_to_start_tensor_bytes, - start_block_idx_per_tensor_chunk, - actual_tensor_sizes, - pad_tensor_chunk_sizes, - num_blocks_per_tensor_chunk] = - get_chunk_cat_metadata( - tensors, dim, num_chunks, dst_elem_size, src_elem_size); - auto packed = pack_vecs( - {&srcs, - &block_idx_to_tensor_idx, - &tensor_idx_to_start_tensor_bytes, - &start_block_idx_per_tensor_chunk, - &actual_tensor_sizes, - &pad_tensor_chunk_sizes, - &num_blocks_per_tensor_chunk}, - device); - std::vector view_sizes = get_chunk_cat_out_sizes( - tensors[0].sizes(), dim, num_chunks, chunk_size, dst_elem_size); - at::native::resize_output(out, view_sizes); - dim3 blocks(num_blocks_per_chunk, num_chunks, leading_dim); - dim3 threads(detail::BLOCK_SIZE, 1, 1); - hipLaunchKernelGGL(( detail::chunk_cat_zoom_kernel), - dim3(blocks), - dim3(threads), - 0, - c10::zoom::getCurrentZoomStream(), - /*srcs=*/reinterpret_cast(packed.second[0]), - reinterpret_cast(out.data_ptr()), - /*block_idx_to_tensor_idx=*/packed.second[1], - /*tensor_idx_to_start_tensor_bytes=*/packed.second[2], - /*start_block_idx_per_tensor_chunk=*/packed.second[3], - /*actual_tensor_sizes=*/packed.second[4], - /*pad_tensor_chunk_sizes=*/packed.second[5], - /*num_blocks_per_tensor_chunk=*/packed.second[6], - slice_size, - chunk_size, - dst_elem_size / src_elem_size); - C10_ZOOM_KERNEL_LAUNCH_CHECK(); -} - -} // namespace detail - -// See [CUDA fast path for split_with_sizes_copy.out] -void split_with_sizes_copy_out_zoom_contiguous_no_cast( - const at::Tensor& self, - at::IntArrayRef split_sizes, - int64_t dim, - at::TensorList out) { - const auto device = self.device(); - const auto src_base_addrs = - detail::get_split_base_addrs(self, split_sizes, dim); - const auto dst_base_addrs = detail::get_dst_addrs(out); - const auto src_stride = detail::get_chunk_stride(self, dim); - const auto split_chunk_sizes = - detail::get_split_chunk_sizes(self, split_sizes, dim); - const auto num_chunks = detail::get_num_chunks(self, dim); - - // Calculate the number of blocks required for the first chunk across all - // splits, assuming each thread only processes BYTES_PER_THREAD bytes. - int64_t num_blocks = 0; - for (const auto& split_chunk_size : split_chunk_sizes) { - num_blocks += detail::div_up( - split_chunk_size, detail::BLOCK_SIZE * detail::BYTES_PER_THREAD); - } - - // Calculate the maximum number of blocks to launch. Only consider - // maxThreadsPerMultiProcessor as a limiting factor as the kernel uses no - // shared memory and little registers. Over-subscribe the SMs to hide I/O - // latency. - const auto num_sms = - at::zoom::getCurrentDeviceProperties()->multiProcessorCount; - const auto max_threads_per_sm = - at::zoom::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor; - const int64_t max_blocks = - num_sms * max_threads_per_sm / detail::BLOCK_SIZE * 2.0; - - // Make each thread process BYTES_PER_THREAD * iter_factor bytes to regulate - // block size. Spread iter_factor evenly between chunks_per_block and - // iters_per_chunk. 
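The grid-sizing arithmetic described in the comments above can be reproduced on the host in a few lines: the raw block count is compared against an occupancy-derived budget, and the resulting oversubscription factor is split between chunks_per_block and iters_per_chunk. A standalone C++ sketch under assumed device numbers (num_sms and max_threads_per_sm are made-up example values, not queried from a device):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

constexpr int64_t BLOCK_SIZE = 128;
constexpr int64_t BYTES_PER_THREAD = 16;

int64_t div_up(int64_t a, int64_t b) { return (a + b - 1) / b; }

int main() {
  std::vector<int64_t> split_chunk_sizes = {1 << 20, 1 << 14, 1 << 22};  // bytes
  const int64_t num_chunks = 64;
  const int64_t num_sms = 108, max_threads_per_sm = 2048;  // example device

  // Blocks needed for one chunk of every split, at BYTES_PER_THREAD per thread.
  int64_t num_blocks = 0;
  for (int64_t sz : split_chunk_sizes)
    num_blocks += div_up(sz, BLOCK_SIZE * BYTES_PER_THREAD);

  // Over-subscribe the SMs 2x to hide I/O latency.
  const int64_t max_blocks = num_sms * max_threads_per_sm / BLOCK_SIZE * 2;

  // Each thread now handles BYTES_PER_THREAD * iter_factor bytes, with the
  // factor spread between chunks handled per block and iterations per chunk.
  int64_t iter_factor = div_up(num_blocks * num_chunks, max_blocks);
  int64_t chunks_per_block = std::min<int64_t>(
      static_cast<int64_t>(std::ceil(std::sqrt(double(iter_factor)))),
      num_chunks);
  int64_t iters_per_chunk = div_up(iter_factor, chunks_per_block);

  std::cout << "blocks_per_chunk=" << num_blocks
            << " chunks_per_block=" << chunks_per_block
            << " iters_per_chunk=" << iters_per_chunk << "\n";
}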
- int64_t iter_factor = detail::div_up(num_blocks * num_chunks, max_blocks); - int64_t chunks_per_block = ::ceil(std::sqrt(iter_factor)); - chunks_per_block = ::min(chunks_per_block, num_chunks); - const int64_t iters_per_chunk = detail::div_up(iter_factor, chunks_per_block); - - // Launch a logically jagged grid of shape - // (chunk_size*, num_splits, num_chunks / chunks_per_block) - // backed by a physical grid of shape - // (sum(chunk_size), num_chunks / chunks_per_block). - // A block can find its split_idx via block_idx_to_split_idx. - std::vector block_idx_to_split_idx; - std::vector blocks_cumsums{0}; - block_idx_to_split_idx.reserve(num_blocks); - for (size_t split_idx = 0; split_idx < split_sizes.size(); ++split_idx) { - const auto blocks = detail::div_up( - split_chunk_sizes[split_idx], - detail::BLOCK_SIZE * detail::BYTES_PER_THREAD * iters_per_chunk); - block_idx_to_split_idx.insert( - block_idx_to_split_idx.end(), blocks, split_idx); - blocks_cumsums.push_back(blocks_cumsums.back() + blocks); - } - - dim3 blocks(blocks_cumsums.back(), num_chunks / chunks_per_block, 1); - dim3 threads(detail::BLOCK_SIZE, 1, 1); - - auto [_, ptrs] = detail::pack_vecs( - {&dst_base_addrs, - &src_base_addrs, - &split_chunk_sizes, - &block_idx_to_split_idx, - &blocks_cumsums}, - device); - - hipLaunchKernelGGL(( detail::split_with_sizes_copy_out_contiguous_no_cast_kernel), - dim3(blocks), - dim3(threads), - 0, - c10::zoom::getCurrentZoomStream(), - /*dst_base_addrs=*/reinterpret_cast(ptrs[0]), - /*src_base_addrs=*/reinterpret_cast(ptrs[1]), - /*split_chunk_sizes=*/ptrs[2], - /*block_idx_to_split_idx=*/ptrs[3], - /*blocks_cumsums=*/ptrs[4], - src_stride, - num_chunks); - C10_ZOOM_KERNEL_LAUNCH_CHECK(); -} - -void split_with_sizes_copy_out_zoom( - const Tensor& self, - IntArrayRef split_sizes, - int64_t dim, - TensorList out) { - const bool is_capturing = c10::zoom::currentStreamCaptureStatusMayInitCtx() != - c10::zoom::CaptureStatus::None; - bool contiguous_no_cast = self.is_non_overlapping_and_dense(); - for (const auto& t : out) { - contiguous_no_cast &= t.is_non_overlapping_and_dense(); - contiguous_no_cast &= (t.dtype() == self.dtype()); - } - // TODO(yifu): make the fast path work for CUDA graph - if (!is_capturing && contiguous_no_cast) { - // Perform equivalent checks performed by the composite impl - if (dim < 0) { - dim = at::maybe_wrap_dim(dim, self.dim()); - } - TORCH_CHECK( - self.dim() != 0, "split expects at least a 1-dimensional tensor") - - const int64_t dim_size = self.size(dim); - int64_t split_sizes_sum = 0; - for (const auto i : c10::irange(split_sizes.size())) { - TORCH_CHECK( - split_sizes[i] >= 0, - "split_with_sizes expects split_sizes have only non-negative ", - "entries, but got split_sizes=", - split_sizes[i]); - split_sizes_sum += split_sizes[i]; - } - TORCH_CHECK( - split_sizes_sum == dim_size, - "split_with_sizes expects split_sizes to sum exactly to ", - dim_size, - " (input tensor's size at dimension ", - dim, - "), ", - "but got split_sizes=", - split_sizes); - - TORCH_CHECK( - out.size() == split_sizes.size(), - "split_with_sizes_copy_out() expected an out= argument of size ", - split_sizes.size(), - ", got size ", - out.size()); - - auto out_shape = self.sizes().vec(); - for (const auto i : c10::irange(split_sizes.size())) { - out_shape[dim] = split_sizes[i]; - if (resize_output_check(out[i], out_shape)) { - out[i].resize_(out_shape); - } - TORCH_CHECK( - out[i].dtype() == self.dtype(), - "Expected out tensor to have dtype ", - self.dtype(), - ", but got ", - 
out[i].dtype(), - " instead"); - TORCH_CHECK( - out[i].device() == self.device(), - "Expected out tensor to have device ", - self.device(), - ", but got ", - out[i].device(), - " instead"); - } - split_with_sizes_copy_out_zoom_contiguous_no_cast( - self, split_sizes, dim, out); - } else { - at::native::split_with_sizes_copy_out(self, split_sizes, dim, out); - } -} - -Tensor _chunk_cat_zoom(TensorList tensors, int64_t dim, int64_t num_chunks) { - dim = at::native::preprocess_chunk_cat_inputs(tensors, dim, num_chunks); - if (detail::all_contiguous(tensors)) { - // Return a tensor with the same dtype as input tensors - int64_t elem_size = tensors[0].element_size(); - int64_t chunk_size = - detail::get_chunk_size(tensors, dim, num_chunks, elem_size); - int64_t leading_dim = detail::get_leading_dim(tensors[0].sizes(), dim); - auto view_sizes = detail::get_chunk_cat_out_sizes( - tensors[0].sizes(), dim, num_chunks, chunk_size, elem_size); - Tensor out = - tensors[0] - .new_empty(chunk_size * num_chunks * leading_dim / elem_size) - .view(view_sizes); - // Type-agnostic copy since out and input tensors have the same type. - detail::_chunk_cat_out_zoom_contiguous( - tensors, dim, num_chunks, out, elem_size, elem_size); - return out; - } else { - return at::native::_chunk_cat(tensors, dim, num_chunks); - } -} - -Tensor& _chunk_cat_out_zoom( - TensorList tensors, - int64_t dim, - int64_t num_chunks, - Tensor& out) { - dim = at::native::preprocess_chunk_cat_inputs(tensors, dim, num_chunks); - TORCH_CHECK( - tensors[0].device() == out.device(), - "_chunk_cat_out_zoom: mismatch between input and out tensor devices"); - bool both_input_output_contiguous = - detail::all_contiguous(tensors) && out.is_non_overlapping_and_dense(); - if (both_input_output_contiguous && - (tensors[0].dtype() == at::ScalarType::BFloat16) && - (out.dtype() == at::ScalarType::Float)) { - // _chunk_cat_out_zoom_contiguous should also support other types, thanks to - // static_cast_with_inter_type. Here, we dispatch to BFloat16 in and float32 - // out since it is the only known use case. - detail::_chunk_cat_out_zoom_contiguous( - tensors, - dim, - num_chunks, - out, - out.element_size(), - tensors[0].element_size()); - } else if ( - both_input_output_contiguous && tensors[0].dtype() == out.dtype()) { - // Type-agnostic copy since out and input tensors have the same type. - detail::_chunk_cat_out_zoom_contiguous( - tensors, - dim, - num_chunks, - out, - out.element_size(), - tensors[0].element_size()); - } else { - at::native::_chunk_cat_out(tensors, dim, num_chunks, out); - } - return out; -} - -} // namespace at::native diff --git a/aten/src/ATen/native/zoom/TensorTransformations.cu b/aten/src/ATen/native/zoom/TensorTransformations.cu deleted file mode 100644 index fd84d2cb79a1bc..00000000000000 --- a/aten/src/ATen/native/zoom/TensorTransformations.cu +++ /dev/null @@ -1,154 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#include -#define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include - -#include -#include -#include -#include -#include - -#ifndef AT_PER_OPERATOR_HEADERS -#include -#include -#else -#include -#include -#endif - -#include -#include - -namespace at::native { - -template -C10_LAUNCH_BOUNDS_2(zoom::getApplyBlockSize(), zoom::getApplyBlocksPerSM()) -__global__ void kernel_pointwise_flip_apply2( - const zoom::detail::TensorInfo in_tensor_info, - zoom::detail::TensorInfo out_tensor_info, - IndexType N, - int flip_dim, - IndexType total_dims) { - for (IndexType linear_index = blockIdx.x * blockDim.x + threadIdx.x; linear_index < N; linear_index += gridDim.x * blockDim.x) { - IndexType dst_offset = 0; - if (flip_dim == 0) { - // flip 1st dim - dst_offset = (in_tensor_info.sizes[0] - 1 - linear_index / in_tensor_info.strides[0]) * in_tensor_info.strides[0] + linear_index % in_tensor_info.strides[0]; - } - else { - // flip last dim - IndexType i = total_dims - 1; - dst_offset = linear_index / in_tensor_info.strides[0] * in_tensor_info.strides[0] + (in_tensor_info.sizes[i] - 1 - linear_index % in_tensor_info.strides[0]); - } - out_tensor_info.data[dst_offset] = in_tensor_info.data[linear_index]; - } -} - -template -C10_LAUNCH_BOUNDS_1(zoom::getApplyBlockSize()) -__global__ void flip_zoom_kernel( - scalar_t* in_tensor, - scalar_t* out_tensor, - int64_t N, - int64_t* flip_dims, - int64_t flip_dims_size, - int64_t* strides, - int64_t* strides_contiguous, - int64_t* shape, - int64_t total_dims) { - int64_t linear_index = blockIdx.x * blockDim.x + threadIdx.x; - if (linear_index >= N) { - return; - } - - int64_t cur_indices = linear_index, rem = 0, dst_offset = 0; - for (int64_t i = 0; i < total_dims; i++) { - int64_t temp = cur_indices; - cur_indices = cur_indices / strides_contiguous[i]; - rem = temp - cur_indices * strides_contiguous[i]; - // flip the indices if it is in flip_dims - for (int64_t j = 0; j < flip_dims_size; j++) { - if (i == flip_dims[j]) { - cur_indices = shape[i] - 1 - cur_indices; - } - } - dst_offset += cur_indices * strides[i]; - cur_indices = rem; - } - out_tensor[linear_index] = in_tensor[dst_offset]; -} - -template -C10_LAUNCH_BOUNDS_1(zoom::getApplyBlockSize()) -__global__ void roll_zoom_kernel( - const scalar_t* in_tensor, - scalar_t* out_tensor, - int64_t N, - int64_t roll_dim, - int64_t start, - int64_t size, - int64_t stride, - int64_t total_dims) { - int64_t linear_index = blockIdx.x * blockDim.x + threadIdx.x; - if (linear_index >= N) { - return; - } - // roll dim idx is the index of linear_index along the rolling dimension. - int64_t roll_dim_idx = linear_index % (stride * size) / stride; - // index into the source data to find appropriate value. 
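// Editorial sketch, not part of the patch: a host-side restatement of the
// source-index computation that follows in the (deleted) roll kernel, where
// start = (size - shifts) mod size along the rolled dimension.
#include <cstdint>

int64_t roll_source_index(int64_t linear_index, int64_t start, int64_t size,
                          int64_t stride) {
  const int64_t roll_dim_idx = linear_index % (stride * size) / stride;
  return roll_dim_idx >= (size - start)
      ? linear_index - (size - start) * stride  // wrapped past the end
      : linear_index + start * stride;          // shifted forward by `start`
}
// Example: size = 4, stride = 1, shifts = 1, so start = (4 - 1) % 4 = 3;
// out[0] reads in[3], out[1] reads in[0], out[2] reads in[1], out[3] reads in[2].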
- int64_t source_idx = 0; - if( roll_dim_idx >= (size - start) ) { - source_idx = linear_index - ((size - start) * stride); - } else { - source_idx = linear_index + (start * stride); - } - out_tensor[linear_index] = in_tensor[source_idx]; -} - -// Roll a tensor along a dimension -Tensor roll_zoom(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) { - if (dims.size() != 1 || shifts.size() != 1) { - return roll_common(self, shifts, dims); - } - - auto in_tensor = self; - if(!self.is_contiguous()) { - in_tensor = self.contiguous(); - } - auto out_tensor = at::empty_like(in_tensor, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - if (out_tensor.numel() == 0) { - return out_tensor; - } - const int64_t N = in_tensor.numel(); - const int64_t dim = dims[0]; - const int64_t size = in_tensor.size(dim); - int64_t start = (size - shifts[0]) % size; - // Behavior of % is different in C++ vs Python for negative numbers. This - // corrects the difference. - if( start < 0 ) start = start + size; - - dim3 dim_block = zoom::getApplyBlock(); - dim3 dim_grid; - TORCH_CHECK(zoom::getApplyGrid(N, dim_grid, in_tensor.get_device()), "unable to get dim grid"); - - auto total_dims = in_tensor.dim(); - - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( - at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, - at::ScalarType::ComplexHalf, - in_tensor.scalar_type(), "roll_zoom", - [&] { - hipLaunchKernelGGL(( roll_zoom_kernel), dim3(dim_grid), dim3(dim_block), 0, c10::zoom::getCurrentZoomStream(), - in_tensor.const_data_ptr(), out_tensor.mutable_data_ptr(), N, - dim, start, - size, - in_tensor.stride(dim), - total_dims); - C10_ZOOM_KERNEL_LAUNCH_CHECK(); - }); - - return out_tensor; -} - -} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ZoomScalar.cu b/aten/src/ATen/native/zoom/ZoomScalar.cu new file mode 100644 index 00000000000000..370c8a28b3ebed --- /dev/null +++ b/aten/src/ATen/native/zoom/ZoomScalar.cu @@ -0,0 +1,38 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +#include + +namespace at::native { + +Scalar _local_scalar_dense_zoom(const Tensor& self) { + Scalar r; + AT_DISPATCH_V2( + self.scalar_type(), "_local_scalar_dense_zoom", AT_WRAP([&] { + // Create pinned memory for the scalar value to avoid implicit + // locking/sync in cuda library due to pageable memory + auto value = at::detail::empty_cpu( + {1}, /* size */ + c10::CppTypeToScalarType(), /* dtype */ + std::nullopt, /* layout */ + std::nullopt, /* device */ + true, /* pin_memory */ + std::nullopt /* memory format */ + ); + hipStream_t stream = c10::zoom::getCurrentZoomStream(); + c10::zoom::memcpy_and_sync((void *)value.const_data_ptr(), self.const_data_ptr(), sizeof(scalar_t), hipMemcpyDeviceToHost, stream); + r = Scalar(*value.const_data_ptr()); + }), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kHalf, kBool, kBFloat16, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); + return r; +} + +} // at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/reduction_template.cuh b/aten/src/ATen/native/zoom/reduction_template.cuh new file mode 100644 index 00000000000000..f868c450614b38 --- /dev/null +++ b/aten/src/ATen/native/zoom/reduction_template.cuh @@ -0,0 +1,680 @@ +namespace at { +namespace zoom { +//windows doesn't like large string literals, so split in two +const std::string reduction_template_0 = R"ESCAPE( + #define C10_HOST_DEVICE __host__ __device__ + #define C10_DEVICE __device__ + #if 
defined(__clang__) && defined(__HIP__) + #ifndef __forceinline__ + #define __forceinline__ inline __attribute__((always_inline)) + #endif + // until ROCm support for kernel asserts is restored + #define assert(expr) (static_cast(0)) + #endif + + template + __device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) + { + #if defined(__clang__) && defined(__HIP__) + return __shfl_down(value, delta, width); + #else + return __shfl_down_sync(mask, value, delta, width); + #endif + } + + + #if ${complex} + template + __device__ __forceinline__ std::complex WARP_SHFL_DOWN(std::complex value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) + { + return std::complex( + #if defined(__clang__) && defined(__HIP__) + __shfl_down(value.real(), delta, width), + __shfl_down(value.imag(), delta, width)); + #else + __shfl_down_sync(mask, value.real(), delta, width), + __shfl_down_sync(mask, value.imag(), delta, width)); + #endif + } + #endif + + // aligned vector generates vectorized load/store on CUDA + template + struct alignas(sizeof(scalar_t) * vec_size) aligned_vector { + scalar_t val[vec_size]; + }; + + + C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominator) { + // get GCD of num and denom using Euclid's algorithm. + // Can replace this with std::gcd if we ever support c++17. + size_t a = denominator; + size_t b = numerator; + while (b != 0) { + a %= b; + // swap(a,b) + size_t tmp = a; + a = b; + b = tmp; + } + + // a is now the GCD + numerator /= a; + denominator /= a; + } + + + + + struct ReduceConfig { + //has to match host-side ReduceConfig in the eager code + static constexpr int BLOCK_X = 0; + static constexpr int BLOCK_Y = 1; + static constexpr int CTA = 2; + + static constexpr int input_vec_size = 4; + int element_size_bytes; + int num_inputs; + int num_outputs; + int step_input = 1; + int step_output = 1; + int ctas_per_output = 1; + int input_mult[3] = {0, 0, 0}; + int output_mult[2] = {0, 0}; + + int block_width; + int block_height; + int num_threads; + + bool vectorize_input = false; + int output_vec_size = 1; + + C10_HOST_DEVICE bool should_block_x_reduce() const { + return input_mult[BLOCK_X] != 0; + } + + C10_HOST_DEVICE bool should_block_y_reduce() const { + return input_mult[BLOCK_Y] != 0; + } + + C10_HOST_DEVICE bool should_global_reduce() const { + return input_mult[CTA] != 0; + } + + C10_DEVICE bool should_store(int output_idx) const { + return output_idx < num_outputs && + (!should_block_x_reduce() || threadIdx.x == 0) && + (!should_block_y_reduce() || threadIdx.y == 0); + } + + C10_DEVICE bool should_reduce_tail() const { + return (!should_block_y_reduce() || threadIdx.y == 0) && + (!should_global_reduce() || blockIdx.y == 0); + } + + C10_HOST_DEVICE int input_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta2 = blockIdx.y; + return (lane * input_mult[BLOCK_X] + + warp * input_mult[BLOCK_Y] + + cta2 * input_mult[CTA]); + } + + template + C10_HOST_DEVICE int output_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta1 = blockIdx.x; + return (lane * output_mult[BLOCK_X] + + warp * output_mult[BLOCK_Y] + + cta1 * step_output) * output_vec_size; + } + + C10_DEVICE int shared_memory_offset(int offset) const { + return threadIdx.x + (threadIdx.y + offset) * blockDim.x; + } + + C10_DEVICE int staging_memory_offset(int cta2) const { + int offset = cta2 + blockIdx.x * gridDim.y; + if (!should_block_x_reduce()) { + 
offset = threadIdx.x + offset * blockDim.x; + } + return offset; + } + + + }; + + +//TODO this will need to be different for more generic reduction functions +namespace reducer { + + using scalar_t = ${scalar_type}; + using arg_t = ${reduction_accum_type}; + using out_scalar_t = ${result_type}; + + + inline __device__ ${functor} + + inline __device__ out_scalar_t project(arg_t arg) { + return (out_scalar_t) arg; + } + + inline __device__ arg_t warp_shfl_down(arg_t arg, int offset) { + return WARP_SHFL_DOWN(arg, offset); + } + + inline __device__ arg_t translate_idx(arg_t acc, int64_t /*idx*/) { + return acc; + } + + // wrap a normal reduction that ignores the index + inline __device__ arg_t reduce(arg_t acc, arg_t val, int64_t idx) { + return combine(acc, val); + } +} + + +struct ReduceJitOp { + using scalar_t = ${scalar_type}; + using arg_t = ${reduction_accum_type}; + using out_scalar_t = ${result_type}; + + using InputCalculator = OffsetCalculator<1>; + using OutputCalculator = OffsetCalculator<2>; + +// static constexpr bool can_accumulate_in_output = +// std::is_convertible::value +// && std::is_convertible::value; + + static constexpr int input_vec_size = ReduceConfig::input_vec_size; + + arg_t ident; + ReduceConfig config; + InputCalculator input_calc; + OutputCalculator output_calc; + const void* src; + const char* dst[2]; //it accepts at most two destinations + // acc_buf used for accumulation among sub Tensor Iterator when accumulation on + // output is not permissible + void* acc_buf; + // cta_buf used for accumulation between blocks during global reduction + void* cta_buf; + int* semaphores; + int64_t base_idx; + bool accumulate; + bool final_output; + int noutputs; + + + C10_DEVICE void run() const { + extern __shared__ char shared_memory[]; + uint32_t output_idx = config.output_idx<${output_vec_size}>(); + uint32_t input_idx = config.input_idx(); + auto base_offsets1 = output_calc.get(output_idx)[1]; + + using arg_vec_t = Array; + arg_vec_t value; + + if (output_idx < config.num_outputs && input_idx < config.num_inputs) { + const scalar_t* input_slice = (const scalar_t*)((const char*)src + base_offsets1); + + value = thread_reduce<${output_vec_size}>(input_slice); + } + + if (config.should_block_y_reduce()) { + value = block_y_reduce<${output_vec_size}>(value, shared_memory); + } + if (config.should_block_x_reduce()) { + value = block_x_reduce<${output_vec_size}>(value, shared_memory); + } + + using out_ptr_vec_t = Array; + using offset_vec_t = Array; + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + arg_vec_t* acc = nullptr; + if (acc_buf != nullptr) { + size_t numerator = sizeof(arg_t); + size_t denominator = sizeof(out_scalar_t); + reduce_fraction(numerator, denominator); + acc = (arg_vec_t*)((char*)acc_buf + (base_offsets[0] * numerator / denominator)); + } + + if (config.should_global_reduce()) { + value = global_reduce<${output_vec_size}>(value, acc, shared_memory); + } else if (config.should_store(output_idx)) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + value[i] = reducer::translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output<${output_vec_size}>(out, value); + } + if (final_output) { + set_results_to_output<${output_vec_size}>(value, base_offsets); + } else { + #pragma 
unroll + for (int i = 0; i < ${output_vec_size}; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + value[i] = reducer::combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output<${output_vec_size}>(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + template + C10_DEVICE Array thread_reduce(const scalar_t* data) const { + if (config.vectorize_input) { + assert(output_vec_size == 1); + // reduce at the header of input_slice where memory is not aligned, + // so that thread_reduce will have an aligned memory to work on. + return {input_vectorized_thread_reduce_impl(data)}; + } else { + uint32_t element_stride = input_calc.strides_[0][0] / sizeof(scalar_t); + bool is_contiguous = (input_calc.dims == 1 && element_stride == 1); + if (is_contiguous) { + return thread_reduce_impl(data, [](uint32_t idx) { return idx; }); + } else if (input_calc.dims == 1) { + return thread_reduce_impl(data, [&](uint32_t idx) { return idx * element_stride; }); + } else { + return thread_reduce_impl(data, [&](uint32_t idx) { return input_calc.get(idx)[0] / sizeof(scalar_t); }); + } + } + } + + C10_DEVICE arg_t input_vectorized_thread_reduce_impl(const scalar_t* data) const { + uint32_t end = config.num_inputs; + + // Handle the head of input slice where data is not aligned + arg_t value = ident; + constexpr int align_bytes = alignof(aligned_vector); + constexpr int align_elements = align_bytes / sizeof(scalar_t); + int shift = ((int64_t)data) % align_bytes / sizeof(scalar_t); + if (shift > 0) { + data -= shift; + end += shift; + if(threadIdx.x >= shift && threadIdx.x < align_elements && config.should_reduce_tail()){ + value = reducer::reduce(value, data[threadIdx.x], threadIdx.x - shift); + } + end -= align_elements; + data += align_elements; + shift = align_elements - shift; + } + + // Do the vectorized reduction + using load_t = aligned_vector; + + uint32_t idx = config.input_idx(); + const uint32_t stride = config.step_input; + + // Multiple accumulators to remove dependency between unrolled loops. 
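// Editorial sketch, not part of the patch or of the template string below: the
// "multiple accumulators" idiom named in the comment above, shown on a plain CPU
// sum. Independent partial sums break the loop-carried dependency on a single
// accumulator, which is what the unrolled value_list below relies on.
#include <cstddef>

float sum_with_four_accumulators(const float* data, std::size_t n) {
  float acc[4] = {0.f, 0.f, 0.f, 0.f};
  std::size_t i = 0;
  for (; i + 4 <= n; i += 4) {
    acc[0] += data[i + 0];  // the four updates do not depend on each other,
    acc[1] += data[i + 1];  // so they can overlap in the pipeline
    acc[2] += data[i + 2];
    acc[3] += data[i + 3];
  }
  for (; i < n; ++i) acc[0] += data[i];  // scalar tail
  return (acc[0] + acc[1]) + (acc[2] + acc[3]);
}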
+ arg_t value_list[input_vec_size]; + value_list[0] = value; + + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[i] = ident; + } + + scalar_t values[input_vec_size]; + + load_t *values_vector = reinterpret_cast(&values[0]); + + while (idx * input_vec_size + input_vec_size - 1 < end) { + *values_vector = reinterpret_cast(data)[idx]; + #pragma unroll + for (uint32_t i = 0; i < input_vec_size; i++) { + value_list[i] = reducer::reduce(value_list[i], values[i], shift + idx * input_vec_size + i); + } + idx += stride; + } + + // tail + uint32_t tail_start = end - end % input_vec_size; + if (config.should_reduce_tail()) { + int idx = tail_start + threadIdx.x; + if (idx < end) { + value_list[0] = reducer::reduce(value_list[0], data[idx], idx + shift); + } + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[0] = reducer::combine(value_list[0], value_list[i]); + } + return value_list[0]; + } + + template + C10_DEVICE Array thread_reduce_impl(const scalar_t* data_, offset_calc_t calc) const { + uint32_t idx = config.input_idx(); + const uint32_t end = config.num_inputs; + const uint32_t stride = config.step_input; + const int vt0=${vt0}; + + using arg_vec_t = Array; + using load_t = aligned_vector; + const load_t* data = reinterpret_cast(data_); + + // Multiple accumulators to remove dependency between unrolled loops. + arg_vec_t value_list[vt0]; + + #pragma unroll + for (int i = 0; i < vt0; i++) { + #pragma unroll + for (int j = 0; j < output_vec_size; j++) { + value_list[i][j] = ident; + } + } + + load_t values[vt0]; + + while (idx + (vt0 - 1) * stride < end) { + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + values[i] = data[calc(idx + i * stride) / output_vec_size]; + } + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + #pragma unroll + for (uint32_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx + i * stride); + } + } + idx += stride * vt0; + } + + // tail + int idx_ = idx; + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + values[i] = data[calc(idx) / output_vec_size]; + idx += stride; + } + idx = idx_; + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + #pragma unroll + for (uint32_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx); + } + idx += stride; + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < vt0; i++) { + #pragma unroll + for (uint32_t j = 0; j < output_vec_size; j++) { + value_list[0][j] = reducer::combine(value_list[0][j], value_list[i][j]); + } + } + return value_list[0]; + } + template + C10_DEVICE Array block_x_reduce(Array value, char* shared_memory) const { + using args_vec_t = Array; + int dim_x = blockDim.x; + args_vec_t* shared = (args_vec_t*)shared_memory; + if (dim_x > warpSize) { + int address_base = threadIdx.x + threadIdx.y*blockDim.x; + shared[address_base] = value; + for (int offset = dim_x/2; offset >= warpSize; offset >>= 1) { + __syncthreads(); + if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) { + args_vec_t other = shared[address_base + offset]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], other[i]); + } + shared[address_base] = value; + } + } + dim_x = warpSize; + } + + __syncthreads(); + + for (int offset = 1; offset < dim_x; offset <<= 1) { + #pragma unroll + for (int i 
= 0; i < output_vec_size; i++) { + arg_t other = reducer::warp_shfl_down(value[i], offset); + value[i] = reducer::combine(value[i], other); + } + } + return value; + } + + template + C10_DEVICE Array block_y_reduce(Array value, char* shared_memory) const { + using args_vec_t = Array; + args_vec_t* shared = (args_vec_t*)shared_memory; + shared[config.shared_memory_offset(0)] = value; + for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { + __syncthreads(); + if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { + args_vec_t other = shared[config.shared_memory_offset(offset)]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], other[i]); + } + shared[config.shared_memory_offset(0)] = value; + } + } + return value; + } + )ESCAPE"; + + const std::string reduction_template_1 = R"ESCAPE( + + C10_DEVICE bool mark_block_finished() const { + __shared__ bool is_last_block_done_shared; + + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0) { + int prev_blocks_finished = atomicAdd(&semaphores[blockIdx.x], 1); + is_last_block_done_shared = (prev_blocks_finished == gridDim.y - 1); + } + + __syncthreads(); + + return is_last_block_done_shared; + } + + template + C10_DEVICE Array accumulate_in_output( + Array out, + Array value + ) const { + Array ret; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + ret[i] = reducer::combine(*(out[i]), value[i]); + } + return ret; + } + + + C10_DEVICE out_scalar_t get_accumulated_output( + out_scalar_t* out, arg_t value + ) const { + assert(!final_output); + return (out_scalar_t)value; + } + + template + C10_DEVICE void set_results(const T x, const uint32_t base_offset) const { + assert(noutputs == 1); + auto res = (out_scalar_t*)((char*)dst[0] + base_offset); + *res = x; + } + +//TODO - multi-output reduction - we won't be able to use thrust::pair +//just explicitly specify typed output reads/writes +//Currently implemented for max of two outputs +// template +// C10_DEVICE void set_results(const thrust::pair x, const index_t base_offset) const { +// if (noutputs >= 1) { +// auto res0 = (T1*)((char*)dst[0] + base_offset); +// *res0 = x.first; +// } +// if (noutputs >= 2) { +// // base offset is computed assuming element size being sizeof(T1), so we need to make a +// // correction to obtain the correct base offset +// auto res1 = (T2*) ((char *) dst[1] + base_offset / sizeof(T1) * sizeof(T2)); +// *res1 = x.second; +// } +// } + + template + C10_DEVICE void set_results_to_output(Array value, Array base_offset) const { + assert(final_output); + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + set_results(reducer::project(value[i]), base_offset[i]); + } + } + + template + C10_DEVICE Array global_reduce(Array value, Array *acc, char* shared_memory) const { + using arg_vec_t = Array; + using out_ptr_vec_t = Array; + using offset_vec_t = Array; + + arg_vec_t* reduce_buffer = (arg_vec_t*)cta_buf; + uint32_t output_idx = config.output_idx(); + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + bool should_store = config.should_store(output_idx); + if (should_store) { + uint32_t offset = config.staging_memory_offset(blockIdx.y); + reduce_buffer[offset] = value; + } + + __threadfence(); // make sure writes are globally visible + __syncthreads(); // if multiple warps in this block wrote 
to staging, make sure they're all done + bool is_last_block_done = mark_block_finished(); + + if (is_last_block_done) { + value = ident; + if (config.should_block_x_reduce()) { + uint32_t input_offset = threadIdx.x + threadIdx.y * blockDim.x; + uint32_t step = blockDim.x * blockDim.y; + for (; input_offset < config.ctas_per_output; input_offset += step) { + uint32_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], next[i]); + } + } + } else { + uint32_t input_offset = threadIdx.y; + uint32_t step = blockDim.y; + for (; input_offset < config.ctas_per_output; input_offset += step) { + uint32_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], next[i]); + } + } + } + value = block_y_reduce(value, shared_memory); + if (config.should_block_x_reduce()) { + value = block_x_reduce(value, shared_memory); + } + if (should_store) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output(out, value); + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + return value; + } +}; + +extern "C" +__launch_bounds__(${max_threads_lb}, 4) +__global__ void reduction_${name}_kernel(ReduceJitOp r){ + r.run(); +} +)ESCAPE"; + +const std::string reduction_template = reduction_template_0 + reduction_template_1; + + +const std::string &get_reduction_template() { + return reduction_template; +} + +}} \ No newline at end of file diff --git a/aten/src/ATen/templates/UfuncZoom.cu b/aten/src/ATen/templates/UfuncZoom.cu new file mode 100644 index 00000000000000..689a78b42f9102 --- /dev/null +++ b/aten/src/ATen/templates/UfuncZoom.cu @@ -0,0 +1,17 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +${zoom_headers} +namespace at { +// NB: this is explicitly copied here (via codegen) rather than +// included via NativeFunctions.h to avoid recompiling this file when +// NativeFunctions.h changes +namespace meta { +${meta_declaration} +} +namespace native { +${native_declaration} +${native_definitions} +}} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/zoom/ZoomContext.cpp b/aten/src/ATen/zoom/ZoomContext.cpp index 3182fafed7493f..30bb6d79d53e0f 100644 --- a/aten/src/ATen/zoom/ZoomContext.cpp +++ b/aten/src/ATen/zoom/ZoomContext.cpp @@ -2,7 +2,6 @@ #include #include -// #include #include #include #include diff --git a/aten/src/ATen/zoom/ZoomContextLight.h b/aten/src/ATen/zoom/ZoomContextLight.h index 44a82879f05267..93ad2791cd4a85 100644 --- a/aten/src/ATen/zoom/ZoomContextLight.h +++ b/aten/src/ATen/zoom/ZoomContextLight.h @@ -1,21 +1,10 @@ #pragma once // Light-weight version of ZoomContext.h with fewer transitive includes -#define DISABLE_HIPBLASLT - #include - #include #include #include -#include 
-#include -#include -#ifndef DISABLE_HIPBLASLT -#include -#include -#endif - namespace c10 { struct Allocator; } @@ -23,24 +12,24 @@ struct Allocator; namespace at::zoom { /* -A common CUDA interface for ATen. +A common Zoom interface for ATen. -This interface is distinct from CUDAHooks, which defines an interface that links -to both CPU-only and CUDA builds. That interface is intended for runtime +This interface is distinct from ZoomHooks, which defines an interface that links +to both CPU-only and Zoom builds. That interface is intended for runtime dispatch and should be used from files that are included in both CPU-only and -CUDA builds. +Zoom builds. -CUDAContext, on the other hand, should be preferred by files only included in -CUDA builds. It is intended to expose CUDA functionality in a consistent +ZoomContext, on the other hand, should be preferred by files only included in +Zoom builds. It is intended to expose Zoom functionality in a consistent manner. -This means there is some overlap between the CUDAContext and CUDAHooks, but -the choice of which to use is simple: use CUDAContext when in a CUDA-only file, -use CUDAHooks otherwise. +This means there is some overlap between the ZoomContext and ZoomHooks, but +the choice of which to use is simple: use ZoomContext when in a Zoom-only file, +use ZoomHooks otherwise. -Note that CUDAContext simply defines an interface with no associated class. +Note that ZoomContext simply defines an interface with no associated class. It is expected that the modules whose functions compose this interface will -manage their own state. There is only a single CUDA context/state. +manage their own state. There is only a single Zoom context/state. */ /** @@ -51,9 +40,9 @@ inline int64_t getNumGPUs() { } /** - * CUDA is available if we compiled with CUDA, and there are one or more - * devices. If we compiled with CUDA but there is a driver problem, etc., - * this function will report CUDA is not available (rather than raise an error.) + * Zoom is available if we compiled with Zoom, and there are one or more + * devices. If we compiled with Zoom but there is a driver problem, etc., + * this function will report Zoom is not available (rather than raise an error.) 
*/ inline bool is_available() { return c10::zoom::device_count() > 0; @@ -71,15 +60,4 @@ TORCH_ZOOM_API bool canDeviceAccessPeer( TORCH_ZOOM_API c10::Allocator* getZoomDeviceAllocator(); -TORCH_ZOOM_API hipsparseHandle_t getCurrentHIPSparseHandle(); -TORCH_ZOOM_API hipblasHandle_t getCurrentHIPBlasHandle(); -#ifndef DISABLE_HIPBLASLT -TORCH_ZOOM_API hipblasLtHandle_t getCurrentHIPBlasLtHandle(); -#endif - - -#if defined(hipsolverVersionMajor) -TORCH_ZOOM_API hipsolverDnHandle_t getCurrentHIPSolverDnHandle(); -#endif - } // namespace at::zoom \ No newline at end of file diff --git a/aten/src/ATen/zoom/detail/ZoomHooks.cpp b/aten/src/ATen/zoom/detail/ZoomHooks.cpp index 828ef6993c45b7..51ba8ae7be3f7d 100644 --- a/aten/src/ATen/zoom/detail/ZoomHooks.cpp +++ b/aten/src/ATen/zoom/detail/ZoomHooks.cpp @@ -3,31 +3,17 @@ #include #include #include -// #include #include #include #include #include #include #include -// #include #include #include #include #include -// #if AT_CUDNN_ENABLED() -// #include -// #endif - -// #if AT_MAGMA_ENABLED() -// #include -// #endif - -// #if defined(USE_ROCM) -// #include -// #endif - #include #include #include @@ -39,22 +25,11 @@ namespace c10::zoom::_internal { void setHasPrimaryContext(bool (*func)(DeviceIndex)); } -// defined in Aten/zoom/HIPblasHandlePool.cpp -namespace at::zoom { - bool getHIPBlasAtomicsEnabled(); -} - namespace at::zoom::detail { const at::zoom::HIPRTC& hiprtc(); DeviceIndex current_device(); -// static void (*magma_init_fn)() = nullptr; - -// void set_magma_init_fn(void (*fn)()) { -// magma_init_fn = fn; -// } - namespace { bool _hasPrimaryContext(DeviceIndex device_index) { TORCH_CHECK(device_index >= 0 && device_index < c10::zoom::device_count(), @@ -149,13 +124,6 @@ bool ZoomHooks::hasROCM() const { return at::zoom::is_available(); } -// rocBLAS is deterministic if atomic operations are disabled -// for details on when rocBLAS is guaranteed to be bitwise deterministic see below: -// https://github.com/ROCm/rocBLAS/issues/1459#issuecomment-2272082035 -bool ZoomHooks::checkHIPBlasDeterministic() const { - return !at::zoom::getHIPBlasAtomicsEnabled(); -} - // #if defined(USE_DIRECT_NVRTC) || defined(USE_DIRECT_HIPRTC) static std::pair, at::zoom::HIPRTC*> load_hiprtc() { return std::make_pair(nullptr, at::zoom::load_hiprtc()); diff --git a/aten/src/ATen/zoom/detail/ZoomHooks.h b/aten/src/ATen/zoom/detail/ZoomHooks.h index 51cabb8bde377f..d5d813c9dbb87a 100644 --- a/aten/src/ATen/zoom/detail/ZoomHooks.h +++ b/aten/src/ATen/zoom/detail/ZoomHooks.h @@ -20,7 +20,6 @@ struct ZoomHooks : public ZoomHooksInterface { bool isPinnedPtr(const void* data) const override; const Generator& getDefaultZoomGenerator(DeviceIndex device_index = -1) const override; bool hasROCM() const override; - bool checkHIPBlasDeterministic() const override; const at::zoom::HIPRTC& hiprtc() const override; DeviceIndex current_device() const override; bool hasPrimaryContext(DeviceIndex device_index) const override; diff --git a/buckbuild.bzl b/buckbuild.bzl index 4c4fc9a89a280d..9ee843ef74aac0 100644 --- a/buckbuild.bzl +++ b/buckbuild.bzl @@ -50,6 +50,7 @@ load( "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources", + "aten_ufunc_generated_zoom_sources", ) def read_bool(section, field, default, required = True): @@ -398,6 +399,9 @@ def get_aten_generated_files(enabled_backends): # skipped src_files.extend(aten_ufunc_generated_cuda_sources()) + # TODO(Arham): redo logic once we have a zoom key and backend name + 
src_files.extend(aten_ufunc_generated_zoom_sources()) + + res = {} for file_name in src_files: res[file_name] = [file_name] diff --git a/build.bzl b/build.bzl index 5ab9f92acecca0..299307478a0fb5 100644 --- a/build.bzl +++ b/build.bzl @@ -3,6 +3,7 @@ load( "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources", + "aten_ufunc_generated_zoom_sources", ) def define_targets(rules): @@ -80,13 +81,18 @@ def define_targets(rules): aten_ufunc_generated_cuda_sources() ) + + gen_aten_outs_zoom = ( + GENERATED_H_ZOOM + GENERATED_CPP_ZOOM + + aten_ufunc_generated_zoom_sources() + ) + gen_aten_outs = ( GENERATED_H + GENERATED_H_CORE + GENERATED_CPP + GENERATED_CPP_CORE + aten_ufunc_generated_cpu_sources() + aten_ufunc_generated_cpu_kernel_sources() + [ "Declarations.yaml", - ] + gen_aten_outs_cuda + ] + gen_aten_outs_cuda + gen_aten_outs_zoom ) rules.genrule( @@ -208,6 +214,15 @@ GENERATED_CPP_CUDA = [ "RegisterQuantizedCUDA.cpp", ] +GENERATED_H_ZOOM = [ + "ZoomFunctions.h", + "ZoomFunctions_inl.h", +] + +GENERATED_CPP_ZOOM = [ + "RegisterPrivateUse1.cpp", +] + GENERATED_CPP = [ "Functions.cpp", "RegisterBackendSelect.cpp", diff --git a/build.sh b/build.sh new file mode 100644 index 00000000000000..74897f8830e56a --- /dev/null +++ b/build.sh @@ -0,0 +1,130 @@ +#!/bin/bash + +rm -rf build +git clean -fdx -e .idea +git clean -fdX -e .idea + + +export USE_ZOOM=1 +export USE_ROCM=0 +export USE_CUDA=0 +#export USE_PER_OPERATOR_HEADERS=1 +export USE_CCACHE=1 +export BUILD_PYTHON=1 +export USE_NUMPY=1 +export USE_FLASH_ATTENTION=0 +#export BUILD_SHARED_LIBS=ON + +export BUILD_AOT_INDUCTOR_TEST=0 +#export BUILD_BINARY=0 +#export BUILD_CUSTOM_PROTOBUF=1 +export BUILD_DOCS=0 +export BUILD_EXECUTORCH=0 +export BUILD_FUNCTORCH=0 +export BUILD_JNI=0 +#export BUILD_LAZY_TS_BACKEND=1 +#export BUILD_LIBTORCH_CPU_WITH_DEBUG=0 +export BUILD_LITE_INTERPRETER=0 +export BUILD_MOBILE_AUTOGRAD=0 +export BUILD_MOBILE_BENCHMARK=0 +export BUILD_MOBILE_TEST=0 +export BUILD_ONNX_PYTHON=0 +export BUILD_STATIC_RUNTIME_BENCHMARK=0 +export BUILD_TEST=0 +export USE_ASAN=0 +export USE_C10D_GLOO=0 +export USE_C10D_MPI=0 +export USE_C10D_NCCL=0 +export USE_COLORIZE_OUTPUT=0 +export USE_COREML_DELEGATE=0 +export USE_CPP_CODE_COVERAGE=0 +export USE_CUDA=0 +export USE_CUDNN=0 +export USE_CUPTI_SO=0 +export USE_CUSPARSELT=0 +export USE_DISTRIBUTED=1 +export USE_FAKELOWP=0 +export USE_FBGEMM=0 +export USE_FLASH_ATTENTION=0 +export USE_GFLAGS=0 +export USE_GLOG=0 +export USE_GLOO=0 +export USE_GLOO_WITH_OPENSSL=0 +export USE_GNU_SOURCE=0 +export USE_GOLD_LINKER=0 +export USE_IBVERBS=0 +export USE_INTERNAL_PTHREADPOOL_IMPL=0 +export USE_ITT=0 +export USE_KINETO=0 +export USE_LIBUV=0 +export USE_LIGHTWEIGHT_DISPATCH=0 +export USE_LITE_INTERPRETER_PROFILER=0 +export USE_LITE_PROTO=0 +export USE_MAGMA=0 +export USE_MIMALLOC=0 +export USE_MKLDNN=0 +export USE_MKLDNN_CBLAS=0 +export USE_MPI=0 +export USE_NATIVE_ARCH=0 +export USE_NCCL=0 +export USE_NNAPI=0 +export USE_NNPACK=0 +export USE_NUMA=0 +export USE_NVRTC=0 +export USE_OBSERVERS=0 +export USE_OPENCL=0 +export USE_OPENMP=0 +export USE_PRECOMPILED_HEADERS=0 +export USE_PROF=0 +export USE_PTHREADPOOL=0 +export USE_PYTORCH_METAL=0 +export USE_PYTORCH_METAL_EXPORT=0 +export USE_PYTORCH_QNNPACK=0 +export USE_QNNPACK=0 +#export USE_RCCL=0 +export USE_REDIS=0 +#export USE_ROCM_KERNEL_ASSERT=0 +export USE_SANITIZER=0 +export USE_SLEEF_FOR_ARM_VEC256=0 +export USE_SNPE=0 +export USE_SOURCE_DEBUG_ON_MOBILE=0 +export USE_STATIC_CUDNN=0 +export
USE_STATIC_MKL=0 +export USE_STATIC_NCCL=0 +export USE_SYSTEM_BENCHMARK=0 +export USE_SYSTEM_CPUINFO=0 +export USE_SYSTEM_EIGEN_INSTALL=0 +export USE_SYSTEM_FP16=0 +export USE_SYSTEM_FXDIV=0 +export USE_SYSTEM_GLOO=0 +export USE_SYSTEM_GOOGLEBENCHMARK=0 +export USE_SYSTEM_GOOGLETEST=0 +export USE_SYSTEM_LIBS=0 +export USE_SYSTEM_NCCL=0 +export USE_SYSTEM_ONNX=0 +export USE_SYSTEM_PSIMD=0 +export USE_SYSTEM_PTHREADPOOL=0 +export USE_SYSTEM_PYBIND11=0 +export USE_SYSTEM_SLEEF=0 +export USE_SYSTEM_XNNPACK=0 +export USE_TBB=0 +export USE_TCP_OPENSSL_LINK=0 +export USE_TCP_OPENSSL_LOAD=0 +export USE_TENSORPIPE=1 +export USE_TSAN=0 +export USE_UCC=0 +export USE_VALGRIND=0 +export USE_VULKAN_FP16_INFERENCE=0 +export USE_VULKAN_RELAXED_PRECISION=0 +export USE_XNNPACK=0 +export USE_XPU=0 + +# for the ligerllama example we need distributed and tensorpipe, only because +# huggingface model.generate insists on querying torch.distributed and distributed relies on tensorpipe +# this could be a factor of nod-pytorch being out of date with upstream: +# https://github.com/pytorch/pytorch/issues/97397 + +python setup.py develop +python zoom_extension/examples/test.py +PYTORCH_TEST_WITH_SLOW=1 TORCH_TEST_DEVICES=zoom_extension/test/pytorch_test_base.py ./test.sh +python setup.py bdist_wheel \ No newline at end of file diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 1a43c7d53aa9fb..82d8b6b3372135 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1677,6 +1677,8 @@ if(MSVC AND BUILD_SHARED_LIBS) install(FILES $ DESTINATION "${TORCH_INSTALL_LIB_DIR}" OPTIONAL) elseif(USE_ROCM) install(FILES $ DESTINATION "${TORCH_INSTALL_LIB_DIR}" OPTIONAL) + elseif(USE_ZOOM) + install(FILES $ DESTINATION "${TORCH_INSTALL_LIB_DIR}" OPTIONAL) endif() endif() @@ -1822,16 +1824,13 @@ if(USE_ZOOM) target_link_libraries(torch_zoom PUBLIC c10_zoom) # target_link_libraries(torch_zoom PUBLIC c10) - # this is where lib amdhip64 is actually linked (e.g. HIP symbols) - # should be included in c10_zoom - # target_link_libraries(torch_zoom PUBLIC ${PYTORCH_HIP_LIBRARIES}) if(NOT INTERN_BUILD_MOBILE) # TODO: Cut this over to ATEN_HIP_FILES_GEN_LIB. At the moment, we # only generate CUDA files # NB: This dependency must be PRIVATE, because we don't install # ATEN_CUDA_FILES_GEN_LIB (it's a synthetic target just to get the # correct dependency from generated files.)
- #target_link_libraries(torch_zoom PRIVATE ATEN_ZOOM_FILES_GEN_LIB) + target_link_libraries(torch_zoom PRIVATE ATEN_ZOOM_FILES_GEN_LIB) endif() target_link_libraries(torch_zoom PUBLIC torch_cpu_library ${Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS}) target_link_libraries(torch_zoom PRIVATE ${Caffe2_ZOOM_DEPENDENCY_LIBS}) diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index f022db009f4673..ce4762860bb524 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -201,6 +201,7 @@ if(INTERN_BUILD_ATEN_OPS) include("${CMAKE_BINARY_DIR}/aten/src/ATen/core_generated_${gen_type}.cmake") include("${CMAKE_BINARY_DIR}/aten/src/ATen/cpu_vec_generated_${gen_type}.cmake") include("${CMAKE_BINARY_DIR}/aten/src/ATen/cuda_generated_${gen_type}.cmake") + include("${CMAKE_BINARY_DIR}/aten/src/ATen/zoom_generated_${gen_type}.cmake") include("${CMAKE_BINARY_DIR}/aten/src/ATen/ops_generated_${gen_type}.cmake") message(STATUS "${gen_type} outputs: ${gen_outputs}") @@ -210,6 +211,7 @@ if(INTERN_BUILD_ATEN_OPS) OUTPUT ${generated_${gen_type}} ${cuda_generated_${gen_type}} + ${zoom_generated_${gen_type}} ${core_generated_${gen_type}} ${cpu_vec_generated_${gen_type}} ${ops_generated_${gen_type}} @@ -218,6 +220,7 @@ if(INTERN_BUILD_ATEN_OPS) ${CMAKE_BINARY_DIR}/aten/src/ATen/core_generated_${gen_type}.cmake ${CMAKE_BINARY_DIR}/aten/src/ATen/cpu_vec_generated_${gen_type}.cmake ${CMAKE_BINARY_DIR}/aten/src/ATen/cuda_generated_${gen_type}.cmake + ${CMAKE_BINARY_DIR}/aten/src/ATen/zoom_generated_${gen_type}.cmake COMMAND ${GEN_COMMAND_${gen_type}} DEPENDS ${all_python} ${${gen_type}_templates} ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/native_functions.yaml @@ -235,17 +238,25 @@ if(INTERN_BUILD_ATEN_OPS) ${generated_declarations_yaml} ${generated_unboxing_sources}) add_custom_target(ATEN_CUDA_FILES_GEN_TARGET DEPENDS ${cuda_generated_headers} ${cuda_generated_sources}) + add_custom_target(ATEN_ZOOM_FILES_GEN_TARGET DEPENDS + ${zoom_generated_headers} ${zoom_generated_sources}) add_library(ATEN_CPU_FILES_GEN_LIB INTERFACE) add_library(ATEN_CUDA_FILES_GEN_LIB INTERFACE) + add_library(ATEN_ZOOM_FILES_GEN_LIB INTERFACE) add_dependencies(ATEN_CPU_FILES_GEN_LIB ATEN_CPU_FILES_GEN_TARGET) add_dependencies(ATEN_CUDA_FILES_GEN_LIB ATEN_CUDA_FILES_GEN_TARGET) + add_dependencies(ATEN_ZOOM_FILES_GEN_LIB ATEN_ZOOM_FILES_GEN_TARGET) + message(zoom_gen_headers="${zoom_generated_headers}") + message(zoom_gen_sources="${zoom_generated_sources}") + message(cuda_gen_headers="${cuda_generated_headers}") message(cuda_gen_sources="${cuda_generated_sources}") if(USE_PER_OPERATOR_HEADERS) target_compile_definitions(ATEN_CPU_FILES_GEN_LIB INTERFACE AT_PER_OPERATOR_HEADERS) target_compile_definitions(ATEN_CUDA_FILES_GEN_LIB INTERFACE AT_PER_OPERATOR_HEADERS) + target_compile_definitions(ATEN_ZOOM_FILES_GEN_LIB INTERFACE AT_PER_OPERATOR_HEADERS) endif() # Handle source files that need to be compiled multiple times for diff --git a/torch/csrc/zoom/Module.cpp b/torch/csrc/zoom/Module.cpp index 7a0470fad0613e..341f8484b30679 100644 --- a/torch/csrc/zoom/Module.cpp +++ b/torch/csrc/zoom/Module.cpp @@ -329,83 +329,6 @@ at::Scalar as_scalar(PyObject* arg) { return at::Scalar(THPUtils_unpackDouble(arg)); } -// Entrypoint for the callable created by torch.zoom.jiterator -// See jiterator.py for more details -// PyObject* THCPModule_zoomJiteratorCompileAndLaunchKernel( -// PyObject* _unused, -// PyObject* args) { -// HANDLE_TH_ERRORS - -// PyObject* code_string_o = nullptr; -// PyObject* kernel_name_o = nullptr; -// PyObject* return_by_ref_o = 
nullptr; -// PyObject* num_outputs_o = nullptr; -// PyObject* tensors_o = nullptr; -// PyObject* kwargs_o = nullptr; -// if (!PyArg_ParseTuple( -// args, -// "OOOOO|O", -// &code_string_o, -// &kernel_name_o, -// &return_by_ref_o, -// &num_outputs_o, -// &tensors_o, -// &kwargs_o)) { -// return nullptr; -// } - -// const std::string code_string = THPUtils_unpackString(code_string_o); -// const std::string kernel_name = THPUtils_unpackString(kernel_name_o); -// const bool return_by_ref = THPUtils_unpackBool(return_by_ref_o); -// const int num_outputs = static_cast(THPUtils_unpackLong(num_outputs_o)); - -// TORCH_CHECK( -// PyTuple_Check(tensors_o), -// "tensors argument is expected to " -// "be a tuple, but got ", -// THPUtils_typename(tensors_o)); -// Py_ssize_t num_tensors = PyTuple_GET_SIZE(tensors_o); - -// c10::SmallVector tensors; -// for (const auto i : c10::irange(num_tensors)) { -// PyObject* _tensor = PyTuple_GET_ITEM(tensors_o, i); -// TORCH_CHECK( -// THPVariable_Check(_tensor), -// i, -// " of input tensors tuple is not a Tensor"); - -// tensors.emplace_back(THPVariable_Unpack(_tensor)); -// } - -// c10::SmallVector extra_args; -// PyObject* key = nullptr; -// PyObject* value = nullptr; -// Py_ssize_t pos = 0; -// while (PyDict_Next(kwargs_o, &pos, &key, &value)) { -// extra_args.emplace_back(as_scalar(value)); -// } - -// c10::SmallVector outputs = at::zoom::CompileAndLaunchKernel( -// code_string, -// kernel_name, -// num_outputs, -// tensors, -// extra_args, -// return_by_ref); - -// if (num_outputs == 1) { -// return THPVariable_Wrap(outputs[0]); -// } else { -// PyObject* output_tuple = PyTuple_New(num_outputs); -// for (int i = 0; i < num_outputs; ++i) { -// PyTuple_SetItem(output_tuple, i, THPVariable_Wrap(outputs[i])); -// } -// return output_tuple; -// } - -// END_HANDLE_TH_ERRORS -// } - PyObject* THCPModule_zoomCachingAllocator_raw_delete( PyObject* _unused, PyObject* obj) { @@ -444,26 +367,6 @@ PyObject* THCPModule_zoomSynchronize(PyObject* _unused, PyObject* noargs) { END_HANDLE_TH_ERRORS } -// PyObject* THCPModule_zoomIPCCollect(PyObject* _unused, PyObject* noargs) { -// HANDLE_TH_ERRORS -// torch::zoomIPCCollect(); -// Py_RETURN_NONE; -// END_HANDLE_TH_ERRORS -// } - -// PyObject* THCPModule_zoomSleep(PyObject* _unused, PyObject* cycles) { -// HANDLE_TH_ERRORS -// TORCH_CHECK( -// THPUtils_checkLong(cycles), "torch.zoom._sleep(): expected 'int'"); -// int64_t unpacked_cycles = THPUtils_unpackLong(cycles); -// { -// pybind11::gil_scoped_release no_gil; -// at::zoom::sleep(unpacked_cycles); -// } -// Py_RETURN_NONE; -// END_HANDLE_TH_ERRORS -// } - // We need to ensure that as long as a thread will NEVER loose the GIL as long // as it holds the CUDA mutex. Otherwise another thread might be scheduled and // try to e.g. allocate a new tensor which will cause a deadlock. 
It's enough to @@ -929,30 +832,10 @@ static void registerZoomDeviceProperties(PyObject* module) { return stream.str(); }); - // m.def( - // "_zoom_record_memory_history_legacy", - // static_cast( - // torch::zoom::_record_memory_history)); - - // m.def( - // "_zoom_record_memory_history", - // static_cast, - // std::optional, - // const std::string&, - // size_t)>(torch::zoom::_record_memory_history)); - m.def("_zoom_isHistoryEnabled", []() { return c10::zoom::ZoomCachingAllocator::isHistoryEnabled(); }); - // m.def("_zoom_get_conv_benchmark_empty_cache", []() { - // return at::native::_cudnn_get_conv_benchmark_empty_cache(); - // }); - - // m.def("_cudnn_set_conv_benchmark_empty_cache", [](bool enable) { - // return at::native::_cudnn_set_conv_benchmark_empty_cache(enable); - // }); } // We choose to ignore certain blocks that are currently allocated @@ -1349,33 +1232,6 @@ static PyObject* THCPModule_initExtension(PyObject* self, PyObject* noargs) { END_HANDLE_TH_ERRORS } -PyObject* THCPModule_getCurrentBlasHandle_wrap( - PyObject* self, - PyObject* noargs) { - HANDLE_TH_ERRORS - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - hipblasHandle_t handle = at::zoom::getCurrentHIPBlasHandle(); - return PyLong_FromVoidPtr(handle); - END_HANDLE_TH_ERRORS -} - - -// PyObject* THCPModule_rocm_is_backward_pass( -// PyObject* _unused, -// PyObject* noargs) { -// HANDLE_TH_ERRORS -// #if USE_ROCM -// if (at::ROCmBackwardPassGuard::is_backward_pass()) { -// Py_RETURN_TRUE; -// } else { -// Py_RETURN_FALSE; -// } -// #else -// Py_RETURN_FALSE; -// #endif -// END_HANDLE_TH_ERRORS -// } - static PyObject* THCPModule_isCurrentStreamCapturing_wrap( PyObject* self, PyObject* noargs) { @@ -1422,10 +1278,6 @@ static struct PyMethodDef _THCPModule_methods[] = { THCPModule_getDefaultStream_wrap, METH_O, nullptr}, - {"_zoom_getCurrentBlasHandle", - THCPModule_getCurrentBlasHandle_wrap, - METH_NOARGS, - nullptr}, {"_zoom_isCurrentStreamCapturing", THCPModule_isCurrentStreamCapturing_wrap, METH_NOARGS, @@ -1491,14 +1343,6 @@ static struct PyMethodDef _THCPModule_methods[] = { THCPModule_zoomGetSyncDebugMode, METH_NOARGS, nullptr}, - // {"_zoom_jiterator_compile_and_launch_kernel", - // THCPModule_zoomJiteratorCompileAndLaunchKernel, - // METH_VARARGS, - // nullptr}, - // {"_rocm_is_backward_pass", - // THCPModule_rocm_is_backward_pass, - // METH_NOARGS, - // nullptr}, {nullptr}}; PyMethodDef* THCPModule_methods() { @@ -1519,13 +1363,7 @@ void initHiprtBindings(PyObject* module); void initModule(PyObject* module) { // python::initCommMethods(module); -// // As weird as it seems, this file is also compiled for ROCm, -// // so this condition might not always be true... 
shared::initHiprtBindings(module); -// shared::initNvtxBindings(module); -// #if defined(USE_CUDNN) || defined(USE_ROCM) -// shared::initCudnnBindings(module); -// #endif registerZoomDeviceProperties(module); registerZoomPluggableAllocator(module); } diff --git a/torchgen/dest/__init__.py b/torchgen/dest/__init__.py index 0c684fc1915cb9..2c304b3188c407 100644 --- a/torchgen/dest/__init__.py +++ b/torchgen/dest/__init__.py @@ -16,4 +16,5 @@ compute_ufunc_cpu as compute_ufunc_cpu, compute_ufunc_cpu_kernel as compute_ufunc_cpu_kernel, compute_ufunc_cuda as compute_ufunc_cuda, + compute_ufunc_zoom as compute_ufunc_zoom, ) diff --git a/torchgen/dest/register_dispatch_key.py b/torchgen/dest/register_dispatch_key.py index fced019cc4e308..ac1f4c60d74429 100644 --- a/torchgen/dest/register_dispatch_key.py +++ b/torchgen/dest/register_dispatch_key.py @@ -30,6 +30,7 @@ DispatchKey, gets_generated_out_inplace_wrapper, is_cuda_dispatch_key, + is_zoom_dispatch_key, NativeFunction, NativeFunctionsGroup, SchemaKind, @@ -56,6 +57,8 @@ def gen_registration_headers( headers.append("#include ") else: headers.append("#include ") + elif backend_index.dispatch_key == DispatchKey.PrivateUse1: #TODO(Arham): remove once we have a zoom key + headers.append("#include ") elif backend_index.dispatch_key == DispatchKey.MPS: headers.append("#include ") elif per_operator_headers: @@ -81,9 +84,12 @@ def gen_empty_impl_names( DispatchKey.Meta, DispatchKey.CPU, DispatchKey.CUDA, + DispatchKey.PrivateUse1, # TODO (Arham) change keys DispatchKey.MPS, ): dispatch = str(backend_index.dispatch_key).lower() + if backend_index.dispatch_key == DispatchKey.PrivateUse1: + dispatch = "zoom" empty_impl = f"at::detail::empty_{dispatch}" empty_strided_impl = f"at::detail::empty_strided_{dispatch}" elif backend_index.dispatch_key in ( @@ -506,6 +512,10 @@ def generate_defn(cpp_sig: CppSignature) -> str: device_guard = ( f"globalContext().lazyInitCUDA();\n{device_guard}" ) + if is_zoom_dispatch_key(self.backend_index.dispatch_key): + device_guard = ( + f"globalContext().lazyInitPrivateUse1();\n{device_guard}" + ) else: # kernel is operating on existing tensors @@ -600,6 +610,7 @@ def gen_set_output_function(name: str, maybe_create_proxy: bool) -> str: def gen_class_set_output_body(self, k: SchemaKind, maybe_create_proxy: bool) -> str: if self.backend_index.dispatch_key in [ DispatchKey.CUDA, + DispatchKey.PrivateUse1, # TODO (Arham): change keys DispatchKey.MPS, DispatchKey.CompositeExplicitAutogradNonFunctional, ]: @@ -631,6 +642,7 @@ def gen_class_set_output_body(self, k: SchemaKind, maybe_create_proxy: bool) -> DispatchKey.Meta, DispatchKey.CPU, DispatchKey.CUDA, + DispatchKey.PrivateUse1, # TODO (Arham): change keys DispatchKey.MPS, DispatchKey.CompositeExplicitAutogradNonFunctional, ) @@ -699,6 +711,9 @@ def gen_class( guard_field = "c10::hip::OptionalHIPGuardMasqueradingAsCUDA guard_;" else: guard_field = "c10::cuda::OptionalCUDAGuard guard_;" + # TODO (Arham): change keys + elif self.backend_index.dispatch_key == DispatchKey.PrivateUse1: + guard_field = "c10::OptionalDeviceGuard guard_;" elif ( self.backend_index.dispatch_key == DispatchKey.CompositeExplicitAutogradNonFunctional diff --git a/torchgen/dest/ufunc.py b/torchgen/dest/ufunc.py index ffc879afb6cdba..999f7489a8ff66 100644 --- a/torchgen/dest/ufunc.py +++ b/torchgen/dest/ufunc.py @@ -321,6 +321,39 @@ def compute_ufunc_cuda(g: NativeFunctionsGroup) -> str: }} """ +@with_native_function +def compute_ufunc_zoom(g: NativeFunctionsGroup) -> str: + # First, build the functors, 
indexing them by dtype + ufunctor_sigs, ufunctors = compute_ufunc_cuda_functors(g) + # Next, build the conditionals + sig = StructuredImplSignature(g, ufunc.kernel_name(g, DispatchKey.PrivateUse1)) + dtype_cases = [] + for dtype, inner_ufunc_sigs in ufunctor_sigs.items(): + dtype_cases.append( + f""" +AT_DISPATCH_CASE(at::ScalarType::{dtype}, + [&]() {{ + {compute_ufunc_cuda_dtype_body(g, dtype, inner_ufunc_sigs, sig.arguments())} + }} +) +""" + ) + dtype_cases_str = "\n".join(dtype_cases) + stub_sig = StubSignature(g) + return f""" +{ufunctors} +{stub_sig.type_defn()}; +{stub_sig.dispatch_decl()}; +{stub_sig.kernel_defn()} {{ + AT_DISPATCH_SWITCH(iter.common_dtype(), "{sig.name}", + {dtype_cases_str} + ); +}} +REGISTER_DISPATCH({stub_sig.name}, &{stub_sig.kernel_name}); +{sig.defn()} {{ + {stub_sig.direct_call(sig.arguments())}; +}} +""" # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # diff --git a/torchgen/gen.py b/torchgen/gen.py index d715361146ea0e..057e12111f2ebe 100644 --- a/torchgen/gen.py +++ b/torchgen/gen.py @@ -69,6 +69,7 @@ DispatchKey, FRAGMENT_NAMESPACES, FunctionSchema, + is_zoom_dispatch_key, is_cuda_dispatch_key, is_generic_dispatch_key, is_ufunc_dispatch_key, @@ -194,7 +195,7 @@ def parse_native_yaml_struct( use_out_as_primary=True, external=False, # Only cuda-like devices in tree require device guards - device_guard=is_cuda_dispatch_key(k), + device_guard=is_cuda_dispatch_key(k) or is_zoom_dispatch_key(k), index=v, ) return ParsedYaml(rs, indices) @@ -1729,6 +1730,7 @@ def gen_aggregated_headers( selector: SelectiveBuilder, backend_indices: Dict[DispatchKey, BackendIndex], cpu_fm: FileManager, + zoom_fm: FileManager, cuda_fm: FileManager, functions_keys: Set[DispatchKey], dispatch_keys: Sequence[DispatchKey], @@ -1810,6 +1812,7 @@ def gen_aggregated_headers( for dispatch_key in dispatch_keys: fm = cuda_fm if is_cuda_dispatch_key(dispatch_key) else cpu_fm + fm = zoom_fm if is_zoom_dispatch_key(dispatch_key) else fm if dispatch_key in functions_keys: inl_headers = f"#include " @@ -1849,6 +1852,7 @@ def gen_per_operator_headers( selector: SelectiveBuilder, backend_indices: Dict[DispatchKey, BackendIndex], cpu_fm: FileManager, + zoom_fm: FileManager, cuda_fm: FileManager, ops_fm: FileManager, functions_keys: Set[DispatchKey], @@ -1998,6 +2002,7 @@ def gen_per_operator_headers( ) fm = cuda_fm if is_cuda_dispatch_key(dispatch_key) else cpu_fm + fm = zoom_fm if is_zoom_dispatch_key(dispatch_key) else fm inl_headers = f"#include " fm.write_with_template( @@ -2046,6 +2051,7 @@ def gen_headers( backend_indices: Dict[DispatchKey, BackendIndex], core_fm: FileManager, cpu_fm: FileManager, + zoom_fm: FileManager, cuda_fm: FileManager, ops_fm: FileManager, dispatch_keys: Sequence[DispatchKey], @@ -2061,6 +2067,7 @@ def gen_headers( selector=selector, backend_indices=backend_indices, cpu_fm=cpu_fm, + zoom_fm=zoom_fm, cuda_fm=cuda_fm, ops_fm=ops_fm, dispatch_keys=dispatch_keys, @@ -2076,6 +2083,7 @@ def gen_headers( selector=selector, backend_indices=backend_indices, cpu_fm=cpu_fm, + zoom_fm=zoom_fm, cuda_fm=cuda_fm, dispatch_keys=dispatch_keys, functions_keys=functions_keys, @@ -2186,6 +2194,7 @@ def gen_source_files( core_fm: FileManager, cpu_fm: FileManager, cpu_vec_fm: FileManager, + zoom_fm: FileManager, cuda_fm: FileManager, dispatch_keys: Sequence[DispatchKey], functions_keys: Set[DispatchKey], @@ -2209,6 +2218,13 @@ def gen_source_files( for dispatch_key in dispatch_keys: fm = cuda_fm if is_cuda_dispatch_key(dispatch_key) else cpu_fm + if 
is_zoom_dispatch_key(dispatch_key): + fm = zoom_fm + extra_cuda_headers = """\ + #include + #include + #include + #include """ if per_operator_headers: @@ -2296,7 +2312,7 @@ def operator_headers() -> List[str]: "RegisterDispatchKey.cpp", lambda: { "extra_cuda_headers": extra_cuda_headers - if is_cuda_dispatch_key(dispatch_key) + if is_cuda_dispatch_key(dispatch_key) or is_zoom_dispatch_key(dispatch_key) else "", "external_backend_headers": "", "dispatch_headers": dest.gen_registration_headers( @@ -2350,6 +2366,21 @@ def operator_headers() -> List[str]: "native_definitions": dest.compute_ufunc_cuda(g), }, ) + elif dispatch_key is DispatchKey.PrivateUse1: # TODO(Arham): change keys + zoom_headers = "#include " + fm.write_with_template( + f"UfuncZoom_{name}.cu", + "UfuncZoom.cu", + lambda: { + "name": name, + "zoom_headers": zoom_headers, + "meta_declaration": compute_meta_function_declaration(g), + "native_declaration": dest.compute_native_function_declaration( + g, backend_indices[dispatch_key] + ), + "native_definitions": dest.compute_ufunc_zoom(g), + }, + ) else: raise AssertionError(f"unrecognized {dispatch_key} for ufunc") @@ -2887,6 +2918,7 @@ def main() -> None: core_fm = make_file_manager(options=options, install_dir=core_install_dir) cpu_fm = make_file_manager(options=options) cpu_vec_fm = make_file_manager(options=options) + zoom_fm = make_file_manager(options=options) cuda_fm = make_file_manager(options=options) ops_fm = make_file_manager(options=options, install_dir=ops_install_dir) aoti_fm = make_file_manager(options=options, install_dir=aoti_install_dir) @@ -2896,6 +2928,7 @@ def main() -> None: functions_keys = { DispatchKey.CPU, DispatchKey.CUDA, + DispatchKey.PrivateUse1, # TODO(Arham): change keys DispatchKey.CompositeImplicitAutograd, DispatchKey.CompositeImplicitAutogradNestedTensor, DispatchKey.CompositeExplicitAutograd, @@ -2936,6 +2969,7 @@ def main() -> None: core_fm=core_fm, cpu_fm=cpu_fm, cpu_vec_fm=cpu_vec_fm, + zoom_fm=zoom_fm, cuda_fm=cuda_fm, dispatch_keys=dispatch_keys, functions_keys=functions_keys, @@ -2957,6 +2991,7 @@ def main() -> None: backend_indices=backend_indices, core_fm=core_fm, cpu_fm=cpu_fm, + zoom_fm=zoom_fm, cuda_fm=cuda_fm, ops_fm=ops_fm, dispatch_keys=dispatch_keys, @@ -2977,6 +3012,7 @@ def main() -> None: (cpu_fm, ""), (cpu_vec_fm, "cpu_vec_"), (core_fm, "core_"), + (zoom_fm, "zoom_"), (cuda_fm, "cuda_"), (ops_fm, "ops_"), ]: diff --git a/torchgen/model.py b/torchgen/model.py index 2706f234c56b0a..40b81a69f18f87 100644 --- a/torchgen/model.py +++ b/torchgen/model.py @@ -259,9 +259,9 @@ def codegen_per_backend_entries() -> str: f"Missing {fk}{bc} from DispatchKey enum. 
Here is the autogenerated list we expect to have:\n\n{r}" ) - -STRUCTURED_DISPATCH_KEYS = {DispatchKey.MPS, DispatchKey.CUDA, DispatchKey.CPU} -UFUNC_DISPATCH_KEYS = {DispatchKey.CUDA, DispatchKey.CPU} +# TODO(Arham): change keys +STRUCTURED_DISPATCH_KEYS = {DispatchKey.MPS, DispatchKey.CUDA, DispatchKey.CPU, DispatchKey.PrivateUse1} +UFUNC_DISPATCH_KEYS = {DispatchKey.CUDA, DispatchKey.CPU, DispatchKey.PrivateUse1} # Set of supported dispatch keys dispatch_keys = [ @@ -270,6 +270,7 @@ def codegen_per_backend_entries() -> str: DispatchKey.SparseCsrCPU, DispatchKey.MkldnnCPU, DispatchKey.CUDA, + DispatchKey.PrivateUse1, # TODO(Arham): replace with zoom key DispatchKey.MPS, DispatchKey.SparseCUDA, DispatchKey.SparseCsrCUDA, @@ -314,6 +315,11 @@ def is_cuda_dispatch_key(dk: DispatchKey) -> bool: DispatchKey.AutogradCUDA, } +def is_zoom_dispatch_key(dk: DispatchKey) -> bool: + return dk in { + DispatchKey.PrivateUse1 + } + # Structured kernel generation is only supported for certain key types; # otherwise use old-style diff --git a/ufunc_defs.bzl b/ufunc_defs.bzl index 4490f05be01519..f94b9e866765bb 100644 --- a/ufunc_defs.bzl +++ b/ufunc_defs.bzl @@ -23,3 +23,9 @@ def aten_ufunc_generated_cuda_sources(gencode_pattern = "{}"): "UfuncCUDA_{}.cu".format(n) for n in aten_ufunc_names ]] + +def aten_ufunc_generated_zoom_sources(gencode_pattern = "{}"): + return [gencode_pattern.format(name) for name in [ + "UfuncZoom_{}.cu".format(n) + for n in aten_ufunc_names + ]] \ No newline at end of file From 0b7cc75307b9149241a2a9c055dd1e479d69afe1 Mon Sep 17 00:00:00 2001 From: 123epsilon Date: Sat, 28 Dec 2024 00:25:57 +0000 Subject: [PATCH 04/23] add kernel deps for llama3 --- aten/src/ATen/native/SharedReduceOps.h | 15 +- aten/src/ATen/native/SoftMax.cpp | 8 +- aten/src/ATen/native/native_functions.yaml | 99 +- .../ATen/native/zoom/BinaryDivFloorKernel.cu | 83 + .../ATen/native/zoom/BinaryDivTrueKernel.cu | 61 + .../ATen/native/zoom/BinaryDivTruncKernel.cu | 53 + aten/src/ATen/native/zoom/BinaryInternal.h | 48 + aten/src/ATen/native/zoom/BinaryMulKernel.cu | 48 + aten/src/ATen/native/zoom/Bmm.cpp | 121 + aten/src/ATen/native/zoom/CompareKernels.cu | 103 + aten/src/ATen/native/zoom/Copy.cu | 29 +- aten/src/ATen/native/zoom/CumminmaxKernel.cu | 29 + aten/src/ATen/native/zoom/CumprodKernel.cu | 23 + aten/src/ATen/native/zoom/CumsumKernel.cu | 25 + aten/src/ATen/native/zoom/DeviceSqrt.cuh | 18 + aten/src/ATen/native/zoom/HIPbmm.cu | 126 + aten/src/ATen/native/zoom/Indexing.cu | 1798 ++++++++++ aten/src/ATen/native/zoom/KernelUtils.cuh | 97 + .../ATen/native/zoom/LegacyThrustHelpers.cu | 113 + .../ATen/native/zoom/LogcumsumexpKernel.cu | 124 + aten/src/ATen/native/zoom/Math.cuh | 3026 +++++++++++++++++ .../ATen/native/zoom/PersistentSoftmax.cuh | 402 +++ aten/src/ATen/native/zoom/Reduce.cuh | 1354 ++++++++ .../src/ATen/native/zoom/ReduceLogicKernel.cu | 38 + aten/src/ATen/native/zoom/ScanKernels.cpp | 115 + aten/src/ATen/native/zoom/ScanKernels.h | 18 + aten/src/ATen/native/zoom/ScanUtils.cuh | 459 +++ aten/src/ATen/native/zoom/Shape.cu | 521 +++ aten/src/ATen/native/zoom/SoftMax.cu | 1272 +++++++ aten/src/ATen/native/zoom/Sort.cpp | 128 + aten/src/ATen/native/zoom/Sort.cu | 384 +++ aten/src/ATen/native/zoom/Sort.h | 17 + aten/src/ATen/native/zoom/SortImpl.cu | 37 + aten/src/ATen/native/zoom/SortStable.cu | 286 ++ aten/src/ATen/native/zoom/SortStable.h | 19 + aten/src/ATen/native/zoom/SortUtils.cuh | 333 ++ aten/src/ATen/native/zoom/Sorting.cpp | 208 ++ aten/src/ATen/native/zoom/Sorting.cu | 282 ++ 
aten/src/ATen/native/zoom/Sorting.h | 18 + aten/src/ATen/native/zoom/SortingCommon.cuh | 188 + .../ATen/native/zoom/SortingRadixSelect.cuh | 410 +++ aten/src/ATen/native/zoom/TensorTopK.cpp | 96 + aten/src/ATen/native/zoom/TensorTopK.cu | 895 +++++ aten/src/ATen/native/zoom/TensorTopK.h | 14 + aten/src/ATen/native/zoom/TriangularOps.cu | 165 + .../native/zoom/UnaryGeometricCosKernel.cu | 58 + .../native/zoom/UnaryGeometricSinKernel.cu | 58 + aten/src/ATen/native/zoom/UnarySignKernels.cu | 121 + aten/src/ATen/native/zoom/block_reduce.cuh | 143 + torchgen/dest/ufunc.py | 2 +- 50 files changed, 14018 insertions(+), 70 deletions(-) create mode 100644 aten/src/ATen/native/zoom/BinaryDivFloorKernel.cu create mode 100644 aten/src/ATen/native/zoom/BinaryDivTrueKernel.cu create mode 100644 aten/src/ATen/native/zoom/BinaryDivTruncKernel.cu create mode 100644 aten/src/ATen/native/zoom/BinaryInternal.h create mode 100644 aten/src/ATen/native/zoom/BinaryMulKernel.cu create mode 100644 aten/src/ATen/native/zoom/Bmm.cpp create mode 100644 aten/src/ATen/native/zoom/CompareKernels.cu create mode 100644 aten/src/ATen/native/zoom/CumminmaxKernel.cu create mode 100644 aten/src/ATen/native/zoom/CumprodKernel.cu create mode 100644 aten/src/ATen/native/zoom/CumsumKernel.cu create mode 100644 aten/src/ATen/native/zoom/DeviceSqrt.cuh create mode 100644 aten/src/ATen/native/zoom/HIPbmm.cu create mode 100644 aten/src/ATen/native/zoom/Indexing.cu create mode 100644 aten/src/ATen/native/zoom/KernelUtils.cuh create mode 100644 aten/src/ATen/native/zoom/LegacyThrustHelpers.cu create mode 100644 aten/src/ATen/native/zoom/LogcumsumexpKernel.cu create mode 100644 aten/src/ATen/native/zoom/Math.cuh create mode 100644 aten/src/ATen/native/zoom/PersistentSoftmax.cuh create mode 100644 aten/src/ATen/native/zoom/Reduce.cuh create mode 100644 aten/src/ATen/native/zoom/ReduceLogicKernel.cu create mode 100644 aten/src/ATen/native/zoom/ScanKernels.cpp create mode 100644 aten/src/ATen/native/zoom/ScanKernels.h create mode 100644 aten/src/ATen/native/zoom/ScanUtils.cuh create mode 100644 aten/src/ATen/native/zoom/Shape.cu create mode 100644 aten/src/ATen/native/zoom/SoftMax.cu create mode 100644 aten/src/ATen/native/zoom/Sort.cpp create mode 100644 aten/src/ATen/native/zoom/Sort.cu create mode 100644 aten/src/ATen/native/zoom/Sort.h create mode 100644 aten/src/ATen/native/zoom/SortImpl.cu create mode 100644 aten/src/ATen/native/zoom/SortStable.cu create mode 100644 aten/src/ATen/native/zoom/SortStable.h create mode 100644 aten/src/ATen/native/zoom/SortUtils.cuh create mode 100644 aten/src/ATen/native/zoom/Sorting.cpp create mode 100644 aten/src/ATen/native/zoom/Sorting.cu create mode 100644 aten/src/ATen/native/zoom/Sorting.h create mode 100644 aten/src/ATen/native/zoom/SortingCommon.cuh create mode 100644 aten/src/ATen/native/zoom/SortingRadixSelect.cuh create mode 100644 aten/src/ATen/native/zoom/TensorTopK.cpp create mode 100644 aten/src/ATen/native/zoom/TensorTopK.cu create mode 100644 aten/src/ATen/native/zoom/TensorTopK.h create mode 100644 aten/src/ATen/native/zoom/TriangularOps.cu create mode 100644 aten/src/ATen/native/zoom/UnaryGeometricCosKernel.cu create mode 100644 aten/src/ATen/native/zoom/UnaryGeometricSinKernel.cu create mode 100644 aten/src/ATen/native/zoom/UnarySignKernels.cu create mode 100644 aten/src/ATen/native/zoom/block_reduce.cuh diff --git a/aten/src/ATen/native/SharedReduceOps.h b/aten/src/ATen/native/SharedReduceOps.h index 5b7167ee93dd29..9cdf5df112d716 100644 --- a/aten/src/ATen/native/SharedReduceOps.h 
+++ b/aten/src/ATen/native/SharedReduceOps.h @@ -11,8 +11,13 @@ #include #include #elif defined(__HIPCC__) -#include -#include + #ifdef USE_ZOOM + #include + #include + #else + #include + #include + #endif #endif #if defined(__CUDACC__) || defined(__HIPCC__) #include @@ -56,7 +61,11 @@ inline C10_DEVICE scalar_t min_propagate_nan(scalar_t a, scalar_t b) { #include #define compat_pow c10::cuda::compat::pow #elif defined(__HIPCC__) -#include +#ifdef USE_ZOOM + #include + #else + #include + #endif #define compat_pow c10::hip::compat::pow #else #define compat_pow std::pow diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp index 3188479b931f3b..fd2e8e282ad1d2 100644 --- a/aten/src/ATen/native/SoftMax.cpp +++ b/aten/src/ATen/native/SoftMax.cpp @@ -452,7 +452,7 @@ static Tensor softmax(const Tensor& input_, const int64_t dim_) { Tensor softmax(const Tensor& input_, const int64_t dim_, std::optional dtype) { auto result = [&]() { NoNamesGuard guard; - if (input_.is_cuda() && input_.scalar_type() == ScalarType::Half && dtype == ScalarType::Float){ + if ((input_.is_cuda() || input_.is_privateuseone()) && input_.scalar_type() == ScalarType::Half && dtype == ScalarType::Float){ return at::_softmax(input_, dim_, true); } else { Tensor converted = dtype.has_value() ? input_.toType(dtype.value()) : input_; @@ -469,7 +469,7 @@ Tensor& softmax_out( std::optional dtype, Tensor& output_) { Tensor output_temp; - if (input_.is_cuda() && input_.scalar_type() == ScalarType::Half && + if ((input_.is_cuda() || input_.is_privateuseone()) && input_.scalar_type() == ScalarType::Half && dtype == ScalarType::Float) { if (!output_.is_contiguous()) { auto options = @@ -517,7 +517,7 @@ static Tensor log_softmax(const Tensor& input_, const int64_t dim_) { Tensor log_softmax(const Tensor& input_, const int64_t dim_, std::optional dtype) { auto result = [&]() { NoNamesGuard guard; - if (input_.is_cuda() && input_.scalar_type() == ScalarType::Half && dtype == ScalarType::Float){ + if ((input_.is_cuda() || input_.is_privateuseone()) && input_.scalar_type() == ScalarType::Half && dtype == ScalarType::Float){ return at::_log_softmax(input_, dim_, true); } else { Tensor converted = dtype.has_value()? input_.toType(dtype.value()) : input_; @@ -534,7 +534,7 @@ Tensor& log_softmax_out( std::optional dtype, Tensor& output_) { Tensor output_temp; - if (input_.is_cuda() && input_.scalar_type() == ScalarType::Half && + if (((input_.is_cuda() || input_.is_privateuseone())) && input_.scalar_type() == ScalarType::Half && dtype == ScalarType::Float) { if (!output_.is_contiguous()) { auto options = diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index b28fcfbfc2732e..a5876201f7e9c6 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -442,7 +442,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: sgn_out + CPU, CUDA, PrivateUse1: sgn_out MPS: sgn_out_mps SparseCPU, SparseCUDA: sgn_sparse_out SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr_out @@ -707,14 +707,14 @@ device_check: NoCheck # TensorIterator structured: True dispatch: - CPU, CUDA: all_out + CPU, CUDA, PrivateUse1: all_out MPS: all_out_mps - func: all.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) 
device_check: NoCheck # TensorIterator structured: True dispatch: - CPU, CUDA: all_dims_out + CPU, CUDA, PrivateUse1: all_dims_out CompositeExplicitAutograd: all_dims_out_default cpp_no_default_args: ['dim'] @@ -750,14 +750,14 @@ device_check: NoCheck # TensorIterator structured: True dispatch: - CPU, CUDA: any_out + CPU, CUDA, PrivateUse1: any_out MPS: any_out_mps - func: any.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator structured: True dispatch: - CPU, CUDA: any_dims_out + CPU, CUDA, PrivateUse1: any_dims_out CompositeExplicitAutograd: any_dims_out_default cpp_no_default_args: ['dim'] @@ -1259,7 +1259,7 @@ - func: logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: logical_not_out + CPU, CUDA, PrivateUse1: logical_not_out MPS: logical_not_out_mps tags: pointwise @@ -1352,6 +1352,7 @@ dispatch: CPU: bmm_out_cpu CUDA: bmm_out_cuda + PrivateUse1: bmm_out_zoom MPS: bmm_out_mps SparseCPU: bmm_out_sparse_cpu SparseCUDA: bmm_out_sparse_cuda @@ -1386,6 +1387,7 @@ dispatch: CPU: cat_out_cpu CUDA: cat_out_cuda + PrivateUse1: cat_out_zoom MPS: cat_out_mps QuantizedCPU: cat_out_quantized_cpu @@ -1797,7 +1799,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: cos_out + CPU, CUDA, PrivateUse1: cos_out MPS: cos_out_mps tags: pointwise @@ -1933,6 +1935,7 @@ dispatch: CPU: cummax_helper_cpu CUDA: cummax_helper_cuda + PrivateUse1: cummax_helper_zoom - func: cummin(Tensor self, int dim) -> (Tensor values, Tensor indices) device_check: NoCheck # TensorIterator @@ -1957,6 +1960,7 @@ dispatch: CPU: cummin_helper_cpu CUDA: cummin_helper_cuda + PrivateUse1: cummin_helper_zoom - func: cummaxmin_backward(Tensor grad, Tensor input, Tensor indices, int dim) -> Tensor variants: function @@ -1976,7 +1980,7 @@ structured: True device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: cumprod_out + CPU, CUDA, PrivateUse1: cumprod_out MPS: cumprod_out_mps - func: cumprod.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor @@ -2008,7 +2012,7 @@ structured: True device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: cumsum_out + CPU, CUDA, PrivateUse1: cumsum_out MPS: cumsum_out_mps - func: cumsum.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor @@ -2137,7 +2141,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: div_out + CPU, CUDA, PrivateUse1: div_out MPS: div_out_mps SparseCPU, SparseCUDA: div_out_sparse_zerodim tags: pointwise @@ -2163,7 +2167,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: div_out_mode + CPU, CUDA, PrivateUse1: div_out_mode MPS: div_out_mode_mps SparseCPU, SparseCUDA: div_out_sparse_zerodim tags: pointwise @@ -2736,7 +2740,7 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: - CPU, CUDA: floor_divide + CPU, CUDA, PrivateUse1: floor_divide MPS: floor_divide_mps SparseCPU, SparseCUDA: floor_divide_sparse @@ -2744,14 +2748,14 @@ device_check: NoCheck # TensorIterator variants: method dispatch: - CPU, CUDA: floor_divide_ + CPU, CUDA, PrivateUse1: floor_divide_ MPS: floor_divide_mps_ SparseCPU, SparseCUDA: floor_divide_sparse_ - func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: floor_divide_out + CPU, CUDA, PrivateUse1: floor_divide_out MPS: floor_divide_out_mps SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim @@ -3132,7 +3136,7 @@ variants: function structured: True dispatch: - CPU, CUDA: isin_Tensor_Tensor_out + CPU, CUDA, PrivateUse1: isin_Tensor_Tensor_out MPS: isin_Tensor_Tensor_out_mps - func: isin.Tensor_Tensor(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor @@ -3143,7 +3147,7 @@ variants: function structured: True dispatch: - CPU, CUDA: isin_Tensor_Scalar_out + CPU, CUDA, PrivateUse1: isin_Tensor_Scalar_out - func: isin.Tensor_Scalar(Tensor elements, Scalar test_element, *, bool assume_unique=False, bool invert=False) -> Tensor variants: function @@ -3153,7 +3157,7 @@ variants: function structured: True dispatch: - CPU, CUDA: isin_Scalar_Tensor_out + CPU, CUDA, PrivateUse1: isin_Scalar_Tensor_out - func: isin.Scalar_Tensor(Scalar element, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor variants: function @@ -3246,6 +3250,7 @@ dispatch: CPU: kthvalue_out_cpu CUDA: kthvalue_out_cuda + PrivateUse1: kthvalue_out_zoom - func: kthvalue.dimname(Tensor self, int k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method @@ -3680,6 +3685,7 @@ dispatch: CPU: log_softmax_cpu_out CUDA: log_softmax_cuda_out + PrivateUse1: log_softmax_zoom_out MPS: log_softmax_mps_out - func: _log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor @@ -3690,17 +3696,20 @@ dispatch: CPU: log_softmax_backward_cpu_out CUDA: log_softmax_backward_cuda_out + PrivateUse1: log_softmax_backward_zoom_out MPS: log_softmax_backward_mps_out - func: _logcumsumexp(Tensor self, int dim) -> Tensor dispatch: CPU: _logcumsumexp_cpu CUDA: _logcumsumexp_cuda + PrivateUse1: _logcumsumexp_zoom - func: _logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: CPU: _logcumsumexp_out_cpu CUDA: _logcumsumexp_out_cuda + PrivateUse1: _logcumsumexp_out_zoom - func: logcumsumexp(Tensor self, int dim) -> Tensor variants: function, method @@ -3945,6 +3954,7 @@ dispatch: CPU: median_cpu CUDA: median_cuda + PrivateUse1: median_zoom MPS: median_mps autogen: median.out @@ -3957,6 +3967,7 @@ dispatch: CPU: median_out_cpu CUDA: median_out_cuda + PrivateUse1: median_out_zoom MPS: median_out_mps - func: median.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) @@ -3969,6 +3980,7 @@ dispatch: CPU: nanmedian_cpu CUDA: nanmedian_cuda + PrivateUse1: nanmedian_zoom autogen: nanmedian.out - func: nanmedian.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) @@ -3980,6 +3992,7 @@ dispatch: CPU: nanmedian_out_cpu CUDA: nanmedian_out_cuda + PrivateUse1: nanmedian_out_zoom - func: nanmedian.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method @@ -4108,6 +4121,7 @@ dispatch: CPU: mm_out_cpu CUDA: mm_out_cuda + PrivateUse1: mm_out_zoom MPS: mm_out_mps SparseCPU, SparseCUDA: _sparse_mm_out SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm_out @@ -4192,7 +4206,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: mul_out + CPU, CUDA, PrivateUse1: mul_out MPS: mul_out_mps SparseCPU: mul_out_sparse_cpu SparseCUDA: mul_out_sparse_cuda @@ -4835,7 +4849,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: neg_out + CPU, CUDA, PrivateUse1: neg_out MPS: neg_out_mps SparseCPU, SparseCUDA: neg_out_sparse SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr_out @@ -4898,7 +4912,7 @@ device_check: NoCheck device_guard: False dispatch: - CPU, CUDA, Meta, QuantizedCPU, QuantizedCUDA, ZeroTensor, MPS: _reshape_alias + CPU, CUDA, PrivateUse1, Meta, QuantizedCPU, QuantizedCUDA, ZeroTensor, MPS: _reshape_alias # We don't need to support mkldnn since this is handled explicitly by the reshape operator. 
- func: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor @@ -5301,7 +5315,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: sin_out + CPU, CUDA, PrivateUse1: sin_out MPS: sin_out_mps SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr_out SparseCPU, SparseCUDA: sin_sparse_out @@ -5506,6 +5520,7 @@ dispatch: CPU: softmax_cpu_out CUDA: softmax_cuda_out + PrivateUse1: softmax_zoom_out MPS: softmax_mps_out - func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor @@ -5518,6 +5533,7 @@ dispatch: CPU: softmax_backward_cpu_out CUDA: softmax_backward_cuda_out + PrivateUse1: softmax_backward_zoom_out MPS: softmax_backward_mps_out - func: unsafe_split.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[] @@ -6817,7 +6833,7 @@ device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: zero_ + CPU, CUDA, PrivateUse1: zero_ MPS: zero_mps_ Meta: zero_meta_ SparseCPU, SparseCUDA, SparseMeta: zero_sparse_ @@ -6831,7 +6847,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: sub_out + CPU, CUDA, PrivateUse1: sub_out MPS: sub_out_mps SparseCPU, SparseCUDA: sub_out_sparse tags: pointwise @@ -7943,6 +7959,7 @@ dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda + PrivateUse1: masked_fill__zoom QuantizedCPU: masked_fill__quantized_cpu QuantizedCUDA: masked_fill__quantized_cuda MPS: masked_fill__mps @@ -7962,6 +7979,7 @@ dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda + PrivateUse1: masked_fill__zoom QuantizedCPU: masked_fill__quantized_cpu QuantizedCUDA: masked_fill__quantized_cuda MPS: masked_fill__mps @@ -7993,12 +8011,14 @@ - func: _masked_softmax(Tensor self, Tensor mask, int? dim=None, int? mask_type=None) -> Tensor dispatch: CUDA: masked_softmax_cuda + PrivateUse1: masked_softmax_zoom CPU: masked_softmax_cpu autogen: _masked_softmax.out - func: _masked_softmax_backward(Tensor grad_output, Tensor output, Tensor mask, int? dim=None) -> Tensor dispatch: CUDA: masked_softmax_backward_cuda + PrivateUse1: masked_softmax_backward_zoom CPU: masked_softmax_backward_cpu autogen: _masked_softmax_backward.out @@ -8044,6 +8064,7 @@ dispatch: CPU: index_add_cpu_out CUDA: index_add_cuda_out + PrivateUse1: index_add_zoom_out MPS: index_add_mps_out - func: index_add_(Tensor(a!) self, int dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor(a!) @@ -8065,6 +8086,7 @@ dispatch: CPU: index_reduce_cpu_out CUDA: index_reduce_cuda_out + PrivateUse1: index_reduce_zoom_out - func: index_reduce_(Tensor(a!) self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor(a!) 
structured_delegate: index_reduce.out @@ -8725,6 +8747,7 @@ dispatch: CPU: triu_cpu CUDA: triu_cuda + PrivateUse1: triu_zoom MPS: triu_mps_out - func: triu(Tensor self, int diagonal=0) -> Tensor @@ -8736,6 +8759,7 @@ dispatch: CPU: tril_cpu CUDA: tril_cuda + PrivateUse1: tril_zoom MPS: tril_mps_out - func: tril(Tensor self, int diagonal=0) -> Tensor @@ -8759,6 +8783,7 @@ dispatch: CPU: trace_cpu CUDA: trace_cuda + PrivateUse1: trace_zoom MPS: trace_mps autogen: trace.out @@ -8874,7 +8899,7 @@ structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: ge_Scalar_out + CPU, CUDA, PrivateUse1: ge_Scalar_out MPS: ge_scalar_out_mps QuantizedCPU: ge_out_quantized_cpu tags: pointwise @@ -8893,7 +8918,7 @@ structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: ge_Tensor_out + CPU, CUDA, PrivateUse1: ge_Tensor_out MPS: ge_tensor_out_mps QuantizedCPU: ge_out_quantized_cpu tags: pointwise @@ -8938,7 +8963,7 @@ structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: le_Scalar_out + CPU, CUDA, PrivateUse1: le_Scalar_out MPS: le_scalar_out_mps QuantizedCPU: le_out_quantized_cpu tags: pointwise @@ -8956,7 +8981,7 @@ structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: le_Tensor_out + CPU, CUDA, PrivateUse1: le_Tensor_out MPS: le_tensor_out_mps QuantizedCPU: le_out_quantized_cpu tags: pointwise @@ -9001,7 +9026,7 @@ structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: gt_Scalar_out + CPU, CUDA, PrivateUse1: gt_Scalar_out MPS: gt_scalar_out_mps QuantizedCPU: gt_out_quantized_cpu tags: pointwise @@ -9020,7 +9045,7 @@ structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: gt_Tensor_out + CPU, CUDA, PrivateUse1: gt_Tensor_out MPS: gt_tensor_out_mps QuantizedCPU: gt_out_quantized_cpu tags: pointwise @@ -9065,7 +9090,7 @@ structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: lt_Scalar_out + CPU, CUDA, PrivateUse1: lt_Scalar_out MPS: lt_scalar_out_mps QuantizedCPU: lt_out_quantized_cpu tags: pointwise @@ -9083,7 +9108,7 @@ structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: lt_Tensor_out + CPU, CUDA, PrivateUse1: lt_Tensor_out MPS: lt_tensor_out_mps QuantizedCPU: lt_out_quantized_cpu tags: pointwise @@ -9141,6 +9166,7 @@ dispatch: CPU, QuantizedCPU: index_select_out_cpu_ CUDA, QuantizedCUDA: index_select_out_cuda + PrivateUse1: index_select_out_zoom MPS: index_select_out_mps - func: index_select(Tensor self, int dim, Tensor index) -> Tensor @@ -9150,6 +9176,7 @@ QuantizedCPU: index_select_quantized_cpu_ CUDA: index_select_cuda QuantizedCUDA: index_select_quantized_cuda + PrivateUse1: index_select_zoom SparseCPU: index_select_sparse_cpu SparseCUDA: index_select_sparse_cuda MPS: index_select_mps @@ -9574,7 +9601,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: sign_out + CPU, CUDA, PrivateUse1: sign_out MPS: sign_out_mps SparseCPU, SparseCUDA: sign_sparse_out SparseCsrCPU, SparseCsrCUDA: sign_sparse_csr_out @@ -9594,6 +9621,7 @@ dispatch: CPU: signbit_out CUDA: signbit_out + PrivateUse1: signbit_out MPS: signbit_out_mps SparseCPU, SparseCUDA: signbit_sparse_out SparseCsrCPU, SparseCsrCUDA: signbit_sparse_csr_out @@ -10009,7 +10037,7 @@ - func: sort.values_stable(Tensor self, *, bool? 
stable, int dim=-1, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) structured: True dispatch: - CPU, CUDA: sort_stable_out + CPU, CUDA, PrivateUse1: sort_stable_out MPS: sort_stable_out_mps - func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) @@ -10059,6 +10087,7 @@ dispatch: CPU: topk_out_cpu CUDA: topk_out_cuda + PrivateUse1: topk_out_zoom MPS: topk_out_mps - func: topk(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices) @@ -10077,7 +10106,7 @@ device_check: NoCheck structured: True dispatch: - CPU, CUDA: all_all_out + CPU, CUDA, PrivateUse1: all_all_out MPS: all_all_out_mps - func: any(Tensor self) -> Tensor @@ -10092,7 +10121,7 @@ device_check: NoCheck structured: True dispatch: - CPU, CUDA: any_all_out + CPU, CUDA, PrivateUse1: any_all_out MPS: any_all_out_mps - func: renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!) diff --git a/aten/src/ATen/native/zoom/BinaryDivFloorKernel.cu b/aten/src/ATen/native/zoom/BinaryDivFloorKernel.cu new file mode 100644 index 00000000000000..7ad48ce8c7cb1e --- /dev/null +++ b/aten/src/ATen/native/zoom/BinaryDivFloorKernel.cu @@ -0,0 +1,83 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at::native { +namespace binary_internal { + +void div_floor_kernel_zoom(TensorIteratorBase& iter) { + // See NOTE: [Floor Division in Python] + const auto dtype = iter.common_dtype(); + if (dtype == kByte) { + // In the special case of unsigned integer division, floor division is + // equivalent to truncation division (since the signs of the divisor and + // dividend are always the same) + return div_trunc_kernel_zoom(iter); + } else if (isIntegralType(dtype, /*includeBool*/ false)) { + AT_DISPATCH_INTEGRAL_TYPES(dtype, "div_floor_zoom", [&]() { + gpu_kernel_with_scalars( + iter, [] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + return c10::div_floor_integer(a, b); + }); + }); + } else if (iter.is_cpu_scalar(2)) { + // optimization for floating-point types: if the second operand is a CPU + // scalar, compute a * reciprocal(b). Note that this may lose one bit of + // precision compared to computing the division. 
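+ // Worked example of the reciprocal floor-division path below: with a = -7 and a
+ // CPU-scalar b = 2, inv_b = 0.5 and fmod(-7, 2) = -1, so div = (-7 - (-1)) * 0.5 = -3;
+ // the remainder and b have opposite signs, so div is corrected to -4, which matches
+ // Python's floor division -7 // 2 == -4 (plain truncation would give -3).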
+ AT_DISPATCH_FLOATING_TYPES_AND2( + kHalf, kBFloat16, dtype, "div_floor_zoom", [&]() { + using accscalar_t = at::acc_type; + auto b = iter.scalar_value(2); + if (C10_UNLIKELY(b == 0)) { + return div_true_kernel_zoom(iter); + } + + auto inv_b = accscalar_t(1.0) / b; + iter.remove_operand(2); + gpu_kernel(iter, [b, inv_b] GPU_LAMBDA(scalar_t a) -> scalar_t { + auto mod = std::fmod(a, b); + auto div = (a - mod) * inv_b; + if ((mod != 0) && (b < 0) != (mod < 0)) { + div -= scalar_t(1); + } + + scalar_t floordiv; + if (div != 0) { + floordiv = std::floor(div); + if (div - floordiv > scalar_t(0.5)) { + floordiv += scalar_t(1.0); + } + } else { + floordiv = c10::hip::compat::copysign(scalar_t(0), a * inv_b); + } + return floordiv; + }); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2( + kHalf, kBFloat16, dtype, "div_floor_zoom", [&]() { + gpu_kernel_with_scalars( + iter, [] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + return c10::div_floor_floating(a, b); + }); + }); + } +} +} // namespace binary_internal + +REGISTER_PRIVATEUSE1_DISPATCH(div_floor_stub, &binary_internal::div_floor_kernel_zoom); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/BinaryDivTrueKernel.cu b/aten/src/ATen/native/zoom/BinaryDivTrueKernel.cu new file mode 100644 index 00000000000000..09b92154633f61 --- /dev/null +++ b/aten/src/ATen/native/zoom/BinaryDivTrueKernel.cu @@ -0,0 +1,61 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at::native { +namespace binary_internal { + +CONSTEXPR_EXCEPT_WIN_CUDA char div_name[] = "div_kernel"; +void div_true_kernel_zoom(TensorIteratorBase& iter) { + auto common_dtype = iter.common_dtype(); + if (iter.common_dtype() == kComplexHalf) { + using scalar_t = c10::complex; +#if AT_USE_JITERATOR() + static const auto div_string = jiterator_stringify( + template T div_kernel(T a, T b) { return a / b; }); + opmath_jitted_gpu_kernel_with_scalars( + iter, div_string); +#else + using opmath_t = at::opmath_type; + opmath_gpu_kernel_with_scalars(iter, DivFunctor()); +#endif + return; + } + if (iter.is_cpu_scalar(2)) { + // optimization for floating-point types: if the second operand is a CPU + // scalar, compute a * reciprocal(b). Note that this may lose one bit of + // precision compared to computing the division. 
+ AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + kHalf, kBFloat16, common_dtype, "div_true_zoom", [&]() { + using opmath_t = at::opmath_type; + auto inv_b = opmath_t(1.0) / iter.scalar_value(2); + iter.remove_operand(2); + gpu_kernel( + iter, + BUnaryFunctor>( + MulFunctor(), inv_b)); + }); + } else { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + kHalf, kBFloat16, common_dtype, "div_true_zoom", [&]() { + DivFunctor f; + gpu_kernel_with_scalars(iter, f); + }); + } +} +} // namespace binary_internal + +REGISTER_PRIVATEUSE1_DISPATCH(div_true_stub, &binary_internal::div_true_kernel_zoom); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/BinaryDivTruncKernel.cu b/aten/src/ATen/native/zoom/BinaryDivTruncKernel.cu new file mode 100644 index 00000000000000..bc1f9a851ae327 --- /dev/null +++ b/aten/src/ATen/native/zoom/BinaryDivTruncKernel.cu @@ -0,0 +1,53 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at::native { +namespace binary_internal { + +void div_trunc_kernel_zoom(TensorIteratorBase& iter) { + auto dtype = iter.common_dtype(); + if (isIntegralType(dtype, /*includeBool*/ false)) { + AT_DISPATCH_INTEGRAL_TYPES(dtype, "div_trunc_zoom", [&]() { + gpu_kernel_with_scalars( + iter, + [] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { return a / b; }); + }); + } else if (iter.is_cpu_scalar(2)) { + // optimization for floating-point types: if the second operand is a CPU + // scalar, compute a * reciprocal(b). Note that this may lose one bit of + // precision compared to computing the division. + AT_DISPATCH_FLOATING_TYPES_AND2( + kHalf, kBFloat16, dtype, "div_trunc_zoom", [&]() { + using accscalar_t = at::acc_type; + auto inv_b = accscalar_t(1.0) / iter.scalar_value(2); + iter.remove_operand(2); + gpu_kernel(iter, [inv_b] GPU_LAMBDA(scalar_t a) -> scalar_t { + return std::trunc(a * inv_b); + }); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2( + kHalf, kBFloat16, dtype, "div_trunc_zoom", [&]() { + gpu_kernel_with_scalars( + iter, [] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + return std::trunc(a / b); + }); + }); + } +} +} // namespace binary_internal + +REGISTER_PRIVATEUSE1_DISPATCH(div_trunc_stub, &binary_internal::div_trunc_kernel_zoom); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/BinaryInternal.h b/aten/src/ATen/native/zoom/BinaryInternal.h new file mode 100644 index 00000000000000..a42408c5207fa1 --- /dev/null +++ b/aten/src/ATen/native/zoom/BinaryInternal.h @@ -0,0 +1,48 @@ +// DON'T include this except from Binary*.cu files. It should not leak into +// headers. 
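+// This header declares the DivFunctor/MulFunctor elementwise functors (including the
+// bool specialization of MulFunctor, which uses && instead of *) and the
+// div_true_kernel_zoom / div_trunc_kernel_zoom entry points shared by the
+// BinaryDiv*Kernel.cu and BinaryMulKernel.cu translation units.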
+#pragma once +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at { +namespace native { +namespace binary_internal { + +template +struct DivFunctor { + __device__ scalar_t operator()(scalar_t a, scalar_t b) const { + return a / b; + } +}; + +template +struct MulFunctor { + __device__ T operator()(T a, T b) const { + return a * b; + } +}; + +// Workaround for the error: '*' in boolean context, suggest '&&' instead +// [-Werror=int-in-bool-context] +template <> +struct MulFunctor { + __device__ bool operator()(bool a, bool b) const { + return a && b; + } +}; +void div_true_kernel_zoom(TensorIteratorBase& iter); +void div_trunc_kernel_zoom(TensorIteratorBase& iter); +} // namespace binary_internal +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/zoom/BinaryMulKernel.cu b/aten/src/ATen/native/zoom/BinaryMulKernel.cu new file mode 100644 index 00000000000000..dd42ba4d24880d --- /dev/null +++ b/aten/src/ATen/native/zoom/BinaryMulKernel.cu @@ -0,0 +1,48 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +// NOTE: CUDA on Windows requires that the enclosing function +// of a __device__ lambda not have internal linkage. + +namespace at::native { + +CONSTEXPR_EXCEPT_WIN_CUDA char mul_name[] = "mul_kernel"; +void mul_kernel_zoom(TensorIteratorBase& iter) { + auto common_dtype = iter.common_dtype(); + if (common_dtype == kComplexHalf) { + using scalar_t = c10::complex; +#if AT_USE_JITERATOR() + static const auto mul_string = jiterator_stringify( + template T mul_kernel(T a, T b) { return a * b; }); + opmath_jitted_gpu_kernel_with_scalars( + iter, mul_string); +#else + using opmath_t = at::opmath_type; + opmath_symmetric_gpu_kernel_with_scalars( + iter, binary_internal::MulFunctor()); +#endif + } else { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + kHalf, kBFloat16, kBool, iter.common_dtype(), "mul_zoom", [&]() { + using opmath_t = at::opmath_type; + opmath_symmetric_gpu_kernel_with_scalars( + iter, binary_internal::MulFunctor()); + }); + } +} + +REGISTER_PRIVATEUSE1_DISPATCH(mul_stub, &mul_kernel_zoom); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/Bmm.cpp b/aten/src/ATen/native/zoom/Bmm.cpp new file mode 100644 index 00000000000000..f95e530655919f --- /dev/null +++ b/aten/src/ATen/native/zoom/Bmm.cpp @@ -0,0 +1,121 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + + +namespace at::native { + // Forward decl, defined in HIPbmm.cu + template + void batched_matmul(const T* A, const T* B, T* C, int M, int N, int K, int batch_size); + + const Tensor& bmm_out_hip_impl(const Tensor& result, const Tensor& self, const Tensor& batch1, const Tensor& batch2) { + // handle pathological cases + if (result.numel() == 0) { + return result; + } else if (batch1.size(2) == 0) { + return result.zero_(); + } + + c10::MaybeOwned result_ = c10::MaybeOwned::borrowed(result); + IntArrayRef result_strides = result.strides(); + IntArrayRef result_sizes = result.sizes(); + + int m = result_sizes[1]; + int n = result_sizes[2]; + int k = batch1.sizes()[2]; + int num_batches = result_->sizes()[0]; + + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "bmm_hip", [&] { + const scalar_t* batch1_ptr 
= batch1.const_data_ptr(); + const scalar_t* batch2_ptr = batch2.const_data_ptr(); + scalar_t* result_ptr = result_->mutable_data_ptr(); + + batched_matmul(batch1_ptr, batch2_ptr, result_ptr, m, n, k, num_batches); + }); + if (!result.is_same(*result_)) { + result.copy_(*result_); + } + return result; + + } + + TORCH_IMPL_FUNC(bmm_out_zoom)(const Tensor& batch1, const Tensor& batch2, const Tensor &result) + { + NoNamesGuard guard; + bmm_out_hip_impl(result, result, batch1, batch2); + } + + Tensor& mm_out_hip_impl(Tensor& result, const Tensor& mat1, const Tensor& mat2) { + // Make sure to keep addmm_hip below in sync with this code; it + // preflights a check to try to avoid actually needing to call + // expand(). + TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); + TORCH_CHECK( + mat1.dtype() == mat2.dtype(), + "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() + ) + + TensorArg targs[]{{result, "out", 0}, {mat1, "mat1", 1}, {mat2, "mat2", 2}}; + checkAllSameGPU(__func__, targs); + + IntArrayRef mat1_sizes = mat1.sizes(); + IntArrayRef mat2_sizes = mat2.sizes(); + at::ScalarType scalar_type = mat1.scalar_type(); + TORCH_CHECK(result.dim() == 2, "tensors must be 2-D"); + TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0"); + + // resize result tensor + at::native::resize_output(result, {mat1_sizes[0], mat2_sizes[1]}); + IntArrayRef result_sizes = result.sizes(); + if ((result_sizes[0] == 0) || (result_sizes[1] == 0)) { + return result; + } + + if (mat1.numel() == 0) { + // By definition, values in self should be ignored. nans and infs + // should not propagate + return result.zero_(); + } + + // m, n are the output dims of result; k is the reduction dim (mat1_sizes[1] == mat2_sizes[0]) + int m = mat1_sizes[0]; + int n = mat2_sizes[1]; + int k = mat1_sizes[1]; + + // TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!result.is_conj()); + + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + scalar_type, + "mm_zoom", + [&] { + const scalar_t* mat1_ptr = mat1.const_data_ptr(); + const scalar_t* mat2_ptr = mat2.const_data_ptr(); + scalar_t* result_ptr = result.mutable_data_ptr(); + batched_matmul(mat1_ptr, mat2_ptr, result_ptr, m, n, k, 1); + }); + + return result; + } + + TORCH_IMPL_FUNC(mm_out_zoom)(const Tensor& self, const Tensor& mat2, const Tensor& result) + { + mm_out_hip_impl(const_cast(result), self, mat2); + } + +} // at::native + + diff --git a/aten/src/ATen/native/zoom/CompareKernels.cu b/aten/src/ATen/native/zoom/CompareKernels.cu new file mode 100644 index 00000000000000..7975d449d19592 --- /dev/null +++ b/aten/src/ATen/native/zoom/CompareKernels.cu @@ -0,0 +1,103 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include + + +// NOTE: CUDA on Windows requires that the enclosing function +// of a __device__ lambda not have internal linkage.
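For reference, bmm_out_zoom and mm_out_zoom above both funnel into batched_matmul with M and N as the output dimensions and K as the reduction dimension (mat1/batch1 supply the rows, mat2/batch2 the columns). A minimal host-side sketch of the same row-major indexing, standalone and illustrative only (batched_matmul_ref and the main() driver are hypothetical names, not part of the patch):

#include <cstdio>
#include <vector>

// Reference of the indexing used by the zoom batched GEMM:
// A: [batch, M, K], B: [batch, K, N], C: [batch, M, N], all row-major.
template <typename T>
void batched_matmul_ref(const T* A, const T* B, T* C,
                        int M, int N, int K, int batch_size) {
  for (int b = 0; b < batch_size; ++b)
    for (int row = 0; row < M; ++row)
      for (int col = 0; col < N; ++col) {
        float sum = 0.0f;  // accumulate in float, as the device kernel does
        for (int k = 0; k < K; ++k)
          sum += float(A[b * M * K + row * K + k]) * float(B[b * K * N + k * N + col]);
        C[b * M * N + row * N + col] = T(sum);
      }
}

int main() {
  // One batch: (2x3) x (3x2) -> (2x2).
  std::vector<float> A{1, 2, 3, 4, 5, 6}, B{1, 0, 0, 1, 1, 1}, C(4);
  batched_matmul_ref(A.data(), B.data(), C.data(), /*M=*/2, /*N=*/2, /*K=*/3, /*batch_size=*/1);
  std::printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);  // prints: 4 5 10 11
}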
+ +namespace at::native { namespace { + +enum class OpType {GE, GT, LE, LT}; + +template +struct CompareFunctor{ + constexpr CompareFunctor(OpType op): op_(op) {}; + OpType op_; + __device__ __forceinline__ bool operator() (scalar_t a, scalar_t b) const { + if (op_ == OpType::GE) { + return a >= b; + } else if (op_ == OpType::GT) { + return a > b; + } else if (op_ == OpType::LE) { + return a <= b; + } else { //LT + return a < b; + } + } +}; + +// Reflects the comparison operator, so reflect(op)(a, b) == op(b, a) +OpType reflect(OpType x) { + switch (x) { + case OpType::GE: return OpType::LE; + case OpType::GT: return OpType::LT; + case OpType::LE: return OpType::GE; + case OpType::LT: return OpType::GT; + } + TORCH_INTERNAL_ASSERT(false, "Invalid OpType"); +} + +} // namespace (anonymous) + +template +void compare_scalar_kernel(TensorIteratorBase &iter, OpType op, scalar_t rhs) { + CompareFunctor f(op); + gpu_kernel(iter, [=] GPU_LAMBDA (scalar_t lhs) -> bool { + return f(lhs, rhs); + }); +} + +template +void compare_kernel_impl(TensorIteratorBase &iter, OpType op) { + // If either input is a cpu scalar, perform the equivalent comparison + // where the scalar is on the right hand side. This saves us from + // generating two otherwise identical kernels with mirrored + // arguments. + if (iter.is_cpu_scalar(1)) { + const scalar_t lhs = iter.scalar_value(1); + iter.remove_operand(1); + const DeviceGuard device_guard(iter.device(1)); + compare_scalar_kernel(iter, reflect(op), lhs); + } else if (iter.is_cpu_scalar(2)) { + const scalar_t rhs = iter.scalar_value(2); + iter.remove_operand(2); + compare_scalar_kernel(iter, op, rhs); + } else { + CompareFunctor f(op); + gpu_kernel(iter, f); + } +} + +C10_NOINLINE void compare_kernel_with_scalars(TensorIteratorBase &iter, OpType op) { + AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "compare_zoom", [&]() { + compare_kernel_impl(iter, op); + }); } + + +void ge_kernel_zoom(TensorIteratorBase& iter) { + compare_kernel_with_scalars(iter, OpType::GE); +} + +void gt_kernel_zoom(TensorIteratorBase& iter) { + compare_kernel_with_scalars(iter, OpType::GT); +} + +void le_kernel_zoom(TensorIteratorBase& iter) { + compare_kernel_with_scalars(iter, OpType::LE); +} + +void lt_kernel_zoom(TensorIteratorBase& iter) { + compare_kernel_with_scalars(iter, OpType::LT); +} + +REGISTER_PRIVATEUSE1_DISPATCH(ge_stub, &ge_kernel_zoom); +REGISTER_PRIVATEUSE1_DISPATCH(gt_stub, &gt_kernel_zoom); +REGISTER_PRIVATEUSE1_DISPATCH(le_stub, &le_kernel_zoom); +REGISTER_PRIVATEUSE1_DISPATCH(lt_stub, &lt_kernel_zoom); + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/Copy.cu b/aten/src/ATen/native/zoom/Copy.cu index 57436f844beedc..f1ad63e7cd7e63 --- a/aten/src/ATen/native/zoom/Copy.cu +++ b/aten/src/ATen/native/zoom/Copy.cu @@ -27,33 +27,8 @@ namespace at::native { // forward decl, defined below void direct_copy_kernel_zoom(TensorIteratorBase &iter); -// NB: Ignores the negative bit on tensors -CONSTEXPR_EXCEPT_WIN_CUDA char neg_name[] = "neg_kernel"; -void neg_kernel_zoom(TensorIteratorBase& iter) { - auto dtype = iter.dtype(); - if (at::isComplexType(dtype)) { - static const auto neg_string = jiterator_stringify( - template - T neg_kernel(T a) { - return -a; - } - ); // neg_string - AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "neg_zoom", [&]() { - jitted_gpu_kernel< - /*name=*/ neg_name, - /*return_dtype=*/ scalar_t, - /*common_dtype=*/ scalar_t, - /*arity=*/ 1>(iter, neg_string); - }); - - } else { -
AT_DISPATCH_ALL_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, dtype, "neg_zoom", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return -a; - }); - }); - } -} +// forward decl, defined in UnarySignKernels.cu +void neg_kernel_zoom(TensorIteratorBase& iter); // NB: Ignores the negative bit on tensors CONSTEXPR_EXCEPT_WIN_CUDA char conj_name[] = "conj_kernel"; diff --git a/aten/src/ATen/native/zoom/CumminmaxKernel.cu b/aten/src/ATen/native/zoom/CumminmaxKernel.cu new file mode 100644 index 00000000000000..5c3e3a6aa211f4 --- /dev/null +++ b/aten/src/ATen/native/zoom/CumminmaxKernel.cu @@ -0,0 +1,29 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include + +#include +#include + +#include +#include + +namespace at::native { + +void launch_cummax_zoom_kernel(const TensorBase& self, const TensorBase& values, const TensorBase& indices, int64_t dim) { + AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, at::ScalarType::Half, at::ScalarType::BFloat16, + self.scalar_type(), "cummax_zoom", [&]() { + scalar_t init = self.is_floating_point() ? (-1*std::numeric_limits::infinity()) : std::numeric_limits::lowest(); + scan_dim_with_indices(self, values, indices, dim, init, std::greater_equal()); + }); +} + +void launch_cummin_zoom_kernel(const TensorBase& self, const TensorBase& values, const TensorBase& indices, int64_t dim) { + AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, at::ScalarType::Half, at::ScalarType::BFloat16, + self.scalar_type(), "cummin_zoom", [&]() { + scalar_t init = self.is_floating_point() ? std::numeric_limits::infinity() : std::numeric_limits::max(); + scan_dim_with_indices(self, values, indices, dim, init, std::less_equal()); + }); +} + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/CumprodKernel.cu b/aten/src/ATen/native/zoom/CumprodKernel.cu new file mode 100644 index 00000000000000..eaa48e306d4799 --- /dev/null +++ b/aten/src/ATen/native/zoom/CumprodKernel.cu @@ -0,0 +1,23 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include + +#include +#include + +namespace at::native { + +void launch_cumprod_zoom_kernel(const TensorBase& result, const TensorBase& self, int64_t dim) { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + ScalarType::Half, ScalarType::BFloat16, self.scalar_type(), "cumprod_zoom", [&]() { + scalar_t init = 1; + scan_dim( + self, + result, + dim, + init, + std::multiplies()); + }); +} + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/CumsumKernel.cu b/aten/src/ATen/native/zoom/CumsumKernel.cu new file mode 100644 index 00000000000000..41808fb8fae8ae --- /dev/null +++ b/aten/src/ATen/native/zoom/CumsumKernel.cu @@ -0,0 +1,25 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include + +#include +#include + +namespace at::native { + +void launch_cumsum_zoom_kernel(const TensorBase& result, const TensorBase& self, int64_t dim) { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + ScalarType::Half, ScalarType::BFloat16, + self.scalar_type(), "cumsum_zoom", + [&]() { + scalar_t init = 0; + scan_dim( + self, + result, + dim, + init, + std::plus()); + }); +} + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/DeviceSqrt.cuh b/aten/src/ATen/native/zoom/DeviceSqrt.cuh new file mode 100644 index 00000000000000..d5833a9882fd82 --- /dev/null +++ b/aten/src/ATen/native/zoom/DeviceSqrt.cuh @@ -0,0 +1,18 @@ +#pragma once + +namespace at { namespace native { +// take these out when ROCm implements std:: math functions +#include +template +static __forceinline__ __device__ scalar_t device_sqrt(scalar_t val); + 
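The CumsumKernel.cu, CumprodKernel.cu and CumminmaxKernel.cu files above all specialize the same scan-along-a-dimension primitive, differing only in the identity element and binary operator: 0 with std::plus for cumsum, 1 with std::multiplies for cumprod, and the type's lowest/largest (or -/+ infinity) value with greater_equal/less_equal for cummax/cummin. A minimal host-side sketch of that contract for one contiguous row, with scan_1d as an illustrative stand-in for ATen's scan_dim (not a real API):

#include <cstdio>
#include <functional>
#include <vector>

// Inclusive scan over one contiguous row: out[i] = op(out[i-1], in[i]),
// seeded with an identity value, mirroring how the zoom scan kernels
// specialize the generic scan (cumsum: init = 0, std::plus;
// cumprod: init = 1, std::multiplies).
template <typename T, typename Op>
void scan_1d(const std::vector<T>& in, std::vector<T>& out, T init, Op op) {
  T acc = init;
  out.resize(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    acc = op(acc, in[i]);
    out[i] = acc;
  }
}

int main() {
  std::vector<int> x{1, 2, 3, 4}, sum, prod;
  scan_1d(x, sum, 0, std::plus<int>());        // 1 3 6 10
  scan_1d(x, prod, 1, std::multiplies<int>()); // 1 2 6 24
  std::printf("%d %d %d %d | %d %d %d %d\n",
              sum[0], sum[1], sum[2], sum[3],
              prod[0], prod[1], prod[2], prod[3]);
}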
+template <> +__forceinline__ __device__ float device_sqrt(float val) { + return ::sqrtf(val); +} + +template <> +__forceinline__ __device__ double device_sqrt(double val) { + return ::sqrt(val); +} +}} \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/HIPbmm.cu b/aten/src/ATen/native/zoom/HIPbmm.cu new file mode 100644 index 00000000000000..84f5eb2aaf6201 --- /dev/null +++ b/aten/src/ATen/native/zoom/HIPbmm.cu @@ -0,0 +1,126 @@ +#include +#include +#include +#include + +namespace at::native { + + // Helper function to convert hip_bfloat16 to float + __device__ float bfloat16_to_float(hip_bfloat16 a) { + union { + uint32_t int32; + float float32; + } u = {uint32_t(a.data) << 16}; + return u.float32; + } + + // Helper function to convert float to hip_bfloat16 + __device__ hip_bfloat16 float_to_bfloat16(float a) { + union { + float float32; + uint32_t int32; + } u = {a}; + hip_bfloat16 b; + b.data = uint16_t(u.int32 >> 16); + return b; + } + + template + __device__ float convert_to_float(T a) { + return a; + } + + template <> + __device__ float convert_to_float(hip_bfloat16 a) { + return bfloat16_to_float(a); + } + + template <> + __device__ float convert_to_float<__half>( __half a) { + return __half2float(a); + } + + template + __device__ T convert_from_float(float a) { + return static_cast(a); + } + + template <> + __device__ hip_bfloat16 convert_from_float(float a) { + return float_to_bfloat16(a); + } + + template <> + __device__ __half convert_from_float<__half>(float a) { + return __float2half(a); + } + + + template + __global__ void batched_matmul_kernel(const T* A, const T* B, T* C, + int M, int N, int K, int batch_size) { + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + int batch = blockIdx.z; + + if (row < M && col < N) { + float sum = 0.0f; + for (int k = 0; k < K; ++k) { + sum += convert_to_float(A[batch * M * K + row * K + k]) * + convert_to_float(B[batch * K * N + k * N + col]); + } + C[batch * M * N + row * N + col] = convert_from_float(sum); + } + } + + template + void batched_matmul(const T* A, const T* B, T* C, + int M, int N, int K, int batch_size) { + dim3 threadsPerBlock(16, 16); + dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x, + (M + threadsPerBlock.y - 1) / threadsPerBlock.y, + batch_size); + + hipLaunchKernelGGL(batched_matmul_kernel, numBlocks, threadsPerBlock, 0, 0, + A, B, C, M, N, K, batch_size); + } + + // Specialization for at::Half + template <> + void batched_matmul(const at::Half* A, const at::Half* B, at::Half* C, + int M, int N, int K, int batch_size) { + dim3 threadsPerBlock(16, 16); + dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x, + (M + threadsPerBlock.y - 1) / threadsPerBlock.y, + batch_size); + + hipLaunchKernelGGL(batched_matmul_kernel<__half>, numBlocks, threadsPerBlock, 0, 0, + reinterpret_cast(A), + reinterpret_cast(B), + reinterpret_cast<__half*>(C), + M, N, K, batch_size); + } + + // Specialization for at::BFloat16 + template <> + void batched_matmul(const at::BFloat16* A, const at::BFloat16* B, at::BFloat16* C, + int M, int N, int K, int batch_size) { + dim3 threadsPerBlock(16, 16); + dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x, + (M + threadsPerBlock.y - 1) / threadsPerBlock.y, + batch_size); + + hipLaunchKernelGGL(batched_matmul_kernel, numBlocks, threadsPerBlock, 0, 0, + reinterpret_cast(A), + reinterpret_cast(B), + reinterpret_cast(C), + M, N, K, batch_size); + } + + // Explicit instantiations for supported types + 
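+ // (at::Half and at::BFloat16 inputs are handled by the explicit specializations above,
+ // which reinterpret_cast to __half / hip_bfloat16 before launching the kernel; the
+ // generic template is explicitly instantiated below.)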
template void batched_matmul(const float*, const float*, float*, int, int, int, int); + template void batched_matmul(const double*, const double*, double*, int, int, int, int); + template void batched_matmul(const half*, const half*, half*, int, int, int, int); + template void batched_matmul(const hip_bfloat16*, const hip_bfloat16*, hip_bfloat16*, int, int, int, int); + +} // at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/Indexing.cu b/aten/src/ATen/native/zoom/Indexing.cu new file mode 100644 index 00000000000000..6cd4d946ea9cda --- /dev/null +++ b/aten/src/ATen/native/zoom/Indexing.cu @@ -0,0 +1,1798 @@ +#include +// #define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +#include +#include +#include +#include +#include + +#include + +#include + +namespace { +template +__global__ void indexing_backward_kernel( + const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight, + int64_t numel, int64_t stride, int64_t stride_before, int64_t outer_dim, bool accumulate) { +//numel is total number of flattened indices, not expanded to dimensions that are not indexed. +//stride is the cumulative size of the not-indexed last dimensions +//stride_before is the stride of the dimension immediately preceding first indexed dimension +//if indexing starts from the 0th dimension, stride_before does not matter because blockIdx.z will be 0 in this case +//outer_dim is number of elements in the first unindexed dimensions + using opmath_t = at::opmath_type; + + // Each warp is responsible for an input into the LookupTable. + // If the preceding input has the same destination index as this input, then the warp + // exits immediately. The warp also processes subsequent inputs with the + // same value. 
+ // + // Input Warp + // 1 + // 1 ( exits without doing any work) + // 5 + // 8 + + // Number of values processed by each thread (grain size) + for (int64_t z = blockIdx.z; z < outer_dim; z += gridDim.z){ + int64_t idx = blockIdx.x * blockDim.y + threadIdx.y; + if (idx < numel + && (idx == 0 || sorted_indices[idx] != sorted_indices[idx - 1])){ + do { + int64_t start_feature = threadIdx.x + blockIdx.y * blockDim.x * SZ; + // if not accumulate, we only keep the last duplicate index so skip those before it + if (!accumulate && (idx < numel - 1) && sorted_indices[idx] == sorted_indices[idx + 1]) { + idx++; + continue; + } + const int64_t weight_row = ((int64_t) sorted_indices[idx]) * stride + z * stride_before; + const int64_t grad_row = ((int64_t) indices[idx]) * stride + z * numel * stride; + const opmath_t scale = (opmath_t)1.0; + + opmath_t gradient[SZ]; + opmath_t weight[SZ]; + + while (start_feature < stride) { + #pragma unroll + for (int ii = 0; ii < SZ; ii++) { + int64_t feature_dim = start_feature + ii * C10_WARP_SIZE; + if (feature_dim < stride) { + gradient[ii] = static_cast(grad_output[grad_row + feature_dim]); + if (accumulate) { + weight[ii] = static_cast(grad_weight[weight_row + feature_dim]); + } + } + } + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) { + if (accumulate) { + weight[ii] += gradient[ii] * scale; + } else { + weight[ii] = gradient[ii] * scale; + } + } + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) { + int64_t feature_dim = start_feature + ii * C10_WARP_SIZE; + if (feature_dim < stride) { + grad_weight[weight_row + feature_dim] = static_cast(weight[ii]); + } + } + start_feature += gridDim.y * blockDim.x * SZ; + } + + idx++; + } while (idx < numel && sorted_indices[idx] == sorted_indices[idx - 1]); + } + } +} + +template +__global__ void indexing_backward_kernel_stride_1( + const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight, + int64_t numel, int64_t stride, int64_t stride_before, int64_t outer_dim, bool accumulate) { + using opmath_t = at::opmath_type; + + // Number of values processed by each thread (grain size) + for (int64_t z = blockIdx.z; z < outer_dim; z += gridDim.z){ + int64_t idx = blockIdx.x * blockDim.y + threadIdx.y; + int64_t crnt_sorted_idx = sorted_indices[idx]; + + if ((idx < numel) && + (idx == 0 || crnt_sorted_idx != sorted_indices[idx - 1])) + { + // Determine the number of duplicates in advance + int64_t num_duplicates = 1; + while (((idx + num_duplicates) < numel) && (sorted_indices[idx + num_duplicates] == crnt_sorted_idx)) { + num_duplicates++; + } + + // Continue computing weights + const int64_t weight_row = crnt_sorted_idx * stride + z * stride_before; + int64_t grad_row = 0; + const opmath_t scale = (opmath_t)1.0; + + if (!accumulate) { + grad_row = ((int64_t)indices[idx + num_duplicates - 1]) * stride + z * numel * stride; + grad_weight[weight_row] = + static_cast(static_cast(grad_output[grad_row]) * scale); + } else { + opmath_t gradient = (opmath_t)0.0; + + int laneIdx = threadIdx.x % C10_WARP_SIZE; + int64_t num_warp_passes = num_duplicates / C10_WARP_SIZE; + for (int64_t i = 0; i < num_warp_passes; ++i) { + grad_row = ((int64_t) indices[idx + i * C10_WARP_SIZE + laneIdx]) * stride + z * numel * stride; + gradient += static_cast(grad_output[grad_row]) * scale; + } + WARP_SYNC(); + for (int offset = C10_WARP_SIZE / 2; offset > 0; offset /= 2) { + gradient += WARP_SHFL_DOWN(gradient, offset); + } + + if (laneIdx == 0) { + for (int64_t i = num_warp_passes * C10_WARP_SIZE; 
i < num_duplicates; ++i) { + grad_row = ((int64_t) indices[idx + i]) * stride + z * numel * stride; + gradient += static_cast(grad_output[grad_row]) * scale; + } + + grad_weight[weight_row] = static_cast(static_cast(grad_weight[weight_row]) + gradient); + } + } + } + } +} + +template +__global__ void indexing_backward_kernel_small_stride( + const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight, + int64_t numel, int64_t stride, int64_t stride_before, int64_t outer_dim, bool accumulate) { + using opmath_t = at::opmath_type; + + // Number of values processed by each thread (grain size) + for (int64_t z = blockIdx.z; z < outer_dim; z += gridDim.z){ + int64_t idx = blockIdx.x * blockDim.y + threadIdx.y; + int64_t tidx = threadIdx.x; + int64_t crnt_sorted_idx = sorted_indices[idx]; + + if ((idx < numel) && + (tidx < stride) && + (idx == 0 || crnt_sorted_idx != sorted_indices[idx - 1])) + { + // Determine the number of duplicates in advance + int64_t num_duplicates = 1; + while (((idx + num_duplicates) < numel) && (sorted_indices[idx + num_duplicates] == crnt_sorted_idx)) { + num_duplicates++; + } + + // Continue computing weights + const int64_t weight_row = crnt_sorted_idx * stride + z * stride_before; + int64_t grad_row = 0; + const opmath_t scale = (opmath_t)1.0; + + if (!accumulate) { + grad_row = ((int64_t)indices[idx + num_duplicates - 1]) * stride + z * numel * stride; + grad_weight[weight_row + tidx] = + static_cast(static_cast(grad_output[grad_row + tidx]) * scale); + } else { + opmath_t gradient = (opmath_t)0.0; + for (int64_t i = 0; i < num_duplicates; ++i) { + grad_row = ((int64_t) indices[idx + i]) * stride + z * numel * stride; + gradient += static_cast(grad_output[grad_row + tidx]) * scale; + } + + grad_weight[weight_row + tidx] = static_cast(static_cast(grad_weight[weight_row + tidx]) + gradient); + } + } + } +} + +template +__global__ void indexing_backward_kernel_quantized( + const int64_t* sorted_indices, const int64_t* indices, const float* grad_output, scalar_t* grad_weight, + int64_t numel, int64_t stride, int64_t stride_before, int64_t outer_dim, + float inv_scale, int zero_point, int64_t qmin, int64_t qmax) { + + // This implementation is adopted from indexing_backward_kernel above. 
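+  // Unlike the kernel above, this variant never accumulates: for duplicate
+  // indices only the last write survives, and each value is re-quantized on
+  // the fly as q = clamp(zero_point + round(w / scale), qmin, qmax) before
+  // being stored into the quantized grad_weight tensor.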
+ using opmath_t = at::opmath_type; + for (int64_t z = blockIdx.z; z < outer_dim; z += gridDim.z){ + int64_t idx = blockIdx.x * blockDim.y + threadIdx.y; + if (idx < numel + && (idx == 0 || sorted_indices[idx] != sorted_indices[idx - 1])){ + do { + int64_t start_feature = threadIdx.x + blockIdx.y * blockDim.x * SZ; + // we only keep the last duplicate index so skip those before it + if ((idx < numel - 1) && sorted_indices[idx] == sorted_indices[idx + 1]) { + idx++; + continue; + } + const int64_t weight_row = ((int64_t) sorted_indices[idx]) * stride + z * stride_before; + const int64_t grad_row = ((int64_t) indices[idx]) * stride + z * numel * stride; + const opmath_t scale = (opmath_t)1.0; + + opmath_t gradient[SZ]; + opmath_t weight[SZ]; + + while (start_feature < stride) { + #pragma unroll + for (int ii = 0; ii < SZ; ii++) { + int64_t feature_dim = start_feature + ii * C10_WARP_SIZE; + if (feature_dim < stride) { + gradient[ii] = static_cast(grad_output[grad_row + feature_dim]); + } + } + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) { + weight[ii] = gradient[ii] * scale; + } + + #pragma unroll + for (int ii = 0; ii < SZ; ii++) { + int64_t feature_dim = start_feature + ii * C10_WARP_SIZE; + if (feature_dim < stride) { + // we do quantization here + int64_t qvalue = static_cast(zero_point + nearbyintf(weight[ii]* inv_scale)); + qvalue = min(max(qvalue, qmin), qmax); + grad_weight[weight_row + feature_dim] = static_cast(qvalue); + } + } + start_feature += gridDim.y * blockDim.x * SZ; + } + + idx++; + } while (idx < numel && sorted_indices[idx] == sorted_indices[idx - 1]); + } + } +} + + +} + + +namespace at::native { + +namespace { + +class ReduceMultiply { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + (void)numel; // suppress unused warning + gpuAtomicMul(self_data_start + index, *src_data); + } +}; +static ReduceMultiply reduce_multiply; + +class ReduceAdd { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + fastAtomicAdd(self_data_start, index, numel, *src_data, true); + } +}; +static ReduceAdd reduce_add; + +class ReduceMinimum { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + (void)numel; // suppress unused warning + gpuAtomicMin(self_data_start + index, *src_data); + } +}; +static ReduceMinimum reduce_minimum; + +class ReduceMaximum { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + (void)numel; // suppress unused warning + gpuAtomicMax(self_data_start + index, *src_data); + } +}; +static ReduceMaximum reduce_maximum; + +} + +static Tensor wrapIndexOnce(const Tensor & index, int64_t dim, int64_t dim_size, bool check_range=true) { +//we don't need to check range in backward - if there were out of bounds indices forward should already have errored out + if (index.numel() != 0 && check_range) { + at::_assert_async(index.max() < dim_size); + at::_assert_async(index.min() >= -dim_size); + } + return index.remainder(dim_size); +} + +static std::vector computeLinearStride(const Tensor & tensor) { + // computes the stride as if tensor were contiguous + auto sizes = tensor.sizes(); + std::vector stride(tensor.dim()); + if (stride.empty()) { + return stride; + } + 
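+  // The reverse partial_sum below fills in the remaining entries: for
+  // sizes [2, 3, 4] it produces strides [12, 4, 1], i.e. the strides the
+  // tensor would have if it were laid out contiguously.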
stride[tensor.dim() - 1] = 1; + std::partial_sum(sizes.rbegin(), sizes.rend() - 1, stride.rbegin() + 1, std::multiplies()); + return stride; +} + +static std::tuple +computeLinearIndex(const Tensor & src, TensorList indices, bool check_range) { + auto strides = computeLinearStride(src); + const auto& device = src.options().device(); + + // Compute the linear index by multiplying the indexing tensors by the + // stride and summing them. All the indexing tensors have the same shape at + // this point. We also compute the number of dimensions before and after that + // are not being index. + Tensor linearIndex; + int64_t nElemBefore = 1, nElemAfter = 1, strideBefore =0; + for (const auto i: c10::irange(src.dim())) { + if (indices[i].defined()) { + // Cast index to the longType matching src's device + // This allows us to support ie indexing a cuda tensor with a cpu tensor + Tensor index = (wrapIndexOnce(indices[i], i, src.size(i), check_range) * strides[i]).to(device); + if (linearIndex.defined()) { + linearIndex += index; + } else { + linearIndex = index; + if (i>0) { + strideBefore = src.stride(i-1); // stride after undefined dimensions + } + } + } else if (linearIndex.defined()) { + nElemAfter *= src.size(i); + } else { + nElemBefore *= src.size(i); + } + } + + return std::make_tuple(std::move(linearIndex), nElemBefore, strideBefore, nElemAfter); +} + + +static std::tuple> makeLinearIndex(Tensor self, IOptTensorListRef orig, bool check_range) { + checkIndexTensorTypes(orig, /*allow_int*/true); + // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors + auto indices = expandTensors(self, orig); + for (auto & i : indices) { + if (i.defined() && i.dtype() == at::kInt) { + i = i.to(at::kLong); + } + } + // next broadcast all index tensors together + indices = expand_outplace(indices); + // add missing null Tensors so that it matches self.dim() + while (indices.size() < (size_t)self.dim()) { + indices.emplace_back(); + } + // if the non-null indices are not all adjacent, transpose self and indices + // together so that they're adjacent at the front + std::vector inversePerm; + if (!hasContiguousSubspace(indices)) { + std::tie(self, indices, inversePerm) = transposeToFrontAndInvPerm(self, indices); + } + auto [linearIndex, nElemBefore, strideBefore, nElemAfter] = computeLinearIndex(self, indices, check_range); + return std::make_tuple(linearIndex, self, nElemBefore, strideBefore, nElemAfter, inversePerm); +} + + +void index_put_with_sort_kernel_thrust_helper(Tensor &linearIndex, Tensor &orig_indices, Tensor &sorted_indices, int64_t num_indices); + +namespace { + +int64_t largestIndex(const Tensor &self) { + int64_t result = 0; + for (const auto i: c10::irange(self.dim())) { + result += (self.sizes()[i] - 1) * self.strides()[i]; + } + return result; +} + +void index_put_with_sort_kernel(Tensor & self, const c10::List>& indices, const Tensor & value, bool accumulate, bool unsafe) { + TORCH_CHECK(!indices.empty() || is_expandable_to(value.sizes(), self.sizes()), "shape mismatch: value tensor of shape ", value.sizes(), + " cannot be broadcast to indexing result of shape ", self.sizes()); + if (indices.size() > (size_t)self.dim()) { + TORCH_CHECK_INDEX(false, "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); + } + bool self_contiguous = self.is_contiguous(); + auto self_ = self_contiguous ? 
self : self.contiguous(); + Tensor linearIndex, src, expandedValue = value; + int64_t nElemBefore, strideBefore, sliceSize; + std::vector inversePerm; + std::tie(linearIndex, src, nElemBefore, strideBefore, sliceSize, inversePerm) = makeLinearIndex(self_, indices, !unsafe); + int64_t num_indices = linearIndex.numel(); + + if (expandedValue.numel() < num_indices * nElemBefore * sliceSize) { + auto expanded_size = at::DimVector(expandedValue.sizes()); + auto size1 = expandedValue.sizes(); + auto size2 = linearIndex.sizes(); + if (are_expandable(size1, size2)) { + expanded_size = infer_size_dimvector(size1, size2); + } + if (nElemBefore > 1) { + expanded_size.insert(expanded_size.begin(), nElemBefore); + } + if (sliceSize > 1) { + expanded_size.insert(expanded_size.end(), sliceSize); + } + expandedValue = expandedValue.expand(expanded_size); + } + expandedValue = expandedValue.contiguous(); + + if (num_indices > 0 && sliceSize > 0) { + const bool permuted = !src.is_contiguous(); + auto src_ = permuted ? src.contiguous() : src; + linearIndex = linearIndex.reshape(-1); + auto sorted_indices = at::empty_like(linearIndex, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + auto orig_indices = at::empty_like(linearIndex, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + const hipStream_t stream = c10::zoom::getCurrentZoomStream(); + + linearIndex.divide_(sliceSize, "trunc"); + + if (num_indices < 50000) { + index_put_with_sort_kernel_thrust_helper(linearIndex, orig_indices, sorted_indices, num_indices); + } else + { + // Sort the inputs into sorted with the corresponding indices + auto range = at::arange(num_indices, linearIndex.options()); + // linearIndex can not be negative, and we take advantage of this + // fact to sort on less bits for better performance. + int64_t nbits = zoom::hipcub::get_num_bits(largestIndex(self_) / sliceSize); + zoom::hipcub::radix_sort_pairs( + linearIndex.const_data_ptr(), sorted_indices.mutable_data_ptr(), + range.const_data_ptr(), orig_indices.mutable_data_ptr(), + num_indices, false, 0, nbits); + } + + TORCH_INTERNAL_ASSERT( + linearIndex.numel()*sliceSize*nElemBefore == expandedValue.numel(), + "number of flattened indices did not match number of elements in the value tensor: ", + linearIndex.numel()*sliceSize*nElemBefore, " vs ", expandedValue.numel()); + const int UNROLL = 4; + const int indices_per_block = 4; + const int warp_size = at::zoom::warp_size(); + dim3 grid(ceil_div(num_indices, (int64_t) indices_per_block), + std::min(at::zoom::getCurrentDeviceProperties()->maxGridSize[1], ceil_div(sliceSize, (int64_t) (warp_size*UNROLL))), + ::min(std::max(1,nElemBefore), at::zoom::getCurrentDeviceProperties()->maxGridSize[2])); + dim3 block(warp_size, indices_per_block); + + + if (sliceSize == 1) { + // This implementation is faster with high amounts of duplicates but could overflow + // if FP16 / BF16 is used + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kHalf, kBool, kBFloat16, + expandedValue.scalar_type(), "indexing_backward_kernel_stride_1", [&] { + hipLaunchKernelGGL(( indexing_backward_kernel_stride_1), dim3(grid), dim3(block), 0, stream, + sorted_indices.const_data_ptr(), + orig_indices.const_data_ptr(), + expandedValue.const_data_ptr(), + src_.mutable_data_ptr(), + num_indices, + sliceSize, + strideBefore, + nElemBefore, + accumulate); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + }); + } else { + if (sliceSize <= warp_size) { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kHalf, kBool, kBFloat16, + expandedValue.scalar_type(), "indexing_backward_kernel_small_stride", [&] { + 
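+          // sliceSize fits inside a single warp, so each thread owns one
+          // feature element (threadIdx.x) and the kernel needs neither
+          // unrolling nor a warp-level reduction.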
hipLaunchKernelGGL(( indexing_backward_kernel_small_stride), dim3(grid), dim3(block), 0, stream, + sorted_indices.const_data_ptr(), + orig_indices.const_data_ptr(), + expandedValue.const_data_ptr(), + src_.mutable_data_ptr(), + num_indices, + sliceSize, + strideBefore, + nElemBefore, + accumulate); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + }); + } else { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kHalf, kBool, kBFloat16, + expandedValue.scalar_type(), "indexing_backward", [&] { + hipLaunchKernelGGL(( indexing_backward_kernel), dim3(grid), dim3(block), 0, stream, + sorted_indices.const_data_ptr(), + orig_indices.const_data_ptr(), + expandedValue.const_data_ptr(), + src_.mutable_data_ptr(), + num_indices, + sliceSize, + strideBefore, + nElemBefore, + accumulate); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + }); + } + } + + if (permuted) { + self.copy_(src_.permute(inversePerm)); + } else if (!self_contiguous) { + self.copy_(self_); + } + } +} + +REGISTER_PRIVATEUSE1_DISPATCH(index_put_with_sort_stub, &index_put_with_sort_kernel); + +void index_put_with_sort_quantized(Tensor & self, const c10::List>& indices, const Tensor & value, double scale, int zero_point, bool unsafe) { + if (indices.size() > (size_t)self.dim()) { + TORCH_CHECK_INDEX(false, "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); + } + bool self_contiguous = self.is_contiguous(); + auto self_ = self_contiguous ? self : self.contiguous(); + Tensor linearIndex, src, expandedValue = value; + int64_t nElemBefore, strideBefore, sliceSize; + std::vector inversePerm; + std::tie(linearIndex, src, nElemBefore, strideBefore, sliceSize, inversePerm) = makeLinearIndex(self_, indices, !unsafe); + int64_t num_indices = linearIndex.numel(); + + if (expandedValue.numel() < num_indices * nElemBefore * sliceSize) { + auto expanded_size = at::DimVector(expandedValue.sizes()); + auto size1 = expandedValue.sizes(); + auto size2 = linearIndex.sizes(); + if (are_expandable(size1, size2)) { + expanded_size = infer_size_dimvector(size1, size2); + } + if (nElemBefore > 1) { + expanded_size.insert(expanded_size.begin(), nElemBefore); + } + expandedValue = expandedValue.expand(expanded_size); + } + expandedValue = expandedValue.contiguous(); + + if (num_indices > 0 && sliceSize > 0) { + const bool permuted = !src.is_contiguous(); + auto src_ = permuted ? src.contiguous() : src; + linearIndex = linearIndex.reshape(-1); + auto sorted_indices = at::empty_like(linearIndex, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + auto orig_indices = at::empty_like(linearIndex, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + const hipStream_t stream = c10::zoom::getCurrentZoomStream(); + + linearIndex.divide_(sliceSize, "trunc"); + + // cub on CUDA <= 11.2 have a bug that for small sizes + // cub's sort can be much slower than thrust's merge sort + // this bug is fixed in CUDA 11.3 +#if (defined(TORCH_HIP_VERSION) && TORCH_HIP_VERSION < 11030) + if (num_indices < 50000) { + index_put_with_sort_kernel_thrust_helper(linearIndex, orig_indices, sorted_indices, num_indices); + } else +#endif + { + // Sort the inputs into sorted with the corresponding indices + auto range = at::arange(num_indices, linearIndex.options()); + // linearIndex can not be negative, and we take advantage of this + // fact to sort on less bits for better performance. 
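+      // get_num_bits(largestIndex(self_) / sliceSize) is the number of bits
+      // needed to represent the largest possible sort key, so the radix sort
+      // only sweeps those low-order bits instead of all 64.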
+ int64_t nbits = zoom::hipcub::get_num_bits(largestIndex(self_) / sliceSize); + zoom::hipcub::radix_sort_pairs( + linearIndex.const_data_ptr(), sorted_indices.mutable_data_ptr(), + range.const_data_ptr(), orig_indices.mutable_data_ptr(), + num_indices, false, 0, nbits); + } + + TORCH_INTERNAL_ASSERT( + linearIndex.numel()*sliceSize*nElemBefore == expandedValue.numel(), + "number of flattened indices did not match number of elements in the value tensor: ", + linearIndex.numel()*sliceSize*nElemBefore, " vs ", expandedValue.numel()); + const int UNROLL = 4; + const int indices_per_block = 4; + const int warp_size = at::zoom::warp_size(); + dim3 grid(ceil_div(num_indices, (int64_t) indices_per_block), + std::min(at::zoom::getCurrentDeviceProperties()->maxGridSize[1], ceil_div(sliceSize, (int64_t) (warp_size*UNROLL))), + ::min(std::max(1,nElemBefore), at::zoom::getCurrentDeviceProperties()->maxGridSize[2])); + dim3 block(warp_size, indices_per_block); + + AT_DISPATCH_QINT_TYPES( + src.scalar_type(), "indexing_backward_quantized", [&] { + constexpr int64_t qmin = std::numeric_limits::min(); + constexpr int64_t qmax = std::numeric_limits::max(); + float inv_scale = 1.0f / static_cast(scale); + + hipLaunchKernelGGL(( indexing_backward_kernel_quantized), dim3(grid), dim3(block), 0, stream, + sorted_indices.const_data_ptr(), + orig_indices.const_data_ptr(), + expandedValue.const_data_ptr(), + src_.mutable_data_ptr(), + num_indices, + sliceSize, + strideBefore, + nElemBefore, + inv_scale, + zero_point, + qmin, + qmax); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + }); + + if (permuted) { + self.copy_(src_.permute(inversePerm)); + } else if (!self_contiguous) { + self.copy_(self_); + } + } +} + +REGISTER_PRIVATEUSE1_DISPATCH(index_put_with_sort_quantized_stub, &index_put_with_sort_quantized); +} //anonymous + + +// Check tensor dimensions for index operations, and return the slice size. +static ptrdiff_t getSliceSize(const Tensor & dst, + int dim, + const Tensor & index, + const Tensor & src) +{ + const auto dstDims = dst.dim(); + const auto srcDims = src.dim(); + + TORCH_CHECK(index.dim() <= 1, "Index must be vector or scalar"); + + ptrdiff_t dstSliceSize = 1; + TORCH_CHECK(dim >= 0 && dim < dstDims, "Indexing dim ", dim, " is out of bounds"); + for (const auto d: c10::irange(dstDims)) { + if (d != dim) { + dstSliceSize *= dst.size(d); + } + } + + TORCH_CHECK(dim < srcDims, "Indexing dim ", dim, " is out of bounds"); + TORCH_CHECK(index.numel() == src.size(dim), + "length of src.size[dim] is not equal to length of indices"); + + ptrdiff_t srcSliceSize = 1; + bool mismatch = false; + + if (dstDims != srcDims) mismatch = true; + + for (const auto d: c10::irange(srcDims)) { + if (d != dim) { + srcSliceSize *= src.size(d); + if (!mismatch && dst.size(d) != src.size(d)) mismatch = true; + } + } + + TORCH_CHECK(dstSliceSize == srcSliceSize, + "Source/destination tensor have different slice sizes (%ld vs %ld)", + dstSliceSize, srcSliceSize); + + if (mismatch) { + TORCH_WARN_ONCE( + "Warning: source/destination slices have same size but different " + "shape for an index operation. This behavior is deprecated.\n"); + } + + return dstSliceSize; +} + +// We prefer this kernel to avoid reloading index points if the number +// of indices is a small number. +// This kernel in fact works for all choices of problem size, but if +// the number of indices chosen is large, then the +// indexFuncLargeIndex kernel is a better choice to increase +// parallelism. 
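+// The dispatch code further down uses numIndex <= 16 as the cutoff between the
+// two kernels, with specializations for collapsed 1-D/2-D/3-D tensors with a
+// contiguous index and a generic (-1, -1, -1) instantiation as the catch-all.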
+template +__global__ void indexFuncSmallIndex(zoom::detail::TensorInfo dst, + zoom::detail::TensorInfo src, + zoom::detail::TensorInfo indices, + int dstAddDim, + int srcAddDim, + IndexType innerSize, + int64_t dstAddDimSize, + int64_t dstNumel, + const func_t& op, + T alpha) { + // In order to avoid reloading the index that we are copying, load + // it once to handle all of the points that are being selected, so + // it can be reused as much as possible. This kernel is chosen when + // this is a good choice (small number of chosen indices), since + // re-accessing indices in addition to src elements can be slow. + for (IndexType srcIndex = 0; srcIndex < indices.sizes[0]; ++srcIndex) { + // Lua indices begin at 1 + IndexType dstIndex = + indices.data[zoom::detail::IndexToOffset::get(srcIndex, indices)]; + ZOOM_KERNEL_ASSERT(dstIndex < dstAddDimSize); + + // We stride over the output ignoring the indexed dimension + // (innerSize), whose offset calculation is handled differently + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < innerSize; + linearIndex += gridDim.x * blockDim.x) { + IndexType dstOffset = + zoom::detail::IndexToOffset::get(linearIndex, dst); + dstOffset += dstIndex * dst.strides[dstAddDim]; + + IndexType srcOffset = + zoom::detail::IndexToOffset::get(linearIndex, src); + srcOffset += srcIndex * src.strides[srcAddDim]; + + T val = src.data[srcOffset] * alpha; + op(dst.data, dstOffset, dstNumel, &val); + } + + } +} + +// We prefer this kernel to balance parallelism across index points, +// if there are a large number of indices. +// This kernel in fact works for all choices of problem size, but if +// the number of indices chosen is small, then the +// indexFuncSmallIndex kernel is a better choice to reduce memory +// accesses. +template +__global__ void indexFuncLargeIndex(zoom::detail::TensorInfo dst, + zoom::detail::TensorInfo src, + zoom::detail::TensorInfo indices, + int dstAddDim, + int srcAddDim, + IndexType totalSize, + IndexType innerSize, + int64_t dstAddDimSize, + int64_t dstNumel, + const func_t& op, + T alpha) { + // We stride over the output including the indexed dimension + // (totalSize), and calculate the destination index point based on that + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalSize; + linearIndex += gridDim.x * blockDim.x) { + IndexType srcIndex, elementInSlice; + if (IndexIsMajor) { + srcIndex = linearIndex / innerSize; + elementInSlice = linearIndex % innerSize; + } + else { + elementInSlice = linearIndex / innerSize; + srcIndex = linearIndex % innerSize; + } + + // Lua indices begin at 1 + IndexType dstIndex = + indices.data[zoom::detail::IndexToOffset::get(srcIndex, indices)]; + ZOOM_KERNEL_ASSERT(dstIndex < dstAddDimSize); + + IndexType dstOffset = + zoom::detail::IndexToOffset::get(elementInSlice, dst); + dstOffset += dstIndex * dst.strides[dstAddDim]; + + IndexType srcOffset = + zoom::detail::IndexToOffset::get(elementInSlice, src); + srcOffset += srcIndex * src.strides[srcAddDim]; + + T val = src.data[srcOffset] * alpha; + op(dst.data, dstOffset, dstNumel, &val); + } +} + +// Compare the stride between adjacent slices (sliceStride) with strides in the +// other dimensions (i.e., strides *inside* each slice). +// +// - Returns true if some dimension inside the slice has lower stride than +// sliceStride. The simplest example is a 2-D contiguous tensor with sliceDim +// == 0 (that is, each slice is a row). 
+// +// In this case, we choose the CUDA kernel that processes the data in +// "index-major order". For example, if thread count equals slice size, then +// all threads process slice #0 in lockstep, and then slice #1, and so on. +// +// - Otherwise (i.e., sliceStride has the lowest value), this function returns +// false. The simplest example is a 2-D contiguous tensor with sliceDim == 1 +// (each slice is a column). +// +// In this case, we choose the CUDA kernel that processes the data in +// "elementInSlice-major order". For example, each thread can process element +// #0 of every slice, and then element #1 of every slice, and so on. +template +bool indexShouldBeMajor(zoom::detail::TensorInfo &info, + int sliceDim) +{ + // The stride between adjacent slices (e.g., between element #0 of slice #100 + // and element #0 of slice #101). + unsigned int sliceStride = info.strides[sliceDim]; + + for (const auto i: c10::irange(info.dims)) { + if (i != sliceDim && info.sizes[i] > 1 && info.strides[i] < sliceStride) { + return true; + } + } + + return false; +} + +void index_add_zoom_impl(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& source, const Scalar& alpha, const Tensor& result) { + if (!result.is_same(self)) { + result.copy_(self); + } + + // Scalars are treated as 1-d tensor + const Tensor self_ = (result.dim() == 0) ? result.view(1) : result; + const Tensor source_ = (source.dim() == 0) ? source.view(1) : source; + + TORCH_CHECK(result.dim() <= MAX_TENSORINFO_DIMS, "tensor has too many (>", MAX_TENSORINFO_DIMS, ") dims"); + TORCH_CHECK(source.dim() <= MAX_TENSORINFO_DIMS, "tensor has too many (>", MAX_TENSORINFO_DIMS, ") dims" ); + TORCH_CHECK(index.dim() <= MAX_TENSORINFO_DIMS, "tensor has too many (>", MAX_TENSORINFO_DIMS, ") dims"); + + if (globalContext().deterministicAlgorithms()){ + torch::List> indices; + indices.reserve(dim + 1); + for (const auto i: c10::irange(dim)) { + indices.emplace_back(); + } + indices.emplace_back(index.to(at::kLong)); + result.index_put_(indices, source * alpha, true); + return; + } + + // The `source` is partitioned into two parts: + // -the size of each slice we are indexing, which is the + // total size of the tensor ignoring dimension `dim`; + // -the number of index we are choosing, which is the total size + // of the tensor `index`. + const ptrdiff_t sliceSize = getSliceSize(self_, dim, index, source_); + const ptrdiff_t sourceTotalSize = source.numel(); + const int64_t selfAddDimSize = self_.size(dim); + const ptrdiff_t numIndex = index.numel(); + const int64_t selfNumel = self_.numel(); + + if (sliceSize == 0) { + return; + } + const hipStream_t stream = c10::zoom::getCurrentZoomStream(); + const bool indContig = index.is_contiguous(); + + const int mpc = at::zoom::getCurrentDeviceProperties()->multiProcessorCount; + +#define SMALL_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, SELF_DIM, SOURCE_DIM, IDX_DIM) \ + hipLaunchKernelGGL(( indexFuncSmallIndex) \ + , dim3(smallIndexGrid), dim3(smallIndexBlock), 0, stream, \ + selfInfo, sourceInfo, indexInfo, \ + selfAddDim, sourceAddDim, sliceSize, selfAddDimSize, \ + selfNumel, reduce_add, alpha_value); \ + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + +#define LARGE_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, \ + SELF_DIM, SOURCE_DIM, IDX_DIM, IDX_IS_MAJOR) \ + hipLaunchKernelGGL(( indexFuncLargeIndex) \ + , dim3(largeIndexGrid), dim3(largeIndexBlock), 0, stream, \ + selfInfo, sourceInfo, indexInfo, \ + selfAddDim, sourceAddDim, sourceTotalSize, \ + (IDX_IS_MAJOR) ? 
sliceSize : numIndex, \ + selfAddDimSize, selfNumel, reduce_add, alpha_value); \ + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + + const dim3 smallIndexGrid(::min(ceil_div(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + const dim3 smallIndexBlock(::min(sliceSize, (ptrdiff_t)128)); + + const dim3 largeIndexGrid(::min(ceil_div(sourceTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + const dim3 largeIndexBlock(::min(sourceTotalSize, (ptrdiff_t)128)); + + if (zoom::detail::canUse32BitIndexMath(result) && + zoom::detail::canUse32BitIndexMath(source) && + zoom::detail::canUse32BitIndexMath(index)) { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(at::ScalarType::Bool, at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::ComplexHalf, result.scalar_type(), "index_add", [&] { + zoom::detail::TensorInfo selfInfo = + zoom::detail::getTensorInfo(self_); + const int selfAddDim = selfInfo.collapseDims(dim); + selfInfo.reduceDim(selfAddDim); + const auto alpha_value = alpha.to(); + AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_add_zoom_", [&] () { + auto sourceInfo = + zoom::detail::getTensorInfo(source_); + const int sourceAddDim = sourceInfo.collapseDims(dim); + sourceInfo.reduceDim(sourceAddDim); + + auto indexInfo = + zoom::detail::getTensorInfo(index); + indexInfo.collapseDims(); + + // A reasonable choice for when to have each thread iterate over + // index to choose + if (numIndex <= 16) { + if (selfInfo.dims == 1 && sourceInfo.dims == 1 && indContig) { + SMALL_INDEX(scalar_t, index_t, unsigned int, 1, 1, -2); + } else if (selfInfo.dims == 2 && sourceInfo.dims == 2 && indContig) { + SMALL_INDEX(scalar_t, index_t, unsigned int, 2, 2, -2); + } else if (selfInfo.dims == 3 && sourceInfo.dims == 3 && indContig) { + SMALL_INDEX(scalar_t, index_t, unsigned int, 3, 3, -2); + } else { + SMALL_INDEX(scalar_t, index_t, unsigned int, -1, -1, -1); + } + } else { + const bool indexIsMajor = indexShouldBeMajor(selfInfo, selfAddDim); + + if (selfInfo.dims == 1 && sourceInfo.dims == 1 && indContig) { + LARGE_INDEX(scalar_t, index_t, unsigned int, 1, 1, -2, true); + } else if (selfInfo.dims == 2 && sourceInfo.dims == 2 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(scalar_t, index_t, unsigned int, 2, 2, -2, true); + } else { + LARGE_INDEX(scalar_t, index_t, unsigned int, 2, 2, -2, false); + } + } else if (selfInfo.dims == 3 && sourceInfo.dims == 3 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(scalar_t, index_t, unsigned int, 3, 3, -2, true); + } else { + LARGE_INDEX(scalar_t, index_t, unsigned int, 3, 3, -2, false); + } + } else { + LARGE_INDEX(scalar_t, index_t, unsigned int, -1, -1, -1, true); + } + } + }); + }); + } else { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Bool, at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "index_add", [&] { + zoom::detail::TensorInfo selfInfo = + zoom::detail::getTensorInfo(self_); + const int selfAddDim = selfInfo.collapseDims(dim); + selfInfo.reduceDim(selfAddDim); + const auto alpha_value = alpha.to(); + + zoom::detail::TensorInfo sourceInfo = + zoom::detail::getTensorInfo(source_); + const int sourceAddDim = sourceInfo.collapseDims(dim); + sourceInfo.reduceDim(sourceAddDim); + + AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_add_zoom_", [&] () { + zoom::detail::TensorInfo indexInfo = + zoom::detail::getTensorInfo(index); + indexInfo.collapseDims(); + + LARGE_INDEX(scalar_t, index_t, uint64_t, -1, -1, -1, true); + }); + }); + } + +#undef SMALL_INDEX +#undef LARGE_INDEX +} + +template +void index_reduce_func_zoom_impl( + 
const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& source, + bool include_self, + const ReductionType& reduce, + const func_t& reduce_func, + const Tensor& result) { + globalContext().alertNotDeterministic("index_reduce_zoom"); + + if (!result.is_same(self)) result.copy_(self); + + // Scalars are treated as 1-d tensor + Tensor self_ = (result.dim() == 0) ? result.view(1) : result; + Tensor source_ = (source.dim() == 0) ? source.view(1) : source; + + TORCH_CHECK(result.dim() <= MAX_TENSORINFO_DIMS, "tensor has too many (>", MAX_TENSORINFO_DIMS, ") dims"); + TORCH_CHECK(source.dim() <= MAX_TENSORINFO_DIMS, "tensor has too many (>", MAX_TENSORINFO_DIMS, ") dims" ); + TORCH_CHECK(index.dim() <= MAX_TENSORINFO_DIMS, "tensor has too many (>", MAX_TENSORINFO_DIMS, ") dims"); + + if (!include_self) { + AT_DISPATCH_ALL_TYPES_AND2( + at::ScalarType::Half, at::ScalarType::BFloat16, + self.scalar_type(), "index_reduce_func_zoom_exclude_input_init", [&] { + scalar_t init_val; + switch (reduce) { + case ReductionType::PROD: + init_val = (scalar_t)1; + break; + case ReductionType::MAX: + init_val = std::numeric_limits::has_infinity ? -std::numeric_limits::infinity() + : std::numeric_limits::lowest(); + break; + case ReductionType::MIN: + init_val = std::numeric_limits::has_infinity ? std::numeric_limits::infinity() + : std::numeric_limits::max(); + break; + default: + init_val = (scalar_t)0; + break; + } + // index_fill_ requires index to be a LongTensor + self_.index_fill_(dim, index.to(at::ScalarType::Long), init_val); + }); + } + + // The `source` is partitioned into two parts: + // -the size of each slice we are indexing, which is the + // total size of the tensor ignoring dimension `dim`; + // -the number of index we are choosing, which is the total size + // of the tensor `index`. + ptrdiff_t sliceSize = getSliceSize(self_, dim, index, source_); + ptrdiff_t sourceTotalSize = source.numel(); + int64_t selfReduceDimSize = self_.size(dim); + ptrdiff_t numIndex = index.numel(); + int64_t selfNumel = self_.numel(); + + if (sliceSize == 0) { + return; + } + const hipStream_t stream = c10::zoom::getCurrentZoomStream(); + bool indContig = index.is_contiguous(); + + int mpc = at::zoom::getCurrentDeviceProperties()->multiProcessorCount; + +#define SMALL_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, SELF_DIM, SOURCE_DIM, IDX_DIM) \ + hipLaunchKernelGGL(( indexFuncSmallIndex) \ + , dim3(smallIndexGrid), dim3(smallIndexBlock), 0, stream, \ + selfInfo, sourceInfo, indexInfo, \ + selfReduceDim, sourceReduceDim, sliceSize, selfReduceDimSize, \ + selfNumel, reduce_func, alpha_value); \ + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + +#define LARGE_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, \ + SELF_DIM, SOURCE_DIM, IDX_DIM, IDX_IS_MAJOR) \ + hipLaunchKernelGGL(( indexFuncLargeIndex) \ + , dim3(largeIndexGrid), dim3(largeIndexBlock), 0, stream, \ + selfInfo, sourceInfo, indexInfo, \ + selfReduceDim, sourceReduceDim, sourceTotalSize, \ + (IDX_IS_MAJOR) ? 
sliceSize : numIndex, \ + selfReduceDimSize, selfNumel, reduce_func, alpha_value); \ + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + + dim3 smallIndexGrid(::min(ceil_div(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 smallIndexBlock(::min(sliceSize, (ptrdiff_t)128)); + + dim3 largeIndexGrid(::min(ceil_div(sourceTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 largeIndexBlock(::min(sourceTotalSize, (ptrdiff_t)128)); + + if (zoom::detail::canUse32BitIndexMath(result) && + zoom::detail::canUse32BitIndexMath(source) && + zoom::detail::canUse32BitIndexMath(index)) { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, result.scalar_type(), "index_reduce", [&] { + zoom::detail::TensorInfo selfInfo = + zoom::detail::getTensorInfo(self_); + int selfReduceDim = selfInfo.collapseDims(dim); + selfInfo.reduceDim(selfReduceDim); + auto alpha_value = (scalar_t) 1; + AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_reduce_zoom", [&] () { + auto sourceInfo = + zoom::detail::getTensorInfo(source_); + int sourceReduceDim = sourceInfo.collapseDims(dim); + sourceInfo.reduceDim(sourceReduceDim); + + auto indexInfo = + zoom::detail::getTensorInfo(index); + indexInfo.collapseDims(); + + // A reasonable choice for when to have each thread iterate over + // index to choose + if (numIndex <= 16) { + if (selfInfo.dims == 1 && sourceInfo.dims == 1 && indContig) { + SMALL_INDEX(scalar_t, index_t, unsigned int, 1, 1, -2); + } else if (selfInfo.dims == 2 && sourceInfo.dims == 2 && indContig) { + SMALL_INDEX(scalar_t, index_t, unsigned int, 2, 2, -2); + } else if (selfInfo.dims == 3 && sourceInfo.dims == 3 && indContig) { + SMALL_INDEX(scalar_t, index_t, unsigned int, 3, 3, -2); + } else { + SMALL_INDEX(scalar_t, index_t, unsigned int, -1, -1, -1); + } + } else { + bool indexIsMajor = indexShouldBeMajor(selfInfo, selfReduceDim); + + if (selfInfo.dims == 1 && sourceInfo.dims == 1 && indContig) { + LARGE_INDEX(scalar_t, index_t, unsigned int, 1, 1, -2, true); + } else if (selfInfo.dims == 2 && sourceInfo.dims == 2 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(scalar_t, index_t, unsigned int, 2, 2, -2, true); + } else { + LARGE_INDEX(scalar_t, index_t, unsigned int, 2, 2, -2, false); + } + } else if (selfInfo.dims == 3 && sourceInfo.dims == 3 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(scalar_t, index_t, unsigned int, 3, 3, -2, true); + } else { + LARGE_INDEX(scalar_t, index_t, unsigned int, 3, 3, -2, false); + } + } else { + LARGE_INDEX(scalar_t, index_t, unsigned int, -1, -1, -1, true); + } + } + }); + }); + } else { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "index_reduce", [&] { + zoom::detail::TensorInfo selfInfo = + zoom::detail::getTensorInfo(self_); + int selfReduceDim = selfInfo.collapseDims(dim); + selfInfo.reduceDim(selfReduceDim); + auto alpha_value = (scalar_t) 1; + + zoom::detail::TensorInfo sourceInfo = + zoom::detail::getTensorInfo(source_); + int sourceReduceDim = sourceInfo.collapseDims(dim); + sourceInfo.reduceDim(sourceReduceDim); + + AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_reduce_zoom", [&] () { + zoom::detail::TensorInfo indexInfo = + zoom::detail::getTensorInfo(index); + indexInfo.collapseDims(); + + LARGE_INDEX(scalar_t, index_t, uint64_t, -1, -1, -1, true); + }); + }); + } + +#undef SMALL_INDEX +#undef LARGE_INDEX +} + +TORCH_IMPL_FUNC(index_add_zoom_out) +(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& source, const Scalar& alpha, const Tensor& result) { + 
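+  // Structured-kernel entry point: all the work is done by
+  // index_add_zoom_impl above; this wrapper only forwards its arguments.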
index_add_zoom_impl(self, dim, index, source, alpha, result); +} + +TORCH_IMPL_FUNC(index_reduce_zoom_out) +(const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& source, + const c10::string_view reduce, + bool include_self, + const Tensor& result) { + TORCH_WARN_ONCE("index_reduce() is in beta and the API may change at any time."); + + if (reduce == "prod") { + index_reduce_func_zoom_impl(self, dim, index, source, include_self, ReductionType::PROD, reduce_multiply, result); + } else if (reduce == "mean") { + index_reduce_func_zoom_impl(self, dim, index, source, include_self, ReductionType::MEAN, reduce_add, result); + auto counts = include_self ? at::ones_like(result) : at::zeros_like(result); + counts.index_add_(dim, index, at::ones_like(source)); + counts.masked_fill_(counts == 0, 1); + if (result.is_floating_point() || result.is_complex()) { + result.div_(counts); + } else { + result.div_(counts, "floor"); + } + } else if (reduce == "amax") { + index_reduce_func_zoom_impl(self, dim, index, source, include_self, ReductionType::MAX, reduce_maximum, result); + } else if (reduce == "amin") { + index_reduce_func_zoom_impl(self, dim, index, source, include_self, ReductionType::MIN, reduce_minimum, result); + } else { + TORCH_CHECK(false, "reduce argument must be either prod, mean, amax or amin, got ", reduce, "."); + } +} + +namespace { +// We prefer this kernel to avoid reloading index points if the number +// of indices is a small number. +// This kernel in fact works for all choices of problem size, but if +// the number of indices chosen is large, then the +// indexSelectLargeIndex kernel is a better choice to increase +// parallelism. +template +__global__ void indexSelectSmallIndex(zoom::detail::TensorInfo dst, + zoom::detail::TensorInfo src, + zoom::detail::TensorInfo indices, + int dstSelectDim, + int srcSelectDim, + IndexType innerSize, + int64_t srcSelectDimSize) { + // In order to avoid reloading the index that we are copying, load + // it once to handle all of the points that are being selected, so + // it can be reused as much as possible. This kernel is chosen when + // this is a good choice (small number of chosen indices), since + // re-accessing indices in addition to src elements can be slow. + for (IndexType dstIndex = 0; dstIndex < indices.sizes[0]; ++dstIndex) { + IndexType srcIndex = + indices.data[zoom::detail::IndexToOffset::get(dstIndex, indices)]; + ZOOM_KERNEL_ASSERT(srcIndex < srcSelectDimSize); + + // We stride over the output ignoring the indexed dimension + // (innerSize), whose offset calculation is handled differently + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < innerSize; + linearIndex += gridDim.x * blockDim.x) { + IndexType dstOffset = + zoom::detail::IndexToOffset::get(linearIndex, dst); + dstOffset += dstIndex * dst.strides[dstSelectDim]; + + IndexType srcOffset = + zoom::detail::IndexToOffset::get(linearIndex, src); + srcOffset += srcIndex * src.strides[srcSelectDim]; + + dst.data[dstOffset] = src.data[srcOffset]; + } + } +} + +// We prefer this kernel to balance parallelism across index points, +// if there are a large number of indices. +// This kernel in fact works for all choices of problem size, but if +// the number of indices chosen is small, then the +// indexSelectSmallIndex kernel is a better choice to reduce memory +// accesses. 
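+// When IndexIsMajor is true the flattened loop index maps to
+// (dstIndex, elementInSlice) = (linearIndex / innerSize, linearIndex % innerSize);
+// e.g. with innerSize == 4, linearIndex 10 selects slice 2, element 2. When it
+// is false, the two roles are swapped.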
+template +__global__ void indexSelectLargeIndex(zoom::detail::TensorInfo dst, + zoom::detail::TensorInfo src, + zoom::detail::TensorInfo indices, + int dstSelectDim, + int srcSelectDim, + IndexType totalSize, + IndexType innerSize, + int64_t srcSelectDimSize) { + // We stride over the output including the indexed dimension + // (totalSize), and calculate the destination index point based on that + for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x; + linearIndex < totalSize; + linearIndex += gridDim.x * blockDim.x) { + IndexType dstIndex, elementInSlice; + if (IndexIsMajor) { + dstIndex = linearIndex / innerSize; + elementInSlice = linearIndex % innerSize; + } + else { + elementInSlice = linearIndex / innerSize; + dstIndex = linearIndex % innerSize; + } + + IndexType srcIndex = + indices.data[zoom::detail::IndexToOffset::get(dstIndex, indices)]; + ZOOM_KERNEL_ASSERT(srcIndex < srcSelectDimSize); + + IndexType dstOffset = + zoom::detail::IndexToOffset::get(elementInSlice, dst); + dstOffset += dstIndex * dst.strides[dstSelectDim]; + + IndexType srcOffset = + zoom::detail::IndexToOffset::get(elementInSlice, src); + srcOffset += srcIndex * src.strides[srcSelectDim]; + + dst.data[dstOffset] = src.data[srcOffset]; + } +} + +namespace { + +// When using a 0-dim scalar tensor, we need the legacy (THC) semantics of +// TensorInfo: Pretend that the scalar tensor is in fact a one-element vector. +template +zoom::detail::TensorInfo +tensorInfoLegacyIfScalar(zoom::detail::TensorInfo ti) { + if (ti.dims == 0) { + ti.dims = 1; + ti.sizes[0] = 1; + ti.strides[0] = 1; + } + return ti; +} + +} + +template +void index_select_out_zoom_impl( + Tensor& out, + const Tensor& self, + long dim, + const Tensor& index) { + ptrdiff_t numIndices = index.numel(); + int selfDims = self.dim() == 0 ? 1 : self.dim(); + + const hipStream_t stream = c10::zoom::getCurrentZoomStream(); + + TORCH_CHECK( + index.dim() <= 1, "Index is supposed to be an empty tensor or a vector"); + TORCH_CHECK( + !(self.dim() == 0 && numIndices != 1), "index_select(): Index to scalar can have only 1 value, got ", numIndices, " value(s)"); + TORCH_CHECK(dim < selfDims, "Indexing dim is out of bounds"); + + std::vector newSize = self.sizes().vec(); + if (self.dim() > 0) { + newSize[dim] = numIndices; + } + + if (self.is_quantized()){ + out = at::empty_quantized(newSize, out); + } else { + at::native::resize_output(out, newSize); + } + + ptrdiff_t outTotalSize = out.numel(); + if (outTotalSize == 0) { + return; + } + + bool indContig = index.is_contiguous(); + + // The `self` is partitioned into two parts: + // -the size of each slice we are indexing, which is the + // total size of the tensor ignoring dimension `dim`; + // -the number of indices we are choosing, which is the total size + // of the tensor `indices`. + int64_t selfSelectDimSize = self.dim() == 0 ? 
1 : self.size(dim); + ptrdiff_t sliceSize = outTotalSize / numIndices; + + int mpc = at::zoom::getCurrentDeviceProperties()->multiProcessorCount; + +#define SMALL_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ + hipLaunchKernelGGL(( indexSelectSmallIndex) \ + , dim3(smallIndexGrid), dim3(smallIndexBlock), 0, stream, \ + outInfo, selfInfo, indicesInfo, \ + outSelectDim, selfSelectDim, static_cast(sliceSize), \ + selfSelectDimSize); \ + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + +#define LARGE_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, \ + DST_DIM, SRC_DIM, IDX_DIM, IDX_IS_MAJOR) \ + hipLaunchKernelGGL(( indexSelectLargeIndex) \ + , dim3(largeIndexGrid), dim3(largeIndexBlock), 0, stream, \ + outInfo, selfInfo, indicesInfo, \ + outSelectDim, selfSelectDim, static_cast(outTotalSize), \ + static_cast((IDX_IS_MAJOR) ? sliceSize : numIndices), \ + selfSelectDimSize); \ + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + + dim3 smallIndexGrid(::min(ceil_div(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 smallIndexBlock(::min(sliceSize, (ptrdiff_t)128)); + + dim3 largeIndexGrid(::min(ceil_div(outTotalSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); + dim3 largeIndexBlock(::min(outTotalSize, (ptrdiff_t)128)); + if (zoom::detail::canUse32BitIndexMath(out) && + zoom::detail::canUse32BitIndexMath(self) && + zoom::detail::canUse32BitIndexMath(index)) { + auto outInfo = tensorInfoLegacyIfScalar(zoom::detail::getTensorInfo(out)); + int outSelectDim = outInfo.collapseDims(dim); + outInfo.reduceDim(outSelectDim); + + auto selfInfo = tensorInfoLegacyIfScalar(zoom::detail::getTensorInfo(self)); + int selfSelectDim = selfInfo.collapseDims(dim); + selfInfo.reduceDim(selfSelectDim); + + AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_select_out_zoom_impl", [&] () { + auto indicesInfo = tensorInfoLegacyIfScalar(zoom::detail::getTensorInfo(index)); + indicesInfo.collapseDims(); + + // A reasonable choice for when to have each thread iterate over + // indices to choose + if (numIndices <= 16) { + if (outInfo.dims == 1 && selfInfo.dims == 1 && indContig) { + SMALL_INDEX(scalar_t, index_t, unsigned int, 1, 1, -2); + } else if (outInfo.dims == 2 && selfInfo.dims == 2 && indContig) { + SMALL_INDEX(scalar_t, index_t, unsigned int, 2, 2, -2); + } else if (outInfo.dims == 3 && selfInfo.dims == 3 && indContig) { + SMALL_INDEX(scalar_t, index_t, unsigned int, 3, 3, -2); + } else { + SMALL_INDEX(scalar_t, index_t, unsigned int, -1, -1, -1); + } + } else { + bool indexIsMajor = indexShouldBeMajor(outInfo, outSelectDim); + + if (outInfo.dims == 1 && selfInfo.dims == 1 && indContig) { + LARGE_INDEX(scalar_t, index_t, unsigned int, 1, 1, -2, true); + } else if (outInfo.dims == 2 && selfInfo.dims == 2 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(scalar_t, index_t, unsigned int, 2, 2, -2, true); + } else { + LARGE_INDEX(scalar_t, index_t, unsigned int, 2, 2, -2, false); + } + } else if (outInfo.dims == 3 && selfInfo.dims == 3 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(scalar_t, index_t, unsigned int, 3, 3, -2, true); + } else { + LARGE_INDEX(scalar_t, index_t, unsigned int, 3, 3, -2, false); + } + } else { + LARGE_INDEX(scalar_t, index_t, unsigned int, -1, -1, -1, true); + } + } + }); + } else { + auto outInfo = tensorInfoLegacyIfScalar(zoom::detail::getTensorInfo(out)); + int outSelectDim = outInfo.collapseDims(dim); + outInfo.reduceDim(outSelectDim); + + auto selfInfo = tensorInfoLegacyIfScalar(zoom::detail::getTensorInfo(self)); + int selfSelectDim = selfInfo.collapseDims(dim); + 
selfInfo.reduceDim(selfSelectDim); + AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_select_out_zoom_impl", [&] () { + auto indicesInfo = tensorInfoLegacyIfScalar(zoom::detail::getTensorInfo(index)); + indicesInfo.collapseDims(); + + LARGE_INDEX(scalar_t, index_t, uint64_t, -1, -1, -1, true); + }); + } +#undef SMALL_INDEX +#undef LARGE_INDEX +} +} // anonymous namespace + +Tensor& index_select_out_zoom( + const Tensor& self, + int64_t dim, + const Tensor& index, + Tensor& out) { + static constexpr string_view DIM_WARNING = + "Tensor too large or too many (> 25) dimensions"; + TORCH_CHECK( + at::zoom::check_device({out, self, index}), + "Input, output and indices must be on the current device"); + at::assert_no_internal_overlap(out); + at::assert_no_overlap(out, self); + at::assert_no_overlap(out, index); + + dim = at::maybe_wrap_dim(dim, self); + TORCH_CHECK(self.dim() <= MAX_TENSORINFO_DIMS, DIM_WARNING); + TORCH_CHECK(index.dim() <= MAX_TENSORINFO_DIMS, DIM_WARNING); + if (self.is_quantized()){ + TORCH_CHECK( + self.qscheme() == kPerTensorAffine, + "Only per_tensor quantized quantized tensors are supported by index_select.") + AT_DISPATCH_QINT_TYPES(out.scalar_type(), "index_select_quant_zoom", [&] { + index_select_out_zoom_impl(out, self, dim, index); + }); + } else { + AT_DISPATCH_V2( + out.scalar_type(), + "index_select_zoom", + AT_WRAP([&] { index_select_out_zoom_impl(out, self, dim, index); }), + AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), + kComplexHalf, + kHalf, + kBool, + kBFloat16 + ); + } + + return out; +} + +Tensor index_select_zoom(const Tensor& self, int64_t dim, const Tensor& index) { + Tensor out = at::empty({0}, self.options()); + at::native::index_select_out_zoom(self, dim, index, out); + return out; +} + +Tensor index_select_quantized_zoom(const Tensor& self, int64_t dim, const Tensor& index) { + TORCH_CHECK( + self.qscheme() == kPerTensorAffine, + "Only per_tensor quantized quantized tensors are supported by index_select.") + Tensor out = at::empty_quantized({0}, self); + at::native::index_select_out_zoom(self, dim, index, out); + return out; +} + +namespace { + +void masked_fill_kernel(TensorIterator& iter, const Scalar& value) { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + kBool, kHalf, kBFloat16, kComplexHalf, iter.common_dtype(), "masked_fill_", [&]() { + const auto value_ = value.to(); + gpu_kernel( + iter, [value_] GPU_LAMBDA(scalar_t self, bool mask) -> scalar_t { + if (mask) { + return value_; + } + return self; + }); + }); +} + +template +void zoom_masked_fill_kernel_quantized(TensorIterator& iter, scalar_t quantized_val) { + gpu_kernel( + iter, [quantized_val] GPU_LAMBDA(scalar_t self, bool mask) -> scalar_t { + if (mask) { + return quantized_val; + } + return self; + }); +} + +void masked_fill_kernel_quantized(TensorIterator& iter, const Scalar& value, double scale, int zero_point) { + TORCH_CHECK(iter.input_dtype(1) == at::ScalarType::Bool, "masked_fill only supports boolean masks, ", + "but got dtype ", iter.input_dtype(1)); + AT_DISPATCH_QINT_TYPES( + iter.common_dtype(), "masked_fill_", [&]() { + float float_val = value.to(); + const auto quantized_val = quantize_val(scale, zero_point, float_val); + + zoom_masked_fill_kernel_quantized(iter, quantized_val); + }); +} + +REGISTER_PRIVATEUSE1_DISPATCH(masked_fill_kernel_quantized_stub, &masked_fill_kernel_quantized); + +} // anonymous namespace + +Tensor & masked_fill__zoom(Tensor & self, const Tensor & mask, const Scalar & value) { + TORCH_CHECK(self.device() == 
mask.device(), "expected self and mask to be on the same device, but got mask on ", + mask.device(), " and self on ", self.device()); + TORCH_CHECK(mask.scalar_type() == kBool, + "masked_fill only supports boolean masks, but got dtype ", mask.scalar_type()); + auto maybe_outnames = namedinference::broadcast_to_outnames(self, mask, "masked_fill_"); + if (at::has_internal_overlap(self) == MemOverlap::Yes) { + TORCH_WARN( + "Use of masked_fill_ on expanded tensors is deprecated. " + "Please clone() the tensor before performing this operation. " + "This also applies to advanced indexing e.g. tensor[mask] = scalar"); + } + at::assert_no_partial_overlap(self, mask); + + c10::MaybeOwned b_mask = expand_inplace(self, mask, "masked_fill_"); + + auto iter = TensorIteratorConfig() + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .resize_outputs(false) + .add_output(self) + .add_const_input(self) + .add_const_input(*b_mask) + .build(); + + masked_fill_kernel(iter, value); + namedinference::propagate_names_if_nonempty(self, maybe_outnames); + return self; +} + + +Tensor & masked_fill__zoom(Tensor & self, const Tensor & mask, const Tensor & value) { + TORCH_CHECK(value.dim() == 0, "masked_fill_ only supports a 0-dimensional value tensor, but got tensor " + "with ", value.dim(), " dimension(s)."); + // We hit this function if either of the input tensor lives on CUDA. + // It is ok, if `value` is `CPU` tensor but we should not allow `self` or + // `mask` to be CPU tensor. Check for `self` and `mask` being on same device + // exists in `masked_fill__zoom` (Scalar version). + TORCH_CHECK(!self.device().is_cpu(), "masked_fill_: Expected inputs to be on same device") + return masked_fill__zoom(self, mask, value.item()); +} + +namespace { + +// ForwardIt: only legacy random access iterator is supported. +template +static __host__ __device__ __forceinline__ +ForwardIt find_bound(ForwardIt first, ForwardIt last, const T& value) { + ForwardIt it; + typename std::iterator_traits::difference_type count, step; + // NOTE: std::distance(first, last) compiles but produces wrong results here, + // so only legacy random access iterators are safe in this code. + count = last - first; + + while (count > 0) { + it = first; + step = count / 2; + // avoiding std::advance(it, step), + // although it does work unlike std::distance + it += step; + if (is_lower ? 
*it < value : value >= *it) { + first = ++it; + count -= step + 1; + } + else { + count = step; + } + } + return first; +} + +} + +Tensor index_select_sparse_zoom(const Tensor& self, int64_t dim, const Tensor& index) { + const auto ndim = self.dim(); + TORCH_CHECK_INDEX(ndim, "index_select() cannot be applied to a 0-dim tensor."); + TORCH_CHECK_INDEX( + index.dim() == 1 && index.dtype() == at::kLong && index.options().layout() == at::kStrided, + "index_select() argument index must be 1-D strided (non-sparse) long-tensor."); + dim = maybe_wrap_dim(dim, ndim); + const auto size = self.size(dim); + const auto sparse_dim = self.sparse_dim(); + const auto dense_dim = self.dense_dim(); + const auto indices = self._indices(); + const auto values = self._values(); + const auto nnz = values.size(0); + const auto index_len = index.size(0); + auto res_sizes = self.sizes().vec(); + res_sizes[dim] = index_len; + + // If indexing into sparse dimensions + if (dim < sparse_dim) { + const auto make_output = [ + dim, sparse_dim, dense_dim, res_sizes, &self, &indices, &values + ]( + const Tensor& selected_dim_indices, + const Tensor& res_dim_indices + ) -> Tensor { + auto res_indices = indices.index_select(1, selected_dim_indices); + res_indices[dim] = res_dim_indices; + const auto res_values = values.index_select(0, selected_dim_indices); + + return at::_sparse_coo_tensor_with_dims_and_tensors( + sparse_dim, dense_dim, res_sizes, res_indices, res_values, self.options()); + }; + + // short-circuit if index is empty + if (!index_len) { + return make_output(index, index); + } + + const auto nneg_index = [&index, size]() -> Tensor { + auto nneg_index = at::empty_like(index, at::MemoryFormat::Contiguous); + + auto iter = TensorIteratorConfig() + .add_output(nneg_index) + .add_input(index) + .build(); + + AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_select_sparse_zoom", [&]() { + gpu_kernel(iter, [size] GPU_LAMBDA (index_t idx) -> index_t { + ZOOM_KERNEL_ASSERT(idx >= -size && idx < size + && "index_select(): index out of bounds"); + return idx < 0 ? 
idx + size : idx; + }); + }); + return nneg_index; + }(); + + const auto dim_indices = indices[dim].contiguous(); + const auto idx_nneg_index = at::arange(index_len, nneg_index.options()); + const auto idx_dim_indices = at::arange(nnz, dim_indices.options()); + + Tensor sorted_dim_indices, argsort_dim_indices; + std::tie(sorted_dim_indices, argsort_dim_indices) = [&]() -> std::tuple { + if (dim == 0 && self.is_coalesced()) { + return std::make_tuple(dim_indices, idx_dim_indices); + } + else { + return dim_indices.sort(); + } + }(); + + Tensor intrsc_counts_nneg_index; + Tensor intrsc_first_match_nneg_index; + std::tie(intrsc_counts_nneg_index, intrsc_first_match_nneg_index) = [&]() -> std::tuple { + auto intrsc_counts_nneg_index = at::zeros_like(nneg_index); + auto intrsc_first_match_nneg_index = at::zeros_like(nneg_index); + + auto iter = TensorIteratorConfig() + .add_output(intrsc_first_match_nneg_index) + .add_input(nneg_index) + .add_input(idx_nneg_index) + .build(); + + AT_DISPATCH_INDEX_TYPES(nneg_index.scalar_type(), "index_select_sparse_zoom", [&]() { + index_t* ptr_intrsc_counts_nneg_index = intrsc_counts_nneg_index.mutable_data_ptr(); + const index_t* ptr_sorted_dim_indices = sorted_dim_indices.const_data_ptr(); + gpu_kernel( + iter, + [ptr_intrsc_counts_nneg_index, ptr_sorted_dim_indices, nnz] GPU_LAMBDA ( + index_t idx_val, index_t idx_idx + ) -> index_t { + auto* lb = find_bound( + ptr_sorted_dim_indices, + ptr_sorted_dim_indices + nnz, + idx_val + ); + auto* ub = find_bound( + ptr_sorted_dim_indices, + ptr_sorted_dim_indices + nnz, + idx_val + ); + const auto idx_count = ub - lb; + ptr_intrsc_counts_nneg_index[idx_idx] = idx_count; + + return lb - ptr_sorted_dim_indices; + } + ); + }); + + return std::make_tuple(intrsc_counts_nneg_index, intrsc_first_match_nneg_index); + }(); + + // Unavoidable sync since the shape of the result is not known in advance + auto res_len = intrsc_counts_nneg_index.sum().item(); + // Short-circuit if empty intersection + if (!res_len) { + auto empty_idx = at::empty({0}, nneg_index.options()); + return make_output(empty_idx, empty_idx); + } + + Tensor selected_dim_indices, res_dim_indices; + std::tie(selected_dim_indices, res_dim_indices) = [&]() -> std::tuple { + auto res_dim_indices = at::empty({res_len}, nneg_index.options()); + auto selected_dim_indices = at::empty_like(res_dim_indices); + auto selected_dim_indices_offsets = intrsc_counts_nneg_index.cumsum(0) + .sub_(intrsc_counts_nneg_index); + + // Need to have output as TensorIterator does not allow having void lambdas. + auto dummy_output = at::empty({1}, dim_indices.options()).expand(IntArrayRef({index_len})); + auto iter = TensorIteratorConfig() + .add_output(dummy_output) + // All iterations map to a single element in dummy_output by design, + // hence removed output memory overlap check. 
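+      // Example (hypothetical values, added for illustration): the kernel built on this
+      // iterator expands each query position into its matching nnz slots. With
+      // sorted_dim_indices = [0, 0, 2, 3], argsort_dim_indices = [1, 3, 0, 2] and
+      // nneg_index = [2, 0], the previous step yields intrsc_counts_nneg_index = [1, 2]
+      // and intrsc_first_match_nneg_index = [2, 0], so
+      // selected_dim_indices_offsets = cumsum(counts) - counts = [0, 1]. The loop in the
+      // kernel below then writes res_dim_indices = [0, 1, 1] and
+      // selected_dim_indices = [argsort[2], argsort[0], argsort[1]] = [0, 1, 3].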
+ .set_check_mem_overlap(false) + .add_input(idx_nneg_index) + .add_input(intrsc_counts_nneg_index) + .add_input(selected_dim_indices_offsets) + .add_input(intrsc_first_match_nneg_index) + .build(); + + AT_DISPATCH_INDEX_TYPES(nneg_index.scalar_type(), "index_select_sparse_zoom", [&]() { + index_t* ptr_res_dim_indices = res_dim_indices.mutable_data_ptr(); + index_t* ptr_selected_dim_indices = selected_dim_indices.mutable_data_ptr(); + const index_t* ptr_argsort_dim_indices = argsort_dim_indices.const_data_ptr(); + gpu_kernel( + iter, + [ptr_res_dim_indices, ptr_selected_dim_indices, ptr_argsort_dim_indices] GPU_LAMBDA ( + index_t idx_idx, index_t count, index_t offset, index_t first_match + ) -> index_t { + index_t* __restrict__ ptr_res_dim_indices_out = ptr_res_dim_indices + offset; + const index_t* __restrict__ ptr_argsort_dim_indices_in = ptr_argsort_dim_indices + first_match; + index_t* __restrict__ ptr_selected_dim_indices_out = ptr_selected_dim_indices + offset; + for (index_t i = 0; i < count; ++i) { + *ptr_res_dim_indices_out++ = idx_idx; + *ptr_selected_dim_indices_out++ = *ptr_argsort_dim_indices_in++; + } + + // A dummy return scalar for a dummy output + return static_cast(1); + } + ); + }); + + return std::make_tuple(selected_dim_indices, res_dim_indices); + }(); + + return make_output(selected_dim_indices, res_dim_indices); + } + // If indexing into dense dimensions + else { + // It is sufficient to just perform `index_select` on values + // if `dim` refers to dense dimensions. + const auto res_values = values.index_select(dim - sparse_dim + 1, index); + + return _sparse_coo_tensor_with_dims_and_tensors( + sparse_dim, dense_dim, res_sizes, indices, res_values, self.options()); + } +} + +} // at::native diff --git a/aten/src/ATen/native/zoom/KernelUtils.cuh b/aten/src/ATen/native/zoom/KernelUtils.cuh new file mode 100644 index 00000000000000..99c66efe21ffc4 --- /dev/null +++ b/aten/src/ATen/native/zoom/KernelUtils.cuh @@ -0,0 +1,97 @@ +// !!! This is a file automatically generated by hipify!!! +#pragma once +#include + +namespace at { +namespace native { + +__device__ __forceinline__ size_t +idx(const size_t nc, + const size_t height, + const size_t width, + const size_t h, + const size_t w) { + return (nc * height + h) * width + w; +} + +// for channels-last +__device__ __forceinline__ size_t +idx_cl( + const size_t n, const size_t h, const size_t w, const size_t c, + const size_t height, const size_t width, const size_t channel +) { + return ((n * height + h) * width + w) * channel + c; +} + +// fastSpecializedAtomicAdd (and fastAtomicAdd) are an optimization +// that speed up half-precision atomics. The situation with half +// precision atomics is that we have a slow __half atomic, and +// a fast vectored __half2 atomic (this can be worth up to a 6x +// speedup, see https://github.com/pytorch/pytorch/pull/21879). +// We can convert a __half atomic into a __half2 atomic by simply +// pairing the __half with a zero entry on the left/right depending +// on alignment... but only if this wouldn't cause an out of bounds +// access! Thus, you must specify tensor and numel so we can check +// if you would be out-of-bounds and use a plain __half atomic if +// you would be. 
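+//
+// Illustrative usage (a sketch, not part of this header): a scatter-add style kernel
+// would pass the base pointer, the flattened element index, the total number of
+// elements, and a runtime flag selecting the specialized path, e.g.
+//
+//   template <typename scalar_t, typename index_t>
+//   __global__ void scatter_add_sketch(scalar_t* out, const index_t* idx,
+//                                      const scalar_t* src, index_t n, index_t out_numel) {
+//     index_t i = blockIdx.x * blockDim.x + threadIdx.x;
+//     if (i < n) {
+//       at::native::fastAtomicAdd(out, idx[i], out_numel, src[i], /*fast_atomics=*/true);
+//     }
+//   }
+//
+// `scatter_add_sketch` and its parameter names are hypothetical; only fastAtomicAdd's
+// signature below comes from this file.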
+template <
+    typename scalar_t,
+    typename index_t,
+    typename std::enable_if<std::is_same<c10::Half, scalar_t>::value>::type* =
+        nullptr>
+__device__ __forceinline__ void fastSpecializedAtomicAdd(
+    scalar_t* tensor,
+    index_t index,
+    const index_t numel,
+    scalar_t value) {
+  gpuAtomicAddNoReturn(
+      reinterpret_cast<at::Half*>(tensor) + index,
+      static_cast<at::Half>(value));
+}
+
+template <
+    typename scalar_t,
+    typename index_t,
+    typename std::enable_if<std::is_same<c10::BFloat16, scalar_t>::value>::type* =
+        nullptr>
+__device__ __forceinline__ void fastSpecializedAtomicAdd(
+    scalar_t* tensor,
+    index_t index,
+    const index_t numel,
+    scalar_t value) {
+  gpuAtomicAddNoReturn(
+      reinterpret_cast<at::BFloat16*>(tensor) + index,
+      static_cast<at::BFloat16>(value));
+}
+
+template <
+    typename scalar_t,
+    typename index_t,
+    typename std::enable_if<
+        !std::is_same<c10::Half, scalar_t>::value &&
+        !std::is_same<c10::BFloat16, scalar_t>::value>::type* =
+        nullptr>
+__device__ __forceinline__ void fastSpecializedAtomicAdd(
+    scalar_t* tensor,
+    index_t index,
+    const index_t numel,
+    scalar_t value) {
+  gpuAtomicAddNoReturn(tensor + index, value);
+}
+
+template <class scalar_t, class index_t>
+__device__ __forceinline__ void fastAtomicAdd(
+    scalar_t* tensor,
+    index_t index,
+    const index_t numel,
+    scalar_t value,
+    bool fast_atomics) {
+  if (fast_atomics) {
+    fastSpecializedAtomicAdd(tensor, index, numel, value);
+  } else {
+    gpuAtomicAddNoReturn(tensor + index, value);
+  }
+}
+
+} // namespace native
+} // namespace at
\ No newline at end of file
diff --git a/aten/src/ATen/native/zoom/LegacyThrustHelpers.cu b/aten/src/ATen/native/zoom/LegacyThrustHelpers.cu
new file mode 100644
index 00000000000000..6379b68b9479b4
--- /dev/null
+++ b/aten/src/ATen/native/zoom/LegacyThrustHelpers.cu
@@ -0,0 +1,113 @@
+// !!! This is a file automatically generated by hipify!!!
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include
+#include
+#include
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include
+#else
+#include
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace at::native {
+
+void index_put_with_sort_kernel_thrust_helper(Tensor &linearIndex, Tensor &orig_indices, Tensor &sorted_indices, int64_t num_indices) {
+  sorted_indices.copy_(linearIndex);
+  const hipStream_t stream = c10::zoom::getCurrentZoomStream();
+  at::zoom::ThrustAllocator allocator;
+  auto policy = thrust::hip::par(allocator).on(stream);
+
+  using device_ptr = thrust::device_ptr<int64_t>;
+
+  // Fill sortedOrigIndices with sequential indices
+  const auto count_iter = thrust::counting_iterator<int64_t>(0);
+  auto orig_data = device_ptr(orig_indices.mutable_data_ptr<int64_t>());
+  thrust::copy(policy, count_iter, count_iter + num_indices, orig_data);
+
+  // Sort the inputs into sorted with the corresponding indices; a stable or
+  // multidimensional sort is not required, so just use Thrust directly.
+  // NB - not passing a comparator causes thrust to use radix sort, and it hurts perf
+  // A LOT, at least for medium (few K) sized indices
+  auto sorted_data = device_ptr(sorted_indices.mutable_data_ptr<int64_t>());
+  thrust::sort_by_key(policy, sorted_data, sorted_data + num_indices, orig_data, LTOp<int64_t>());
+}
+
+#if !CUB_SUPPORTS_SCAN_BY_KEY()
+
+template <typename index_t>
+void embedding_dense_backward_zoom_scan(Tensor &sorted_indices, Tensor &count) {
+  hipStream_t stream = c10::zoom::getCurrentZoomStream();
+  at::zoom::ThrustAllocator allocator;
+  auto policy = thrust::hip::par(allocator).on(stream);
+
+  auto num_indices = count.numel();
+
+  // Compute an increasing sequence per unique item in sortedIndices:
+  // sorted: 2 5 5 5 7 7 8 9 9
+  // count:  1 1 2 3 1 2 1 1 2
+  auto sorted_data =
thrust::device_ptr(sorted_indices.const_data_ptr()); + auto count_data = thrust::device_ptr(count.mutable_data_ptr()); + thrust::inclusive_scan_by_key( + policy, + sorted_data, + sorted_data + num_indices, + thrust::make_constant_iterator(1), + count_data + ); + + // Take the maximum of each count per unique key in reverse: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 3 3 3 2 2 1 2 2 + thrust::inclusive_scan_by_key( + policy, + thrust::make_reverse_iterator(sorted_data + num_indices), + thrust::make_reverse_iterator(sorted_data), + thrust::make_reverse_iterator(count_data + num_indices), + thrust::make_reverse_iterator(count_data + num_indices), + thrust::equal_to(), + thrust::maximum() + ); +} + +template +void embedding_dense_backward_zoom_scan(Tensor &sorted_indices, Tensor &count); +template +void embedding_dense_backward_zoom_scan(Tensor &sorted_indices, Tensor &count); + +#endif + +template +int64_t embedding_backward_zoom_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets) { + auto stream = c10::zoom::getCurrentZoomStream(); + at::zoom::ThrustAllocator allocator; + auto policy = thrust::hip::par(allocator).on(stream); + const ptrdiff_t numel = sorted_indices.numel(); + auto sorted_indices_dev = thrust::device_ptr(sorted_indices.const_data_ptr()); + auto dummy = at::empty_like(sorted_indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + auto dummy_dev = thrust::device_ptr(dummy.mutable_data_ptr()); + auto ends = thrust::unique_by_key_copy( + policy, + sorted_indices_dev, + sorted_indices_dev + numel, + thrust::make_counting_iterator(0), + dummy_dev, + thrust::device_ptr(segment_offsets.mutable_data_ptr())); + return thrust::get<0>(ends) - dummy_dev; +} + +template +int64_t embedding_backward_zoom_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets); +template +int64_t embedding_backward_zoom_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets); + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/LogcumsumexpKernel.cu b/aten/src/ATen/native/zoom/LogcumsumexpKernel.cu new file mode 100644 index 00000000000000..13f8c9af5af6a4 --- /dev/null +++ b/aten/src/ATen/native/zoom/LogcumsumexpKernel.cu @@ -0,0 +1,124 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include + +#include +#include + +#include +#include + +namespace at::native { + +// custom min and max to be used in logcumsumexp for complex arguments +template +__host__ __device__ c10::complex _logcumsumexp_minmax(const c10::complex& x, const c10::complex& y) { + scalar_t xr = std::real(x); + scalar_t yr = std::real(y); + if (::isnan(yr) || (::isnan(std::imag(y)))) { + return y; + } else if (::isnan(xr) || (::isnan(std::imag(x)))) { + return x; + } else if (min) { // min + return (xr < yr) ? x : y; + } else { // max + return (xr >= yr) ? x : y; + } +} + +template +__host__ __device__ scalar_t _log_add_exp_helper(const scalar_t& x, const scalar_t& y) { + // Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp + // Using the original expression: `at::_isnan(y) ? y : std::min(x, y)` causes an error in ROCM + auto isnan_x = at::_isnan(x); + auto isnan_y = at::_isnan(y); + scalar_t min = isnan_y ? y : (isnan_x ? x : std::min(x, y)); + scalar_t max = isnan_y ? y : (isnan_x ? 
x : std::max(x, y));
+  if (min != max || ::isfinite(min)) {
+    // nan will be propagated here
+    return ::log1p(std::exp(min - max)) + max;
+  } else {
+    // special case to correctly handle infinite cases
+    return x;
+  }
+}
+
+template <typename scalar_t>
+__host__ __device__ c10::complex<scalar_t> _fast_build_exp(const c10::complex<scalar_t>& x) {
+  // complex exponential function, but implemented manually to get fast compilation time
+  // this function only handles the case where the x is finite (not inf nor nan)
+  auto xreal = std::real(x);
+  auto ximag = std::imag(x);
+  auto exp_x_abs = std::exp(xreal);
+  auto exp_x_real = exp_x_abs * std::cos(ximag);
+  auto exp_x_imag = exp_x_abs * std::sin(ximag);
+  return {exp_x_real, exp_x_imag};
+}
+
+template <typename scalar_t>
+__host__ __device__ c10::complex<scalar_t> _fast_build_exp_inf(const c10::complex<scalar_t>& x) {
+  // complex exponential function, but implemented manually to get fast compilation time
+  // this function only handles the case where the real part of x is infinite
+  auto ximag = std::imag(x);
+  auto exp_x_abs = std::numeric_limits<scalar_t>::infinity();
+  auto sin = std::sin(ximag);
+  auto cos = std::cos(ximag);
+  // special case if the angle is exactly the multiple of pi/2
+  auto exp_x_real = (cos == 0) ? (scalar_t)0.0 : exp_x_abs * cos;
+  auto exp_x_imag = (sin == 0) ? (scalar_t)0.0 : exp_x_abs * sin;
+  return {exp_x_real, exp_x_imag};
+}
+
+template <typename scalar_t>
+__host__ __device__ c10::complex<scalar_t> _log_add_exp_helper(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
+  c10::complex<scalar_t> min = _logcumsumexp_minmax<true>(x, y);
+  c10::complex<scalar_t> max = _logcumsumexp_minmax<false>(x, y);
+  scalar_t min_real = std::real(min);
+  scalar_t max_real = std::real(max);
+
+  if (::isnan(min_real) || ::isnan(std::imag(min))) {
+    // handling the "infectious" NaNs
+    return {std::numeric_limits<scalar_t>::quiet_NaN(), std::numeric_limits<scalar_t>::quiet_NaN()};
+  }
+  else if ((!::isfinite(min_real)) && (min_real == max_real)) {
+    if (min_real < 0) {
+      // handle the -inf case, the imaginary part here does not really matter as the exp(value)
+      // will be around 0.0 and the angle (i.e. the imaginary part) cannot be determined.
+ // It does not matter if we're taking the exp of this value + return min; + } else { + // handle the +inf case, we don't need the special precision for log1p for small values + // and to avoid producing nan in case of real(max) == real(min) == +inf + auto exp_min = _fast_build_exp_inf(min); + auto exp_max = _fast_build_exp_inf(max); + return ::log1p(exp_min + exp_max - 1); // log1p(x - 1) builds faster than log + } + } else { + auto minmax = min - max; + auto exp_minmax = _fast_build_exp(minmax); + return ::log1p(exp_minmax) + max; + } +} + +void launch_logcumsumexp_zoom_kernel(const TensorBase& result, const TensorBase& self, int64_t dim) { +// Compile time for CUDA-11.4 is 3x slower than with CUDA-11.6+, specifically for complex numbers +#if defined(FBCODE_CAFFE2) || defined(OVRSOURCE) +#define _LCME_DISPATCH AT_DISPATCH_FLOATING_TYPES_AND2 +#else +#define _LCME_DISPATCH AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2 +#endif + _LCME_DISPATCH(ScalarType::Half, ScalarType::BFloat16, + self.scalar_type(), "logcumsumexp_zoom", + [&]() { + using opmath_t = at::opmath_type; + scalar_t init = -std::numeric_limits::infinity(); + auto log_add_exp = [] C10_HOST_DEVICE (const scalar_t x_, const scalar_t y_) -> scalar_t { + const opmath_t x{x_}, y{y_}; + return _log_add_exp_helper(x, y); + }; + scan_dim(self, result, dim, init, log_add_exp); + }); +} + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/Math.cuh b/aten/src/ATen/native/zoom/Math.cuh new file mode 100644 index 00000000000000..c7085693ee2240 --- /dev/null +++ b/aten/src/ATen/native/zoom/Math.cuh @@ -0,0 +1,3026 @@ +#pragma once + +#include +#include +#include +#include + +namespace at { +namespace native { +// See note [Jiterator] +// TODO: elaborate in this comment on the structure of math.cuh + +const auto ndtri_string = jiterator_stringify( + /* + * This function is derived from the implementation of the digamma function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. + * + * Evaluates polynomial of degree N: + * + * 2 N + * y = C + C x + C x +...+ C x + * 0 1 2 N + * + * Coefficients are stored in reverse order: + * + * coef[0] = C , ..., coef[N] = C . + * N 0 + */ + template + T polevl(const T x, const T A[], const int len) { + // NOTE: This `polevl` is different from other `polevl` + // implementation (in PyTorch) which expect the `len` to be + // `len(A) - 1` instead of `len(A)`. + T result = 0; + for (int i = 0; i < len; ++i) { + result = result * x + A[i]; + } + return result; + } + + /* + * This function is derived from the implementation of the i1e function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. + * + * Computes the argument, x, for which the area under the Gaussian probability density function + * (integrated from minus infinity to x) is equal to y. + */ + template + T ndtri(T y0) { + + constexpr T zero = 0; + constexpr T one = 1; + + // Handles special cases + if (y0 == zero) { + return NEG_INFINITY; + } + if (y0 == one) { + return POS_INFINITY; + } + if (y0 < zero || y0 > one) { + return NAN; + } + + bool code = true; + T y = y0; + // Note: the constant 0.135... 
is equal to exp(-2) + if (y > one - T{0.13533528323661269189}) { + y = one - y; + code = false; + } + + if (y > T{0.13533528323661269189}) { + /* approximation for 0 <= |y - 0.5| <= 3/8 */ + static const T P0[5] = { + -5.99633501014107895267E1, + 9.80010754185999661536E1, + -5.66762857469070293439E1, + 1.39312609387279679503E1, + -1.23916583867381258016E0, + }; + + static const T Q0[9] = { + 1.00000000000000000000E0, + 1.95448858338141759834E0, + 4.67627912898881538453E0, + 8.63602421390890590575E1, + -2.25462687854119370527E2, + 2.00260212380060660359E2, + -8.20372256168333339912E1, + 1.59056225126211695515E1, + -1.18331621121330003142E0, + }; + + /* sqrt(2pi) */ + constexpr T s2pi = 2.50662827463100050242E0; + + y = y - T{0.5}; + const T y2 = y * y; + T x = y + y * (y2 * polevl(y2, P0, int{5}) / polevl(y2, Q0, int{9})); + return x * s2pi; + } + + T x = sqrt(T{-2.} * log(y)); + const T x0 = x - (log(x) / x); + + const T z = one / x; + T x1; + + /* y > exp(-32) = 1.2664165549e-14 */ + if (x < T{8.0}) { + /* Approximation for interval z = sqrt(-2 log y ) between 2 and 8 + * i.e., y between exp(-2) = .135 and exp(-32) = 1.27e-14. + */ + static const T P1[9] = { + 4.05544892305962419923E0, + 3.15251094599893866154E1, + 5.71628192246421288162E1, + 4.40805073893200834700E1, + 1.46849561928858024014E1, + 2.18663306850790267539E0, + -1.40256079171354495875E-1, + -3.50424626827848203418E-2, + -8.57456785154685413611E-4, + }; + + static const T Q1[9] = { + 1.00000000000000000000E0, + 1.57799883256466749731E1, + 4.53907635128879210584E1, + 4.13172038254672030440E1, + 1.50425385692907503408E1, + 2.50464946208309415979E0, + -1.42182922854787788574E-1, + -3.80806407691578277194E-2, + -9.33259480895457427372E-4, + }; + + x1 = z * polevl(z, P1, int{9}) / polevl(z, Q1, int{9}); + } else { + /* Approximation for interval z = sqrt(-2 log y ) between 8 and 64 + * i.e., y between exp(-32) = 1.27e-14 and exp(-2048) = 3.67e-890. + */ + static const T P2[9] = { + 3.23774891776946035970E0, + 6.91522889068984211695E0, + 3.93881025292474443415E0, + 1.33303460815807542389E0, + 2.01485389549179081538E-1, + 1.23716634817820021358E-2, + 3.01581553508235416007E-4, + 2.65806974686737550832E-6, + 6.23974539184983293730E-9, + }; + + static const T Q2[9] = { + 1.00000000000000000000E0, + 6.02427039364742014255E0, + 3.67983563856160859403E0, + 1.37702099489081330271E0, + 2.16236993594496635890E-1, + 1.34204006088543189037E-2, + 3.28014464682127739104E-4, + 2.89247864745380683936E-6, + 6.79019408009981274425E-9, + }; + + x1 = z * polevl(z, P2, int{9}) / polevl(z, Q2, int{9}); + } + + x = x0 - x1; + return (!code) ? x : -x; + } +); // ndtri_string + +const auto log_ndtr_string = jiterator_stringify( + template + T log_ndtr(T x) { + constexpr T SQRT1_2{0.707106781186547524400844362104849039}; // 1/sqrt(2) + T t = x * SQRT1_2; + if (x < T{-1.0}) { + return log(erfcx(-t) / 2) - t * t; + } else { + return log1p(-erfc(t) / 2); + } + } +); // log_ndtr_string + +const auto gcd_string = jiterator_stringify( + template + T gcd(const T a_in, const T b_in) { + T a = abs(a_in); + T b = abs(b_in); + + while (a != T{0}) { + T c = a; + a = b % a; + b = c; + } + + return b; + } +); // gcd_string + +const auto lcm_string = jiterator_stringify( + template + T gcd(const T a_in, const T b_in) { + T a = abs(a_in); + T b = abs(b_in); + + while (a != T{0}) { + T c = a; + a = b % a; + b = c; + } + + return b; + } + + template + T lcm(const T a, const T b) { + T g = gcd(a, b); + return (g == T{0}) ? 
T{0} : abs(a / g * b); + } +); // lcm_string + +/* + * For licensing information, please refer to the cpu implementation located in "ATen/native/Math.h". + */ +// [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma +const auto digamma_string = jiterator_stringify( + template + T digamma(T x) { + static const double PI_f64 = 3.14159265358979323846; + + // Short-circuits if x is +/- 0 and returns -/+ ∞ per the C++ standard + if (x == 0) { + return copysign(POS_INFINITY, -x); + } + + T result = 0; + if (x < 0) { + // Short-circuits if x is a negative integer and returns NaN + // per the C++ standard + const bool x_is_integer = (x == trunc(x)); + if (x_is_integer) { + return NAN; + } + + // Extracts the fractional part of x as r, since tan(pi * r) is more numerically + // accurate than tan(pi * x). While these operations are mathematically equivalent + // since both x and r are in radians and tan() has a periodicity of pi, in practice + // the computation of pi * x is a source of error (when |x| > 1). + double q, r; + r = modf(static_cast(x), &q); + result = - PI_f64 / tan(PI_f64 * r); + x = 1 - x; + } + + while (x < T{10}) { + result -= T{1} / x; + x += T{1}; + } + + if (x == T{10}) { + return result + T{2.25175258906672110764}; + } + + T y = 0; + if (x < T{1.0e17}) { + const T A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2, + }; + + + T z = T{1} / (x * x); + + T polevl_result = 0; + for (int i = 0; i <= 6; i++) { + polevl_result = polevl_result * z + A[i]; + } + y = z * polevl_result; + } + + return log(x) - (T{0.5} / x) - y + result; + } +); // digamma_string + +/* + * This function is derived from the implementation of the zeta function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. 
+ */ +const auto zeta_string = jiterator_stringify( + template + T zeta(T x, T q) { + const T MACHEP{1.11022302462515654042E-16}; + constexpr T zero{0}; + constexpr T half{0.5}; + constexpr T one{1}; + static const T A[] = { + 12.0, + -720.0, + 30240.0, + -1209600.0, + 47900160.0, + -1.8924375803183791606e9, /*1.307674368e12/691*/ + 7.47242496e10, + -2.950130727918164224e12, /*1.067062284288e16/3617*/ + 1.1646782814350067249e14, /*5.109094217170944e18/43867*/ + -4.5979787224074726105e15, /*8.028576626982912e20/174611*/ + 1.8152105401943546773e17, /*1.5511210043330985984e23/854513*/ + -7.1661652561756670113e18 /*1.6938241367317436694528e27/236364091*/ + }; + + int i = 0; + T a, b, k, s, t, w; + + // Short-circuits x -> +infty + if (x == one) { + return POS_INFINITY; + } + + // Short-circuits x < 1 -> NaN + if (x < one) { + return NAN; + } + + // Short-circuits negative q integers map to +infty, + // negative q non-integers map to NaN + if (q <= zero) { + if (q == floor(q)) { + return POS_INFINITY; + } + if (x != floor(x)) { + return NAN; + } + } + + s = pow(q, -x); + a = q; + i = 0; + b = zero; + while ((i < 9) || (a <= T{9.0})) { + i += 1; + a += one; + b = pow(a, -x); + s += b; + if ((-MACHEP * s < b) && (b < MACHEP * s)) { + return s; + } + }; + + w = a; + s += b * w / (x - one); + s -= half * b; + a = one; + k = zero; + for (int i = 0; i < 12; i++) { + a *= x + k; + b /= w; + t = a * b / A[i]; + s = s + t; + t = fabs(t / s); + + if (t < MACHEP) { + return s; + } + + k += one; + a *= x + k; + b /= w; + k += one; + } + + return s; + } +); // zeta_string + +const auto trigamma_string = jiterator_stringify( + template + T trigamma(T x) { + const T PI{3.14159265358979323846}; + T sign = 1; + T result = 0; + + if (x < T{0.5}) { + sign = -1; + T sin_pi_x = sin(PI * x); + result -= (PI * PI) / (sin_pi_x * sin_pi_x); + x = 1 - x; + } + + for (int i = 0; i < 6; ++i) { + result += T{1} / (x * x); + x += 1; + } + + const T one{1}; + const T ixx = one / (x*x); + result += (one + one / (T{2}*x) + ixx * (one/T{6} - ixx * (one/T{30} - ixx * (one/T{42})))) / x; + return sign * result; +} +); // trigamma_string + +const auto lgamma_string = jiterator_stringify( + template + T lgamma_kernel(T a) { + return lgamma(a); + } +); // lgamma_string + +const auto polygamma_string = zeta_string + jiterator_stringify( + template + T polygamma(T x, int n) { + // already blocked if n <= 1 + const auto one = T{1}; + return ((n % 2) ? 
one : -one) * exp(lgamma(static_cast(n) + one)) * + zeta(static_cast(n + 1), x); + } +); // polygamma_string + +const auto exp2_string = jiterator_stringify( + template + T exp2_impl(T a) { + return exp2(a); + } + + namespace std { template class complex; } + template + std::complex exp2_impl(std::complex x) { + // There is no std::exp2 overload for complex, so instead + // use the identity 2^x = e^(ln(2) * x) + const auto ln_2 = static_cast(0.693147180559945309417232121458176); + return exp(ln_2 * x); + } + + template + T exp2_kernel(T a) { + return exp2_impl(a); + } +); // exp2_string + +const auto erfc_string = jiterator_stringify( + template + T erfc_kernel(T a) { + return erfc(a); + } +); // erfc_string + +const auto erfinv_string = jiterator_stringify( + template + T erfinv_kernel(T a) { + return erfinv(a); + } +); // erfinv_string + +const auto entr_string = jiterator_stringify( + template + T entr(T a) { + if (a != a) { + return a; + } + + if (a > 0) { + return -a * log(a); + } + + if (a == 0) { + return 0; + } + + return NEG_INFINITY; + } +); // entr_string + +// NOTE: `kaiser_window_string` depends on `i0_string` +// for its implementation. +const auto i0_string = jiterator_stringify( + template + T chbevl(T x, const T array[], const int len) { + + T b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (int i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = x * b1 - b2 + array[i]; + } + + return T{0.5} * (b0 - b2); + } + + template + T i0(T _x) { + T x = fabs(_x); + + if (x <= T{8.0}) { + /* Chebyshev coefficients for exp(-x) I0(x) + * in the interval [0,8]. + * + * lim(x->0){ exp(-x) I0(x) } = 1. + */ + static const T A[] = { + -4.41534164647933937950E-18, 3.33079451882223809783E-17, + -2.43127984654795469359E-16, 1.71539128555513303061E-15, + -1.16853328779934516808E-14, 7.67618549860493561688E-14, + -4.85644678311192946090E-13, 2.95505266312963983461E-12, + -1.72682629144155570723E-11, 9.67580903537323691224E-11, + -5.18979560163526290666E-10, 2.65982372468238665035E-9, + -1.30002500998624804212E-8, 6.04699502254191894932E-8, + -2.67079385394061173391E-7, 1.11738753912010371815E-6, + -4.41673835845875056359E-6, 1.64484480707288970893E-5, + -5.75419501008210370398E-5, 1.88502885095841655729E-4, + -5.76375574538582365885E-4, 1.63947561694133579842E-3, + -4.32430999505057594430E-3, 1.05464603945949983183E-2, + -2.37374148058994688156E-2, 4.93052842396707084878E-2, + -9.49010970480476444210E-2, 1.71620901522208775349E-1, + -3.04682672343198398683E-1, 6.76795274409476084995E-1}; + + T y = (x / T{2.0}) - T{2.0}; + return exp(x) * chbevl(y, A, int{30}); + } + + // Handles x > 8 case + /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x) + * in the inverted interval [8,infinity]. + * + * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi). 
+ */ + const T B[] = { + -7.23318048787475395456E-18, -4.83050448594418207126E-18, + 4.46562142029675999901E-17, 3.46122286769746109310E-17, + -2.82762398051658348494E-16, -3.42548561967721913462E-16, + 1.77256013305652638360E-15, 3.81168066935262242075E-15, + -9.55484669882830764870E-15, -4.15056934728722208663E-14, + 1.54008621752140982691E-14, 3.85277838274214270114E-13, + 7.18012445138366623367E-13, -1.79417853150680611778E-12, + -1.32158118404477131188E-11, -3.14991652796324136454E-11, + 1.18891471078464383424E-11, 4.94060238822496958910E-10, + 3.39623202570838634515E-9, 2.26666899049817806459E-8, + 2.04891858946906374183E-7, 2.89137052083475648297E-6, + 6.88975834691682398426E-5, 3.36911647825569408990E-3, + 8.04490411014108831608E-1}; + + return (exp(x) * chbevl(T{32.0} / x - T{2.0}, B, int{25})) / sqrt(x); + } +); // i0_string + +const auto i1_string = jiterator_stringify( + template + T chbevl(const T x, const T array[], const int len) { + T b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (int i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = x * b1 - b2 + array[i]; + } + + return T{0.5} * (b0 - b2); + } + + template + T i1(T _x) { + const T x = fabs(_x); + + if (x <= T{8.0}) { + // Chebyshev coefficients for exp(-x) i1(x) in the internal [0, 8] + // lim(x->0){ exp(-x) i1(x) / x } = 1/2 + static const T coefficients[] = { + 2.77791411276104639959E-18, -2.11142121435816608115E-17, + 1.55363195773620046921E-16, -1.10559694773538630805E-15, + 7.60068429473540693410E-15, -5.04218550472791168711E-14, + 3.22379336594557470981E-13, -1.98397439776494371520E-12, + 1.17361862988909016308E-11, -6.66348972350202774223E-11, + 3.62559028155211703701E-10, -1.88724975172282928790E-9, + 9.38153738649577178388E-9, -4.44505912879632808065E-8, + 2.00329475355213526229E-7, -8.56872026469545474066E-7, + 3.47025130813767847674E-6, -1.32731636560394358279E-5, + 4.78156510755005422638E-5, -1.61760815825896745588E-4, + 5.12285956168575772895E-4, -1.51357245063125314899E-3, + 4.15642294431288815669E-3, -1.05640848946261981558E-2, + 2.47264490306265168283E-2, -5.29459812080949914269E-2, + 1.02643658689847095384E-1, -1.76416518357834055153E-1, + 2.52587186443633654823E-1}; + const T y = x / T{2.0} - T{2.0}; + const T out = exp(x) * x * chbevl(y, coefficients, int{29}); + return (_x < T{0.0}) ? -out : out; + } + + // Chebyshev coefficients for exp(-x) sqrt(x) i1(x) + // in the inverted interval [8, infinity] + // lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi) + static const T coefficients[] = { + 7.51729631084210481353E-18, 4.41434832307170791151E-18, + -4.65030536848935832153E-17, -3.20952592199342395980E-17, + 2.96262899764595013876E-16, 3.30820231092092828324E-16, + -1.88035477551078244854E-15, -3.81440307243700780478E-15, + 1.04202769841288027642E-14, 4.27244001671195135429E-14, + -2.10154184277266431302E-14, -4.08355111109219731823E-13, + -7.19855177624590851209E-13, 2.03562854414708950722E-12, + 1.41258074366137813316E-11, 3.25260358301548823856E-11, + -1.89749581235054123450E-11, -5.58974346219658380687E-10, + -3.83538038596423702205E-9, -2.63146884688951950684E-8, + -2.51223623787020892529E-7, -3.88256480887769039346E-6, + -1.10588938762623716291E-4, -9.76109749136146840777E-3, + 7.78576235018280120474E-1}; + const T out = (exp(x) * chbevl(T{32.} / x - T{2.}, coefficients, int{25})) / sqrt(x); + return (_x < T{0.}) ? 
-out : out; + } +); // i1_string + +const auto i1e_string = jiterator_stringify( + template + T chbevl(const T x, const T array[], const int len) { + T b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (int i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = x * b1 - b2 + array[i]; + } + + return T{0.5} * (b0 - b2); + } + + // See double and float instantiations below + template + T i1e(T _x) { } + + // Double specialization (uses different coefficients than the float version) + template<> + double i1e(double _x) { + const double x = fabs(_x); + if (x <= double{8.}) { + // Chebyshev double coefficients for exp(-x) i1(x) in the interval [0,8]. + // Note: lim(x->0){ exp(-x) i1(x) / x } = 1/2. + static const double coefficients[] = { + 2.77791411276104639959E-18, -2.11142121435816608115E-17, + 1.55363195773620046921E-16, -1.10559694773538630805E-15, + 7.60068429473540693410E-15, -5.04218550472791168711E-14, + 3.22379336594557470981E-13, -1.98397439776494371520E-12, + 1.17361862988909016308E-11, -6.66348972350202774223E-11, + 3.62559028155211703701E-10, -1.88724975172282928790E-9, + 9.38153738649577178388E-9, -4.44505912879632808065E-8, + 2.00329475355213526229E-7, -8.56872026469545474066E-7, + 3.47025130813767847674E-6, -1.32731636560394358279E-5, + 4.78156510755005422638E-5, -1.61760815825896745588E-4, + 5.12285956168575772895E-4, -1.51357245063125314899E-3, + 4.15642294431288815669E-3, -1.05640848946261981558E-2, + 2.47264490306265168283E-2, -5.29459812080949914269E-2, + 1.02643658689847095384E-1, -1.76416518357834055153E-1, + 2.52587186443633654823E-1}; + const double y = x / double{2.} - double{2.}; + const double out = chbevl(y, coefficients, int{29}) * x; + return (_x < 0.) ? -out : out; + } + + // Chebyshev coefficients for exp(-x) sqrt(x) i1(x) + // in the inverted interval (8, infinity]. + // Note: lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi). + // TODO: what's an "inverted interval"? Open on the left + // and closed on the right? + static const double coefficients[] = { + 7.51729631084210481353E-18, 4.41434832307170791151E-18, + -4.65030536848935832153E-17, -3.20952592199342395980E-17, + 2.96262899764595013876E-16, 3.30820231092092828324E-16, + -1.88035477551078244854E-15, -3.81440307243700780478E-15, + 1.04202769841288027642E-14, 4.27244001671195135429E-14, + -2.10154184277266431302E-14, -4.08355111109219731823E-13, + -7.19855177624590851209E-13, 2.03562854414708950722E-12, + 1.41258074366137813316E-11, 3.25260358301548823856E-11, + -1.89749581235054123450E-11, -5.58974346219658380687E-10, + -3.83538038596423702205E-9, -2.63146884688951950684E-8, + -2.51223623787020892529E-7, -3.88256480887769039346E-6, + -1.10588938762623716291E-4, -9.76109749136146840777E-3, + 7.78576235018280120474E-1}; + + const double out = chbevl(double{32.} / x - double{2.}, coefficients, int{25}) / sqrt(x); + return (_x < double{0.}) ? -out : out; + } + + // Float specialization (uses different coefficients than the double version) + template<> + float i1e(float _x) { + const float x = fabsf(_x); + if (x <= float{8.}) { + // Chebyshev double coefficients for exp(-x) i1(x) in the interval [0,8]. + // Note: lim(x->0){ exp(-x) i1(x) / x } = 1/2. 
+ static const float coefficients[] = { + 9.38153738649577178388E-9f, + -4.44505912879632808065E-8f, + 2.00329475355213526229E-7f, + -8.56872026469545474066E-7f, + 3.47025130813767847674E-6f, + -1.32731636560394358279E-5f, + 4.78156510755005422638E-5f, + -1.61760815825896745588E-4f, + 5.12285956168575772895E-4f, + -1.51357245063125314899E-3f, + 4.15642294431288815669E-3f, + -1.05640848946261981558E-2f, + 2.47264490306265168283E-2f, + -5.29459812080949914269E-2f, + 1.02643658689847095384E-1f, + -1.76416518357834055153E-1f, + 2.52587186443633654823E-1f}; + const float y = x / float{2.} - float{2.}; + const float out = chbevl(y, coefficients, int{17}) * x; + return (_x < 0.) ? -out : out; + } + + // Chebyshev coefficients for exp(-x) sqrt(x) i1(x) + // in the inverted interval (8, infinity]. + // Note: lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi). + // TODO: what's an "inverted interval"? Open on the left + // and closed on the right? + static const float coefficients[] = { + -3.83538038596423702205E-9f, + -2.63146884688951950684E-8f, + -2.51223623787020892529E-7f, + -3.88256480887769039346E-6f, + -1.10588938762623716291E-4f, + -9.76109749136146840777E-3f, + 7.78576235018280120474E-1f}; + + const float out = chbevl(float{32.} / x - float{2.}, coefficients, int{7}) / sqrt(x); + return (_x < float{0.}) ? -out : out; + } +); // i1e_string + +const auto kaiser_window_string = i0_string + jiterator_stringify( + template + T kaiser_window(T a, T inv_alpha, T beta, T inv_i0_beta) { + T x = a * inv_alpha - T{1}; + T y = max(T{0}, T{1} - x * x); + return i0(beta * sqrt(y)) * inv_i0_beta; + } +); // kaiser_window_string + +const auto sinc_string = jiterator_stringify( + template + T sinc(T a) { + if (a == T(0)) { + return T(1); + } else { + constexpr T pi = T(3.14159265358979323846L); + T product = pi * a; + return std::sin(product) / product; + } + } +); // sinc_string + +const auto erfcx_string = jiterator_stringify( + /* The next function is taken from http://ab-initio.mit.edu/Faddeev */ + + /* Copyright (c) 2012 Massachusetts Institute of Technology + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + /* erfcx(x) = exp(x^2) erfc(x) function, for real x, written by + Steven G. Johnson, October 2012. + + This function combines a few different ideas. + + First, for x > 50, it uses a continued-fraction expansion (same as + for the Faddeeva function, but with algebraic simplifications for z=i*x). 
+ + Second, for 0 <= x <= 50, it uses Chebyshev polynomial approximations, + but with two twists: + + a) It maps x to y = 4 / (4+x) in [0,1]. This simple transformation, + inspired by a similar transformation in the octave-forge/specfun + erfcx by Soren Hauberg, results in much faster Chebyshev convergence + than other simple transformations I have examined. + + b) Instead of using a single Chebyshev polynomial for the entire + [0,1] y interval, we break the interval up into 100 equal + subintervals, with a switch/lookup table, and use much lower + degree Chebyshev polynomials in each subinterval. This greatly + improves performance in my tests. + + For x < 0, we use the relationship erfcx(-x) = 2 exp(x^2) - erfc(x), + with the usual checks for overflow etcetera. + + Performance-wise, it seems to be substantially faster than either + the SLATEC DERFC function [or an erfcx function derived therefrom] + or Cody's CALERF function (from netlib.org/specfun), while + retaining near machine precision in accuracy. + */ + + /* Given y100 = 100 * y, where y = 4 / (4 + x) for x >= 0, compute erfc(x). + + Uses a look-up table of 100 different Chebyshev polynomials + for y intervals [0,0.01], [0.01,0.02], ...., [0.99,1], generated + with the help of Maple and a little shell script. This allows + the Chebyshev polynomials to be of significantly lower degree (about 1/4) + compared to fitting the whole [0,1] interval with a single polynomial. + */ + + // TODO: review if this is computing in double when given a float input + template + T erfcx_y100(T y100) { + switch (static_cast(y100)) { + case 0: { + T t = 2*y100 - 1; + return 0.70878032454106438663e-3 + (0.71234091047026302958e-3 + (0.35779077297597742384e-5 + (0.17403143962587937815e-7 + (0.81710660047307788845e-10 + (0.36885022360434957634e-12 + 0.15917038551111111111e-14 * t) * t) * t) * t) * t) * t; + } + case 1: { + T t = 2*y100 - 3; + return 0.21479143208285144230e-2 + (0.72686402367379996033e-3 + (0.36843175430938995552e-5 + (0.18071841272149201685e-7 + (0.85496449296040325555e-10 + (0.38852037518534291510e-12 + 0.16868473576888888889e-14 * t) * t) * t) * t) * t) * t; + } + case 2: { + T t = 2*y100 - 5; + return 0.36165255935630175090e-2 + (0.74182092323555510862e-3 + (0.37948319957528242260e-5 + (0.18771627021793087350e-7 + (0.89484715122415089123e-10 + (0.40935858517772440862e-12 + 0.17872061464888888889e-14 * t) * t) * t) * t) * t) * t; + } + case 3: { + T t = 2*y100 - 7; + return 0.51154983860031979264e-2 + (0.75722840734791660540e-3 + (0.39096425726735703941e-5 + (0.19504168704300468210e-7 + (0.93687503063178993915e-10 + (0.43143925959079664747e-12 + 0.18939926435555555556e-14 * t) * t) * t) * t) * t) * t; + } + case 4: { + T t = 2*y100 - 9; + return 0.66457513172673049824e-2 + (0.77310406054447454920e-3 + (0.40289510589399439385e-5 + (0.20271233238288381092e-7 + (0.98117631321709100264e-10 + (0.45484207406017752971e-12 + 0.20076352213333333333e-14 * t) * t) * t) * t) * t) * t; + } + case 5: { + T t = 2*y100 - 11; + return 0.82082389970241207883e-2 + (0.78946629611881710721e-3 + (0.41529701552622656574e-5 + (0.21074693344544655714e-7 + (0.10278874108587317989e-9 + (0.47965201390613339638e-12 + 0.21285907413333333333e-14 * t) * t) * t) * t) * t) * t; + } + case 6: { + T t = 2*y100 - 13; + return 0.98039537275352193165e-2 + (0.80633440108342840956e-3 + (0.42819241329736982942e-5 + (0.21916534346907168612e-7 + (0.10771535136565470914e-9 + (0.50595972623692822410e-12 + 0.22573462684444444444e-14 * t) * t) * t) * t) * t) * t; + } + case 7: { + T 
t = 2*y100 - 15; + return 0.11433927298290302370e-1 + (0.82372858383196561209e-3 + (0.44160495311765438816e-5 + (0.22798861426211986056e-7 + (0.11291291745879239736e-9 + (0.53386189365816880454e-12 + 0.23944209546666666667e-14 * t) * t) * t) * t) * t) * t; + } + case 8: { + T t = 2*y100 - 17; + return 0.13099232878814653979e-1 + (0.84167002467906968214e-3 + (0.45555958988457506002e-5 + (0.23723907357214175198e-7 + (0.11839789326602695603e-9 + (0.56346163067550237877e-12 + 0.25403679644444444444e-14 * t) * t) * t) * t) * t) * t; + } + case 9: { + T t = 2*y100 - 19; + return 0.14800987015587535621e-1 + (0.86018092946345943214e-3 + (0.47008265848816866105e-5 + (0.24694040760197315333e-7 + (0.12418779768752299093e-9 + (0.59486890370320261949e-12 + 0.26957764568888888889e-14 * t) * t) * t) * t) * t) * t; + } + case 10: { + T t = 2*y100 - 21; + return 0.16540351739394069380e-1 + (0.87928458641241463952e-3 + (0.48520195793001753903e-5 + (0.25711774900881709176e-7 + (0.13030128534230822419e-9 + (0.62820097586874779402e-12 + 0.28612737351111111111e-14 * t) * t) * t) * t) * t) * t; + } + case 11: { + T t = 2*y100 - 23; + return 0.18318536789842392647e-1 + (0.89900542647891721692e-3 + (0.50094684089553365810e-5 + (0.26779777074218070482e-7 + (0.13675822186304615566e-9 + (0.66358287745352705725e-12 + 0.30375273884444444444e-14 * t) * t) * t) * t) * t) * t; + } + case 12: { + T t = 2*y100 - 25; + return 0.20136801964214276775e-1 + (0.91936908737673676012e-3 + (0.51734830914104276820e-5 + (0.27900878609710432673e-7 + (0.14357976402809042257e-9 + (0.70114790311043728387e-12 + 0.32252476000000000000e-14 * t) * t) * t) * t) * t) * t; + } + case 13: { + T t = 2*y100 - 27; + return 0.21996459598282740954e-1 + (0.94040248155366777784e-3 + (0.53443911508041164739e-5 + (0.29078085538049374673e-7 + (0.15078844500329731137e-9 + (0.74103813647499204269e-12 + 0.34251892320000000000e-14 * t) * t) * t) * t) * t) * t; + } + case 14: { + T t = 2*y100 - 29; + return 0.23898877187226319502e-1 + (0.96213386835900177540e-3 + (0.55225386998049012752e-5 + (0.30314589961047687059e-7 + (0.15840826497296335264e-9 + (0.78340500472414454395e-12 + 0.36381553564444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 15: { + T t = 2*y100 - 31; + return 0.25845480155298518485e-1 + (0.98459293067820123389e-3 + (0.57082915920051843672e-5 + (0.31613782169164830118e-7 + (0.16646478745529630813e-9 + (0.82840985928785407942e-12 + 0.38649975768888888890e-14 * t) * t) * t) * t) * t) * t; + } + case 16: { + T t = 2*y100 - 33; + return 0.27837754783474696598e-1 + (0.10078108563256892757e-2 + (0.59020366493792212221e-5 + (0.32979263553246520417e-7 + (0.17498524159268458073e-9 + (0.87622459124842525110e-12 + 0.41066206488888888890e-14 * t) * t) * t) * t) * t) * t; + } + case 17: { + T t = 2*y100 - 35; + return 0.29877251304899307550e-1 + (0.10318204245057349310e-2 + (0.61041829697162055093e-5 + (0.34414860359542720579e-7 + (0.18399863072934089607e-9 + (0.92703227366365046533e-12 + 0.43639844053333333334e-14 * t) * t) * t) * t) * t) * t; + } + case 18: { + T t = 2*y100 - 37; + return 0.31965587178596443475e-1 + (0.10566560976716574401e-2 + (0.63151633192414586770e-5 + (0.35924638339521924242e-7 + (0.19353584758781174038e-9 + (0.98102783859889264382e-12 + 0.46381060817777777779e-14 * t) * t) * t) * t) * t) * t; + } + case 19: { + T t = 2*y100 - 39; + return 0.34104450552588334840e-1 + (0.10823541191350532574e-2 + (0.65354356159553934436e-5 + (0.37512918348533521149e-7 + (0.20362979635817883229e-9 + (0.10384187833037282363e-11 + 
0.49300625262222222221e-14 * t) * t) * t) * t) * t) * t; + } + case 20: { + T t = 2*y100 - 41; + return 0.36295603928292425716e-1 + (0.11089526167995268200e-2 + (0.67654845095518363577e-5 + (0.39184292949913591646e-7 + (0.21431552202133775150e-9 + (0.10994259106646731797e-11 + 0.52409949102222222221e-14 * t) * t) * t) * t) * t) * t; + } + case 21: { + T t = 2*y100 - 43; + return 0.38540888038840509795e-1 + (0.11364917134175420009e-2 + (0.70058230641246312003e-5 + (0.40943644083718586939e-7 + (0.22563034723692881631e-9 + (0.11642841011361992885e-11 + 0.55721092871111111110e-14 * t) * t) * t) * t) * t) * t; + } + case 22: { + T t = 2*y100 - 45; + return 0.40842225954785960651e-1 + (0.11650136437945673891e-2 + (0.72569945502343006619e-5 + (0.42796161861855042273e-7 + (0.23761401711005024162e-9 + (0.12332431172381557035e-11 + 0.59246802364444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 23: { + T t = 2*y100 - 47; + return 0.43201627431540222422e-1 + (0.11945628793917272199e-2 + (0.75195743532849206263e-5 + (0.44747364553960993492e-7 + (0.25030885216472953674e-9 + (0.13065684400300476484e-11 + 0.63000532853333333334e-14 * t) * t) * t) * t) * t) * t; + } + case 24: { + T t = 2*y100 - 49; + return 0.45621193513810471438e-1 + (0.12251862608067529503e-2 + (0.77941720055551920319e-5 + (0.46803119830954460212e-7 + (0.26375990983978426273e-9 + (0.13845421370977119765e-11 + 0.66996477404444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 25: { + T t = 2*y100 - 51; + return 0.48103121413299865517e-1 + (0.12569331386432195113e-2 + (0.80814333496367673980e-5 + (0.48969667335682018324e-7 + (0.27801515481905748484e-9 + (0.14674637611609884208e-11 + 0.71249589351111111110e-14 * t) * t) * t) * t) * t) * t; + } + case 26: { + T t = 2*y100 - 53; + return 0.50649709676983338501e-1 + (0.12898555233099055810e-2 + (0.83820428414568799654e-5 + (0.51253642652551838659e-7 + (0.29312563849675507232e-9 + (0.15556512782814827846e-11 + 0.75775607822222222221e-14 * t) * t) * t) * t) * t) * t; + } + case 27: { + T t = 2*y100 - 55; + return 0.53263363664388864181e-1 + (0.13240082443256975769e-2 + (0.86967260015007658418e-5 + (0.53662102750396795566e-7 + (0.30914568786634796807e-9 + (0.16494420240828493176e-11 + 0.80591079644444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 28: { + T t = 2*y100 - 57; + return 0.55946601353500013794e-1 + (0.13594491197408190706e-2 + (0.90262520233016380987e-5 + (0.56202552975056695376e-7 + (0.32613310410503135996e-9 + (0.17491936862246367398e-11 + 0.85713381688888888890e-14 * t) * t) * t) * t) * t) * t; + } + case 29: { + T t = 2*y100 - 59; + return 0.58702059496154081813e-1 + (0.13962391363223647892e-2 + (0.93714365487312784270e-5 + (0.58882975670265286526e-7 + (0.34414937110591753387e-9 + (0.18552853109751857859e-11 + 0.91160736711111111110e-14 * t) * t) * t) * t) * t) * t; + } + case 30: { + T t = 2*y100 - 61; + return 0.61532500145144778048e-1 + (0.14344426411912015247e-2 + (0.97331446201016809696e-5 + (0.61711860507347175097e-7 + (0.36325987418295300221e-9 + (0.19681183310134518232e-11 + 0.96952238400000000000e-14 * t) * t) * t) * t) * t) * t; + } + case 31: { + T t = 2*y100 - 63; + return 0.64440817576653297993e-1 + (0.14741275456383131151e-2 + (0.10112293819576437838e-4 + (0.64698236605933246196e-7 + (0.38353412915303665586e-9 + (0.20881176114385120186e-11 + 0.10310784480000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 32: { + T t = 2*y100 - 65; + return 0.67430045633130393282e-1 + (0.15153655418916540370e-2 + (0.10509857606888328667e-4 + (0.67851706529363332855e-7 + 
(0.40504602194811140006e-9 + (0.22157325110542534469e-11 + 0.10964842115555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 33: { + T t = 2*y100 - 67; + return 0.70503365513338850709e-1 + (0.15582323336495709827e-2 + (0.10926868866865231089e-4 + (0.71182482239613507542e-7 + (0.42787405890153386710e-9 + (0.23514379522274416437e-11 + 0.11659571751111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 34: { + T t = 2*y100 - 69; + return 0.73664114037944596353e-1 + (0.16028078812438820413e-2 + (0.11364423678778207991e-4 + (0.74701423097423182009e-7 + (0.45210162777476488324e-9 + (0.24957355004088569134e-11 + 0.12397238257777777778e-13 * t) * t) * t) * t) * t) * t; + } + case 35: { + T t = 2*y100 - 71; + return 0.76915792420819562379e-1 + (0.16491766623447889354e-2 + (0.11823685320041302169e-4 + (0.78420075993781544386e-7 + (0.47781726956916478925e-9 + (0.26491544403815724749e-11 + 0.13180196462222222222e-13 * t) * t) * t) * t) * t) * t; + } + case 36: { + T t = 2*y100 - 73; + return 0.80262075578094612819e-1 + (0.16974279491709504117e-2 + (0.12305888517309891674e-4 + (0.82350717698979042290e-7 + (0.50511496109857113929e-9 + (0.28122528497626897696e-11 + 0.14010889635555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 37: { + T t = 2*y100 - 75; + return 0.83706822008980357446e-1 + (0.17476561032212656962e-2 + (0.12812343958540763368e-4 + (0.86506399515036435592e-7 + (0.53409440823869467453e-9 + (0.29856186620887555043e-11 + 0.14891851591111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 38: { + T t = 2*y100 - 77; + return 0.87254084284461718231e-1 + (0.17999608886001962327e-2 + (0.13344443080089492218e-4 + (0.90900994316429008631e-7 + (0.56486134972616465316e-9 + (0.31698707080033956934e-11 + 0.15825697795555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 39: { + T t = 2*y100 - 79; + return 0.90908120182172748487e-1 + (0.18544478050657699758e-2 + (0.13903663143426120077e-4 + (0.95549246062549906177e-7 + (0.59752787125242054315e-9 + (0.33656597366099099413e-11 + 0.16815130613333333333e-13 * t) * t) * t) * t) * t) * t; + } + case 40: { + T t = 2*y100 - 81; + return 0.94673404508075481121e-1 + (0.19112284419887303347e-2 + (0.14491572616545004930e-4 + (0.10046682186333613697e-6 + (0.63221272959791000515e-9 + (0.35736693975589130818e-11 + 0.17862931591111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 41: { + T t = 2*y100 - 83; + return 0.98554641648004456555e-1 + (0.19704208544725622126e-2 + (0.15109836875625443935e-4 + (0.10567036667675984067e-6 + (0.66904168640019354565e-9 + (0.37946171850824333014e-11 + 0.18971959040000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 42: { + T t = 2*y100 - 85; + return 0.10255677889470089531e0 + (0.20321499629472857418e-2 + (0.15760224242962179564e-4 + (0.11117756071353507391e-6 + (0.70814785110097658502e-9 + (0.40292553276632563925e-11 + 0.20145143075555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 43: { + T t = 2*y100 - 87; + return 0.10668502059865093318e0 + (0.20965479776148731610e-2 + (0.16444612377624983565e-4 + (0.11700717962026152749e-6 + (0.74967203250938418991e-9 + (0.42783716186085922176e-11 + 0.21385479360000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 44: { + T t = 2*y100 - 89; + return 0.11094484319386444474e0 + (0.21637548491908170841e-2 + (0.17164995035719657111e-4 + (0.12317915750735938089e-6 + (0.79376309831499633734e-9 + (0.45427901763106353914e-11 + 0.22696025653333333333e-13 * t) * t) * t) * t) * t) * t; + } + case 45: { + T t = 2*y100 - 91; + return 0.11534201115268804714e0 + (0.22339187474546420375e-2 + 
(0.17923489217504226813e-4 + (0.12971465288245997681e-6 + (0.84057834180389073587e-9 + (0.48233721206418027227e-11 + 0.24079890062222222222e-13 * t) * t) * t) * t) * t) * t; + } + case 46: { + T t = 2*y100 - 93; + return 0.11988259392684094740e0 + (0.23071965691918689601e-2 + (0.18722342718958935446e-4 + (0.13663611754337957520e-6 + (0.89028385488493287005e-9 + (0.51210161569225846701e-11 + 0.25540227111111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 47: { + T t = 2*y100 - 95; + return 0.12457298393509812907e0 + (0.23837544771809575380e-2 + (0.19563942105711612475e-4 + (0.14396736847739470782e-6 + (0.94305490646459247016e-9 + (0.54366590583134218096e-11 + 0.27080225920000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 48: { + T t = 2*y100 - 97; + return 0.12941991566142438816e0 + (0.24637684719508859484e-2 + (0.20450821127475879816e-4 + (0.15173366280523906622e-6 + (0.99907632506389027739e-9 + (0.57712760311351625221e-11 + 0.28703099555555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 49: { + T t = 2*y100 - 99; + return 0.13443048593088696613e0 + (0.25474249981080823877e-2 + (0.21385669591362915223e-4 + (0.15996177579900443030e-6 + (0.10585428844575134013e-8 + (0.61258809536787882989e-11 + 0.30412080142222222222e-13 * t) * t) * t) * t) * t) * t; + } + case 50: { + T t = 2*y100 - 101; + return 0.13961217543434561353e0 + (0.26349215871051761416e-2 + (0.22371342712572567744e-4 + (0.16868008199296822247e-6 + (0.11216596910444996246e-8 + (0.65015264753090890662e-11 + 0.32210394506666666666e-13 * t) * t) * t) * t) * t) * t; + } + case 51: { + T t = 2*y100 - 103; + return 0.14497287157673800690e0 + (0.27264675383982439814e-2 + (0.23410870961050950197e-4 + (0.17791863939526376477e-6 + (0.11886425714330958106e-8 + (0.68993039665054288034e-11 + 0.34101266222222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 52: { + T t = 2*y100 - 105; + return 0.15052089272774618151e0 + (0.28222846410136238008e-2 + (0.24507470422713397006e-4 + (0.18770927679626136909e-6 + (0.12597184587583370712e-8 + (0.73203433049229821618e-11 + 0.36087889048888888890e-13 * t) * t) * t) * t) * t) * t; + } + case 53: { + T t = 2*y100 - 107; + return 0.15626501395774612325e0 + (0.29226079376196624949e-2 + (0.25664553693768450545e-4 + (0.19808568415654461964e-6 + (0.13351257759815557897e-8 + (0.77658124891046760667e-11 + 0.38173420035555555555e-13 * t) * t) * t) * t) * t) * t; + } + case 54: { + T t = 2*y100 - 109; + return 0.16221449434620737567e0 + (0.30276865332726475672e-2 + (0.26885741326534564336e-4 + (0.20908350604346384143e-6 + (0.14151148144240728728e-8 + (0.82369170665974313027e-11 + 0.40360957457777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 55: { + T t = 2*y100 - 111; + return 0.16837910595412130659e0 + (0.31377844510793082301e-2 + (0.28174873844911175026e-4 + (0.22074043807045782387e-6 + (0.14999481055996090039e-8 + (0.87348993661930809254e-11 + 0.42653528977777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 56: { + T t = 2*y100 - 113; + return 0.17476916455659369953e0 + (0.32531815370903068316e-2 + (0.29536024347344364074e-4 + (0.23309632627767074202e-6 + (0.15899007843582444846e-8 + (0.92610375235427359475e-11 + 0.45054073102222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 57: { + T t = 2*y100 - 115; + return 0.18139556223643701364e0 + (0.33741744168096996041e-2 + (0.30973511714709500836e-4 + (0.24619326937592290996e-6 + (0.16852609412267750744e-8 + (0.98166442942854895573e-11 + 0.47565418097777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 58: { + T t = 2*y100 - 117; + return 
0.18826980194443664549e0 + (0.35010775057740317997e-2 + (0.32491914440014267480e-4 + (0.26007572375886319028e-6 + (0.17863299617388376116e-8 + (0.10403065638343878679e-10 + 0.50190265831111111110e-13 * t) * t) * t) * t) * t) * t; + } + case 59: { + T t = 2*y100 - 119; + return 0.19540403413693967350e0 + (0.36342240767211326315e-2 + (0.34096085096200907289e-4 + (0.27479061117017637474e-6 + (0.18934228504790032826e-8 + (0.11021679075323598664e-10 + 0.52931171733333333334e-13 * t) * t) * t) * t) * t) * t; + } + case 60: { + T t = 2*y100 - 121; + return 0.20281109560651886959e0 + (0.37739673859323597060e-2 + (0.35791165457592409054e-4 + (0.29038742889416172404e-6 + (0.20068685374849001770e-8 + (0.11673891799578381999e-10 + 0.55790523093333333334e-13 * t) * t) * t) * t) * t) * t; + } + case 61: { + T t = 2*y100 - 123; + return 0.21050455062669334978e0 + (0.39206818613925652425e-2 + (0.37582602289680101704e-4 + (0.30691836231886877385e-6 + (0.21270101645763677824e-8 + (0.12361138551062899455e-10 + 0.58770520160000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 62: { + T t = 2*y100 - 125; + return 0.21849873453703332479e0 + (0.40747643554689586041e-2 + (0.39476163820986711501e-4 + (0.32443839970139918836e-6 + (0.22542053491518680200e-8 + (0.13084879235290858490e-10 + 0.61873153262222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 63: { + T t = 2*y100 - 127; + return 0.22680879990043229327e0 + (0.42366354648628516935e-2 + (0.41477956909656896779e-4 + (0.34300544894502810002e-6 + (0.23888264229264067658e-8 + (0.13846596292818514601e-10 + 0.65100183751111111110e-13 * t) * t) * t) * t) * t) * t; + } + case 64: { + T t = 2*y100 - 129; + return 0.23545076536988703937e0 + (0.44067409206365170888e-2 + (0.43594444916224700881e-4 + (0.36268045617760415178e-6 + (0.25312606430853202748e-8 + (0.14647791812837903061e-10 + 0.68453122631111111110e-13 * t) * t) * t) * t) * t) * t; + } + case 65: { + T t = 2*y100 - 131; + return 0.24444156740777432838e0 + (0.45855530511605787178e-2 + (0.45832466292683085475e-4 + (0.38352752590033030472e-6 + (0.26819103733055603460e-8 + (0.15489984390884756993e-10 + 0.71933206364444444445e-13 * t) * t) * t) * t) * t) * t; + } + case 66: { + T t = 2*y100 - 133; + return 0.25379911500634264643e0 + (0.47735723208650032167e-2 + (0.48199253896534185372e-4 + (0.40561404245564732314e-6 + (0.28411932320871165585e-8 + (0.16374705736458320149e-10 + 0.75541379822222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 67: { + T t = 2*y100 - 135; + return 0.26354234756393613032e0 + (0.49713289477083781266e-2 + (0.50702455036930367504e-4 + (0.42901079254268185722e-6 + (0.30095422058900481753e-8 + (0.17303497025347342498e-10 + 0.79278273368888888890e-13 * t) * t) * t) * t) * t) * t; + } + case 68: { + T t = 2*y100 - 137; + return 0.27369129607732343398e0 + (0.51793846023052643767e-2 + (0.53350152258326602629e-4 + (0.45379208848865015485e-6 + (0.31874057245814381257e-8 + (0.18277905010245111046e-10 + 0.83144182364444444445e-13 * t) * t) * t) * t) * t) * t; + } + case 69: { + T t = 2*y100 - 139; + return 0.28426714781640316172e0 + (0.53983341916695141966e-2 + (0.56150884865255810638e-4 + (0.48003589196494734238e-6 + (0.33752476967570796349e-8 + (0.19299477888083469086e-10 + 0.87139049137777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 70: { + T t = 2*y100 - 141; + return 0.29529231465348519920e0 + (0.56288077305420795663e-2 + (0.59113671189913307427e-4 + (0.50782393781744840482e-6 + (0.35735475025851713168e-8 + (0.20369760937017070382e-10 + 0.91262442613333333334e-13 * t) * t) * t) * 
t) * t) * t; + } + case 71: { + T t = 2*y100 - 143; + return 0.30679050522528838613e0 + (0.58714723032745403331e-2 + (0.62248031602197686791e-4 + (0.53724185766200945789e-6 + (0.37827999418960232678e-8 + (0.21490291930444538307e-10 + 0.95513539182222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 72: { + T t = 2*y100 - 145; + return 0.31878680111173319425e0 + (0.61270341192339103514e-2 + (0.65564012259707640976e-4 + (0.56837930287837738996e-6 + (0.40035151353392378882e-8 + (0.22662596341239294792e-10 + 0.99891109760000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 73: { + T t = 2*y100 - 147; + return 0.33130773722152622027e0 + (0.63962406646798080903e-2 + (0.69072209592942396666e-4 + (0.60133006661885941812e-6 + (0.42362183765883466691e-8 + (0.23888182347073698382e-10 + 0.10439349811555555556e-12 * t) * t) * t) * t) * t) * t; + } + case 74: { + T t = 2*y100 - 149; + return 0.34438138658041336523e0 + (0.66798829540414007258e-2 + (0.72783795518603561144e-4 + (0.63619220443228800680e-6 + (0.44814499336514453364e-8 + (0.25168535651285475274e-10 + 0.10901861383111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 75: { + T t = 2*y100 - 151; + return 0.35803744972380175583e0 + (0.69787978834882685031e-2 + (0.76710543371454822497e-4 + (0.67306815308917386747e-6 + (0.47397647975845228205e-8 + (0.26505114141143050509e-10 + 0.11376390933333333333e-12 * t) * t) * t) * t) * t) * t; + } + case 76: { + T t = 2*y100 - 153; + return 0.37230734890119724188e0 + (0.72938706896461381003e-2 + (0.80864854542670714092e-4 + (0.71206484718062688779e-6 + (0.50117323769745883805e-8 + (0.27899342394100074165e-10 + 0.11862637614222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 77: { + T t = 2*y100 - 155; + return 0.38722432730555448223e0 + (0.76260375162549802745e-2 + (0.85259785810004603848e-4 + (0.75329383305171327677e-6 + (0.52979361368388119355e-8 + (0.29352606054164086709e-10 + 0.12360253370666666667e-12 * t) * t) * t) * t) * t) * t; + } + case 78: { + T t = 2*y100 - 157; + return 0.40282355354616940667e0 + (0.79762880915029728079e-2 + (0.89909077342438246452e-4 + (0.79687137961956194579e-6 + (0.55989731807360403195e-8 + (0.30866246101464869050e-10 + 0.12868841946666666667e-12 * t) * t) * t) * t) * t) * t; + } + case 79: { + T t = 2*y100 - 159; + return 0.41914223158913787649e0 + (0.83456685186950463538e-2 + (0.94827181359250161335e-4 + (0.84291858561783141014e-6 + (0.59154537751083485684e-8 + (0.32441553034347469291e-10 + 0.13387957943111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 80: { + T t = 2*y100 - 161; + return 0.43621971639463786896e0 + (0.87352841828289495773e-2 + (0.10002929142066799966e-3 + (0.89156148280219880024e-6 + (0.62480008150788597147e-8 + (0.34079760983458878910e-10 + 0.13917107176888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 81: { + T t = 2*y100 - 163; + return 0.45409763548534330981e0 + (0.91463027755548240654e-2 + (0.10553137232446167258e-3 + (0.94293113464638623798e-6 + (0.65972492312219959885e-8 + (0.35782041795476563662e-10 + 0.14455745872000000000e-12 * t) * t) * t) * t) * t) * t; + } + case 82: { + T t = 2*y100 - 165; + return 0.47282001668512331468e0 + (0.95799574408860463394e-2 + (0.11135019058000067469e-3 + (0.99716373005509038080e-6 + (0.69638453369956970347e-8 + (0.37549499088161345850e-10 + 0.15003280712888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 83: { + T t = 2*y100 - 167; + return 0.49243342227179841649e0 + (0.10037550043909497071e-1 + (0.11750334542845234952e-3 + (0.10544006716188967172e-5 + (0.73484461168242224872e-8 + 
(0.39383162326435752965e-10 + 0.15559069118222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 84: { + T t = 2*y100 - 169; + return 0.51298708979209258326e0 + (0.10520454564612427224e-1 + (0.12400930037494996655e-3 + (0.11147886579371265246e-5 + (0.77517184550568711454e-8 + (0.41283980931872622611e-10 + 0.16122419680000000000e-12 * t) * t) * t) * t) * t) * t; + } + case 85: { + T t = 2*y100 - 171; + return 0.53453307979101369843e0 + (0.11030120618800726938e-1 + (0.13088741519572269581e-3 + (0.11784797595374515432e-5 + (0.81743383063044825400e-8 + (0.43252818449517081051e-10 + 0.16692592640000000000e-12 * t) * t) * t) * t) * t) * t; + } + case 86: { + T t = 2*y100 - 173; + return 0.55712643071169299478e0 + (0.11568077107929735233e-1 + (0.13815797838036651289e-3 + (0.12456314879260904558e-5 + (0.86169898078969313597e-8 + (0.45290446811539652525e-10 + 0.17268801084444444444e-12 * t) * t) * t) * t) * t) * t; + } + case 87: { + T t = 2*y100 - 175; + return 0.58082532122519320968e0 + (0.12135935999503877077e-1 + (0.14584223996665838559e-3 + (0.13164068573095710742e-5 + (0.90803643355106020163e-8 + (0.47397540713124619155e-10 + 0.17850211608888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 88: { + T t = 2*y100 - 177; + return 0.60569124025293375554e0 + (0.12735396239525550361e-1 + (0.15396244472258863344e-3 + (0.13909744385382818253e-5 + (0.95651595032306228245e-8 + (0.49574672127669041550e-10 + 0.18435945564444444444e-12 * t) * t) * t) * t) * t) * t; + } + case 89: { + T t = 2*y100 - 179; + return 0.63178916494715716894e0 + (0.13368247798287030927e-1 + (0.16254186562762076141e-3 + (0.14695084048334056083e-5 + (0.10072078109604152350e-7 + (0.51822304995680707483e-10 + 0.19025081422222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 90: { + T t = 2*y100 - 181; + return 0.65918774689725319200e0 + (0.14036375850601992063e-1 + (0.17160483760259706354e-3 + (0.15521885688723188371e-5 + (0.10601827031535280590e-7 + (0.54140790105837520499e-10 + 0.19616655146666666667e-12 * t) * t) * t) * t) * t) * t; + } + case 91: { + T t = 2*y100 - 183; + return 0.68795950683174433822e0 + (0.14741765091365869084e-1 + (0.18117679143520433835e-3 + (0.16392004108230585213e-5 + (0.11155116068018043001e-7 + (0.56530360194925690374e-10 + 0.20209663662222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 92: { + T t = 2*y100 - 185; + return 0.71818103808729967036e0 + (0.15486504187117112279e-1 + (0.19128428784550923217e-3 + (0.17307350969359975848e-5 + (0.11732656736113607751e-7 + (0.58991125287563833603e-10 + 0.20803065333333333333e-12 * t) * t) * t) * t) * t) * t; + } + case 93: { + T t = 2*y100 - 187; + return 0.74993321911726254661e0 + (0.16272790364044783382e-1 + (0.20195505163377912645e-3 + (0.18269894883203346953e-5 + (0.12335161021630225535e-7 + (0.61523068312169087227e-10 + 0.21395783431111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 94: { + T t = 2*y100 - 189; + return 0.78330143531283492729e0 + (0.17102934132652429240e-1 + (0.21321800585063327041e-3 + (0.19281661395543913713e-5 + (0.12963340087354341574e-7 + (0.64126040998066348872e-10 + 0.21986708942222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 95: { + T t = 2*y100 - 191; + return 0.81837581041023811832e0 + (0.17979364149044223802e-1 + (0.22510330592753129006e-3 + (0.20344732868018175389e-5 + (0.13617902941839949718e-7 + (0.66799760083972474642e-10 + 0.22574701262222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 96: { + T t = 2*y100 - 193; + return 0.85525144775685126237e0 + (0.18904632212547561026e-1 + (0.23764237370371255638e-3 
+ (0.21461248251306387979e-5 + (0.14299555071870523786e-7 + (0.69543803864694171934e-10 + 0.23158593688888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 97: { + T t = 2*y100 - 195; + return 0.89402868170849933734e0 + (0.19881418399127202569e-1 + (0.25086793128395995798e-3 + (0.22633402747585233180e-5 + (0.15008997042116532283e-7 + (0.72357609075043941261e-10 + 0.23737194737777777778e-12 * t) * t) * t) * t) * t) * t; + } + case 98: { + T t = 2*y100 - 197; + return 0.93481333942870796363e0 + (0.20912536329780368893e-1 + (0.26481403465998477969e-3 + (0.23863447359754921676e-5 + (0.15746923065472184451e-7 + (0.75240468141720143653e-10 + 0.24309291271111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 99: { + T t = 2*y100 - 199; + return 0.97771701335885035464e0 + (0.22000938572830479551e-1 + (0.27951610702682383001e-3 + (0.25153688325245314530e-5 + (0.16514019547822821453e-7 + (0.78191526829368231251e-10 + 0.24873652355555555556e-12 * t) * t) * t) * t) * t) * t; + } + } + + // we only get here if y = 1, i.e. |x| < 4*eps, in which case + // erfcx is within 1e-15 of 1.. + return 1.; + } + + template + T erfcx(T x) { + // Short-circuits on NaN (returning NaN) + if (x != x) { + return x; + } + + if (x >= 0) { + if (x > T{50}) { // continued-fraction expansion is faster + const T ispi = 0.56418958354775628694807945156; // 1 / sqrt(pi) + + if (x > T{5e7}) { // 1-term expansion, important to avoid overflow + return ispi / x; + } + + /* 5-term expansion (rely on compiler for CSE), simplified from: + ispi / (x+0.5/(x+1/(x+1.5/(x+2/x)))) */ + return ispi * ((x*x) * (x*x+T{4.5}) + T{2}) / (x * ((x*x) * (x*x+T{5}) + T{3.75})); + } + + // x >= 0 x <= 50 + return erfcx_y100(T{400} / (T{4} + x)); + } + + // x < 0 + if (x < T{-26.7}) { + return POS_INFINITY; + } else if (x < T{-6.1}) { + return T{2} * exp(x * x); + } + + // x < 0 and x >= -6.1 + return T{2} * exp(x * x) - erfcx_y100(T{400} / (T{4} - x)); + } +); // erfcx_string + +const auto airy_ai_string = jiterator_stringify( + template + T airy_ai_forward(T x) { + static const T AN[] = { + +3.46538101525629032477e-01, + +1.20075952739645805542e+01, + +7.62796053615234516538e+01, + +1.68089224934630576269e+02, + +1.59756391350164413639e+02, + +7.05360906840444183113e+01, + +1.40264691163389668864e+01, + +9.99999999999999995305e-01, + }; + + static const T AD[] = { + +5.67594532638770212846e-01, + +1.47562562584847203173e+01, + +8.45138970141474626562e+01, + +1.77318088145400459522e+02, + +1.64234692871529701831e+02, + +7.14778400825575695274e+01, + +1.40959135607834029598e+01, + +1.00000000000000000470e+00, + }; + + static const T AFN[] = { + -1.31696323418331795333e-01, + -6.26456544431912369773e-01, + -6.93158036036933542233e-01, + -2.79779981545119124951e-01, + -4.91900132609500318020e-02, + -4.06265923594885404393e-03, + -1.59276496239262096340e-04, + -2.77649108155232920844e-06, + -1.67787698489114633780e-08, + }; + + static const T AFD[] = { + +1.33560420706553243746e+01, + +3.26825032795224613948e+01, + +2.67367040941499554804e+01, + +9.18707402907259625840e+00, + +1.47529146771666414581e+00, + +1.15687173795188044134e-01, + +4.40291641615211203805e-03, + +7.54720348287414296618e-05, + +4.51850092970580378464e-07, + }; + + static const T AGN[] = { + +1.97339932091685679179e-02, + +3.91103029615688277255e-01, + +1.06579897599595591108e+00, + +9.39169229816650230044e-01, + +3.51465656105547619242e-01, + +6.33888919628925490927e-02, + +5.85804113048388458567e-03, + +2.82851600836737019778e-04, + +6.98793669997260967291e-06, + 
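As a quick cross-check of the large-argument branch of erfcx above, the 5-term rational form in the kernel string is an algebraic simplification of the nested continued fraction mentioned in its comment. The following minimal host-side sketch (not part of the patch; function names are illustrative) evaluates both forms side by side so the simplification can be verified numerically:

#include <cstdio>

double erfcx_cf_simplified(double x) {
  const double ispi = 0.56418958354775628694807945156;  // 1 / sqrt(pi)
  if (x > 5e7) return ispi / x;                          // 1-term expansion for very large x
  // 5-term expansion, same form as in the jiterator string above
  return ispi * ((x * x) * (x * x + 4.5) + 2.0) /
         (x * ((x * x) * (x * x + 5.0) + 3.75));
}

double erfcx_cf_nested(double x) {
  // The raw continued fraction the simplified form was derived from
  const double ispi = 0.56418958354775628694807945156;
  return ispi / (x + 0.5 / (x + 1.0 / (x + 1.5 / (x + 2.0 / x))));
}

int main() {
  for (double x : {60.0, 1e3, 1e6}) {
    std::printf("x=%-8g simplified=%.17g nested=%.17g\n",
                x, erfcx_cf_simplified(x), erfcx_cf_nested(x));
  }
  return 0;
}

Both functions compute the same rational expression, so the printed values should match to the last digit; this is only a sanity check of the algebra, not of the kernel itself.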
+8.11789239554389293311e-08, + +3.41551784765923618484e-10, + }; + + static const T AGD[] = { + +9.30892908077441974853e+00, + +1.98352928718312140417e+01, + +1.55646628932864612953e+01, + +5.47686069422975497931e+00, + +9.54293611618961883998e-01, + +8.64580826352392193095e-02, + +4.12656523824222607191e-03, + +1.01259085116509135510e-04, + +1.17166733214413521882e-06, + +4.91834570062930015649e-09, + }; + + int domain_flag = 0; + + T ai; + + if (isinf(x)) { + return NAN; + } + + if (x > T(103.892)) { + return T(0.0); + } + + T f; + T g; + T k; + + if (x < T(-2.09)) { + T z = T(1.0) / (T(-2.0) * x * sqrt(-x) / T(3.0)); + + T afn = 0.0; + + for (uint8_t index = 0; index <= 8; index++) { + afn = afn * (z * z) + AFN[index]; + } + + T afd = 0.0; + + for (uint8_t index = 0; index <= 8; index++) { + afd = afd * (z * z) + AFD[index]; + } + + T agn = 0.0; + + for (uint8_t index = 0; index <= 10 + 0; index++) { + agn = agn * (z * z) + AGN[index]; + } + + T agd = 0.0; + + for (uint8_t index = 0; index <= 10 - 1; index++) { + agd = agd * (z * z) + AGD[index]; + } + + T t = T(-2.0) * x * sqrt(-x) / T(3.0) + T(0.25) * T(3.14159265358979323846); + + return T(5.64189583547756286948e-01) / sqrt(sqrt(-x)) * (sin(t) * (T(1.0) + z * z * afn / afd) - cos(t) * (z * agn / agd)); + } + + if (x >= T(2.09)) { + domain_flag = 5; + + T zeta = T(2.0) * x * sqrt(x) / T(3.0); + + T an = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + an = an * (T(1.0) / zeta) + AN[index]; + } + + T ad = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + ad = ad * (T(1.0) / zeta) + AD[index]; + } + + ai = T(5.64189583547756286948e-01) * (an / ad) / (T(2.0) * sqrt(sqrt(x)) * exp(zeta)); + + if (x > T(8.3203353)) { + return ai; + } + } + + f = 1.0; + g = x; + k = 1.0; + + T m = 1.0; + T n = x; + T t = 1.0; + T z = x * x * x; + + while (t > T(1.11022302462515654042e-16)) { + m *= z; + k += T(1.0); + m /= k; + n *= z; + k += T(1.0); + n /= k; + m /= k; + f += m; + k += T(1.0); + n /= k; + g += n; + + t = abs(m / f); + } + + if ((domain_flag & 1) == 0) { + return T(0.355028053887817239260) * f - T(0.258819403792806798405) * g; + } + + return ai; + } // T airy_ai(T x) +); // airy_ai_string + +const auto bessel_j0_string = jiterator_stringify( + template + T bessel_j0_forward(T x) { + static const T PP[] = { + +7.96936729297347051624e-04, + +8.28352392107440799803e-02, + +1.23953371646414299388e+00, + +5.44725003058768775090e+00, + +8.74716500199817011941e+00, + +5.30324038235394892183e+00, + +9.99999999999999997821e-01, + }; + + static const T PQ[] = { + +9.24408810558863637013e-04, + +8.56288474354474431428e-02, + +1.25352743901058953537e+00, + +5.47097740330417105182e+00, + +8.76190883237069594232e+00, + +5.30605288235394617618e+00, + +1.00000000000000000218e+00, + }; + + static const T QP[] = { + -1.13663838898469149931e-02, + -1.28252718670509318512e+00, + -1.95539544257735972385e+01, + -9.32060152123768231369e+01, + -1.77681167980488050595e+02, + -1.47077505154951170175e+02, + -5.14105326766599330220e+01, + -6.05014350600728481186e+00, + }; + + static const T QQ[] = { + +6.43178256118178023184e+01, + +8.56430025976980587198e+02, + +3.88240183605401609683e+03, + +7.24046774195652478189e+03, + +5.93072701187316984827e+03, + +2.06209331660327847417e+03, + +2.42005740240291393179e+02, + }; + + static const T RP[] = { + -4.79443220978201773821e+09, + +1.95617491946556577543e+12, + -2.49248344360967716204e+14, + +9.70862251047306323952e+15, + }; + + static const T RQ[] = { + +4.99563147152651017219e+02, + 
+1.73785401676374683123e+05, + +4.84409658339962045305e+07, + +1.11855537045356834862e+10, + +2.11277520115489217587e+12, + +3.10518229857422583814e+14, + +3.18121955943204943306e+16, + +1.71086294081043136091e+18, + }; + + if (x < T(0)) { + x = -x; + } + + if (x <= T(5.0)) { + if (x < T(0.00001)) { + return T(1.0) - x * x / T(4.0); + } + + T rp = 0.0; + + for (uint8_t index = 0; index <= 3; index++) { + rp = rp * (x * x) + RP[index]; + } + + T rq = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + rq = rq * (x * x) + RQ[index]; + } + + return (x * x - T(5.78318596294678452118e+00)) * (x * x - T(3.04712623436620863991e+01)) * rp / rq; + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(25.0) / (x * x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(25.0) / (x * x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(25.0) / (x * x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(25.0) / (x * x)) + QQ[index]; + } + + return (pp / pq * cos(x - T(0.785398163397448309615660845819875721)) - T(5.0) / x * (qp / qq) * sin(x - T(0.785398163397448309615660845819875721))) * T(0.797884560802865355879892119868763737) / sqrt(x); + } // bessel_j0_forward(T x) +); // bessel_j0_string + +const auto bessel_y0_string = bessel_j0_string + jiterator_stringify( + template + T bessel_y0_forward(T x) { + static const T PP[] = { + +7.96936729297347051624e-04, + +8.28352392107440799803e-02, + +1.23953371646414299388e+00, + +5.44725003058768775090e+00, + +8.74716500199817011941e+00, + +5.30324038235394892183e+00, + +9.99999999999999997821e-01, + }; + + static const T PQ[] = { + +9.24408810558863637013e-04, + +8.56288474354474431428e-02, + +1.25352743901058953537e+00, + +5.47097740330417105182e+00, + +8.76190883237069594232e+00, + +5.30605288235394617618e+00, + +1.00000000000000000218e+00, + }; + + static const T QP[] = { + -1.13663838898469149931e-02, + -1.28252718670509318512e+00, + -1.95539544257735972385e+01, + -9.32060152123768231369e+01, + -1.77681167980488050595e+02, + -1.47077505154951170175e+02, + -5.14105326766599330220e+01, + -6.05014350600728481186e+00, + }; + + static const T QQ[] = { + +6.43178256118178023184e+01, + +8.56430025976980587198e+02, + +3.88240183605401609683e+03, + +7.24046774195652478189e+03, + +5.93072701187316984827e+03, + +2.06209331660327847417e+03, + +2.42005740240291393179e+02, + }; + + static const T YP[] = { + +1.55924367855235737965e+04, + -1.46639295903971606143e+07, + +5.43526477051876500413e+09, + -9.82136065717911466409e+11, + +8.75906394395366999549e+13, + -3.46628303384729719441e+15, + +4.42733268572569800351e+16, + -1.84950800436986690637e+16, + }; + + static const T YQ[] = { + +1.04128353664259848412e+03, + +6.26107330137134956842e+05, + +2.68919633393814121987e+08, + +8.64002487103935000337e+10, + +2.02979612750105546709e+13, + +3.17157752842975028269e+15, + +2.50596256172653059228e+17, + }; + + if (x <= T(5.0)) { + if (x == T(0.0)) { + return NEG_INFINITY; + } + + if (x < T(0.0)) { + NAN; + } + + T yp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + yp = yp * (x * x) + YP[index]; + } + + T yq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + yq = yq * (x * x) + YQ[index]; + } + + return yp / yq + (T(0.636619772367581343075535053490057448) * log(x) * bessel_j0_forward(x)); + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(25.0) 
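The large-argument branch of bessel_j0_forward above is the usual Hankel-type form, sqrt(2/(pi*x)) times a phase/amplitude combination of cos(x - pi/4) and sin(x - pi/4) with small rational corrections (the PP/PQ and QP/QQ polynomials). A tiny host sketch of just the leading term, compared against a library Bessel routine, illustrates the structure; std::cyl_bessel_j is C++17 and may be missing on some standard libraries (POSIX j0() is an alternative), so treat this as an assumption of the sketch:

#include <cmath>
#include <cstdio>

int main() {
  const double pi = 3.14159265358979323846;
  for (double x : {10.0, 50.0, 200.0}) {
    // Leading term of the large-x asymptotic expansion of J0
    double leading = std::sqrt(2.0 / (pi * x)) * std::cos(x - pi / 4.0);
    double ref = std::cyl_bessel_j(0.0, x);  // C++17 special math; see note above
    std::printf("x=%6g leading-term=% .10f cyl_bessel_j=% .10f\n", x, leading, ref);
  }
  return 0;
}

The agreement improves as x grows; the rational correction factors in the kernel string are what close the remaining gap.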
/ (x * x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(25.0) / (x * x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(25.0) / (x * x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(25.0) / (x * x)) + QQ[index]; + } + + return (pp / pq * sin(x - T(0.785398163397448309615660845819875721)) + T(5.0) / x * (qp / qq) * cos(x - T(0.785398163397448309615660845819875721))) * T(0.797884560802865355879892119868763737) / sqrt(x); + } // bessel_y0_forward(T x) +); // bessel_y0_string + +const auto bessel_j1_string = jiterator_stringify( + template + T bessel_j1_forward(T x) { + static const T PP[] = { + +7.62125616208173112003e-04, + +7.31397056940917570436e-02, + +1.12719608129684925192e+00, + +5.11207951146807644818e+00, + +8.42404590141772420927e+00, + +5.21451598682361504063e+00, + +1.00000000000000000254e+00, + }; + + static const T PQ[] = { + +5.71323128072548699714e-04, + +6.88455908754495404082e-02, + +1.10514232634061696926e+00, + +5.07386386128601488557e+00, + +8.39985554327604159757e+00, + +5.20982848682361821619e+00, + +9.99999999999999997461e-01, + }; + + static const T QP[] = { + +5.10862594750176621635e-02, + +4.98213872951233449420e+00, + +7.58238284132545283818e+01, + +3.66779609360150777800e+02, + +7.10856304998926107277e+02, + +5.97489612400613639965e+02, + +2.11688757100572135698e+02, + +2.52070205858023719784e+01, + }; + + static const T QQ[] = { + +7.42373277035675149943e+01, + +1.05644886038262816351e+03, + +4.98641058337653607651e+03, + +9.56231892404756170795e+03, + +7.99704160447350683650e+03, + +2.82619278517639096600e+03, + +3.36093607810698293419e+02, + }; + + static const T RP[] = { + -8.99971225705559398224e+08, + +4.52228297998194034323e+11, + -7.27494245221818276015e+13, + +3.68295732863852883286e+15, + }; + + static const T RQ[] = { + +6.20836478118054335476e+02, + +2.56987256757748830383e+05, + +8.35146791431949253037e+07, + +2.21511595479792499675e+10, + +4.74914122079991414898e+12, + +7.84369607876235854894e+14, + +8.95222336184627338078e+16, + +5.32278620332680085395e+18, + }; + + if (x < T(0.0)) { + return -bessel_j1_forward(-x); + } + + if (x <= T(5.0)) { + T rp = 0.0; + + for (uint8_t index = 0; index <= 3; index++) { + rp = rp * (x * x) + RP[index]; + } + + T rq = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + rq = rq * (x * x) + RQ[index]; + } + + return rp / rq * x * (x * x - T(1.46819706421238932572e+01)) * (x * x - T(4.92184563216946036703e+01)); + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(5.0) / x * (T(5.0) / x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(5.0) / x * (T(5.0) / x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(5.0) / x * (T(5.0) / x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(5.0) / x * (T(5.0) / x)) + QQ[index]; + } + + return (pp / pq * cos(x - T(2.356194490192344928846982537459627163)) - T(5.0) / x * (qp / qq) * sin(x - T(2.356194490192344928846982537459627163))) * T(0.797884560802865355879892119868763737) / sqrt(x); + } // bessel_j1_forward(T x) +); // bessel_j1_string + +const auto bessel_y1_string = bessel_j1_string + jiterator_stringify( + template + T bessel_y1_forward(T x) { + static const T PP[] = { + +7.62125616208173112003e-04, + 
+7.31397056940917570436e-02, + +1.12719608129684925192e+00, + +5.11207951146807644818e+00, + +8.42404590141772420927e+00, + +5.21451598682361504063e+00, + +1.00000000000000000254e+00, + }; + + static const T PQ[] = { + +5.71323128072548699714e-04, + +6.88455908754495404082e-02, + +1.10514232634061696926e+00, + +5.07386386128601488557e+00, + +8.39985554327604159757e+00, + +5.20982848682361821619e+00, + +9.99999999999999997461e-01, + }; + + static const T QP[] = { + +5.10862594750176621635e-02, + +4.98213872951233449420e+00, + +7.58238284132545283818e+01, + +3.66779609360150777800e+02, + +7.10856304998926107277e+02, + +5.97489612400613639965e+02, + +2.11688757100572135698e+02, + +2.52070205858023719784e+01, + }; + + static const T QQ[] = { + +7.42373277035675149943e+01, + +1.05644886038262816351e+03, + +4.98641058337653607651e+03, + +9.56231892404756170795e+03, + +7.99704160447350683650e+03, + +2.82619278517639096600e+03, + +3.36093607810698293419e+02, + }; + + static const T YP[] = { + +1.26320474790178026440e+09, + -6.47355876379160291031e+11, + +1.14509511541823727583e+14, + -8.12770255501325109621e+15, + +2.02439475713594898196e+17, + -7.78877196265950026825e+17, + }; + + static const T YQ[] = { + +5.94301592346128195359e+02, + +2.35564092943068577943e+05, + +7.34811944459721705660e+07, + +1.87601316108706159478e+10, + +3.88231277496238566008e+12, + +6.20557727146953693363e+14, + +6.87141087355300489866e+16, + +3.97270608116560655612e+18, + }; + + if (x <= T(5.0)) { + if (x == T(0.0)) { + return NEG_INFINITY; + } + + if (x <= T(0.0)) { + return NAN; + } + + T yp = 0.0; + + for (uint8_t index = 0; index <= 5; index++) { + yp = yp * (x * x) + YP[index]; + } + + T yq = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + yq = yq * (x * x) + YQ[index]; + } + + return x * (yp / yq) + (T(0.636619772367581343075535053490057448) * (bessel_j1_forward(x) * log(x) - T(1.0) / x)); + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(5.0) / x * (T(5.0) / x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(5.0) / x * (T(5.0) / x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(5.0) / x * (T(5.0) / x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(5.0) / x * (T(5.0) / x)) + QQ[index]; + } + + return (pp / pq * sin(x - T(2.356194490192344928846982537459627163)) + T(5.0) / x * (qp / qq) * cos(x - T(2.356194490192344928846982537459627163))) * T(0.797884560802865355879892119868763737) / sqrt(x); + } // bessel_y1_forward(T x) +); // bessel_y1_string + +const auto chebyshev_polynomial_t_string = jiterator_stringify( + template + T chebyshev_polynomial_t_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0) || n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 6) && (abs(x) < T(1.0))) { + return cos(n * acos(x)); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x; + } + + T p = T(1.0); + T q = x; + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; + } // chebyshev_polynomial_t_forward(T x, int64_t n) + + template + T chebyshev_polynomial_t_forward(T x, T n) { + return chebyshev_polynomial_t_forward(x, static_cast(n)); + } // chebyshev_polynomial_t_forward(T x, T n) +); // chebyshev_polynomial_t_string + +const auto chebyshev_polynomial_u_string = jiterator_stringify( + 
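The Chebyshev strings above all rest on the same three-term recurrence, T_k(x) = 2x*T_{k-1}(x) - T_{k-2}(x), with the trigonometric identity T_n(x) = cos(n*acos(x)) used as a fast path for larger n on (-1, 1). A minimal host-side sketch of that recurrence, cross-checked against the identity (not part of the patch; names are illustrative):

#include <cmath>
#include <cstdio>

double chebyshev_t(double x, int n) {
  if (n == 0) return 1.0;
  if (n == 1) return x;
  double p = 1.0, q = x, r = 0.0;
  for (int k = 2; k <= n; ++k) {
    r = 2.0 * x * q - p;  // T_k = 2x*T_{k-1} - T_{k-2}
    p = q;
    q = r;
  }
  return r;
}

int main() {
  const double x = 0.3;
  for (int n : {2, 5, 11}) {
    std::printf("n=%2d recurrence=%.15f cos(n*acos(x))=%.15f\n",
                n, chebyshev_t(x, n), std::cos(n * std::acos(x)));
  }
  return 0;
}

The U, V, and W variants differ only in their first-order seed and in the endpoint special cases handled explicitly in the strings above.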
template + T chebyshev_polynomial_u_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0) || n % 2 == 0) { + return n + 1; + } + + return -(n + 1); + } + + if ((n > 8) && (abs(x) < T(1.0))) { + if (sin(acos(x)) != T(0.0)) { + return sin((n + 1) * acos(x)) / sin(acos(x)); + } + + return (n + 1) * cos((n + 1) * acos(x)) / x; + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x; + } + + T p = T(1.0); + T q = x + x; + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; + } // chebyshev_polynomial_u_forward(T x, int64_t n) + + template + T chebyshev_polynomial_u_forward(T x, T n) { + return chebyshev_polynomial_u_forward(x, static_cast(n)); + } // chebyshev_polynomial_u_forward(T x, T n) +); // chebyshev_polynomial_u_string + +const auto chebyshev_polynomial_v_string = jiterator_stringify( + template + T chebyshev_polynomial_v_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0)) { + return T(1.0); + } + + if (n % 2 == 0) { + return n + n + 1; + } + + return -(n + n + 1); + } + + if ((n > 8) && (abs(x) < T(1.0))) { + if (sin(acos(x) / T(2.0)) != T(1.0)) { + return cos((n + T(0.5)) * acos(x)) / cos(acos(x) / T(2.0)); + } + + if (n % 2 == 0) { + return n + n + 1; + } + + return -(n + n + 1); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0); + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; + } // chebyshev_polynomial_v_forward(T x, int64_t n) + + template + T chebyshev_polynomial_v_forward(T x, T n) { + return chebyshev_polynomial_v_forward(x, static_cast(n)); + } // chebyshev_polynomial_v_forward(T x, T n) +); // chebyshev_polynomial_v_string + +const auto chebyshev_polynomial_w_string = jiterator_stringify( + template + T chebyshev_polynomial_w_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0)) { + return n + n + 1; + } + + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 8) && (abs(x) < T(1.0))) { + if (cos(acos(x) / T(2.0)) != T(1.0)) { + return sin((n + T(0.5)) * acos(x)) / sin(acos(x) / T(2.0)); + } + + if (x > T(0.0)) { + return n + n + 1; + } + + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x + T(1.0); + } + + T p = T(1.0); + T q = x + x + T(1.0); + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; + } // chebyshev_polynomial_w_forward(T x, int64_t n) + + template + T chebyshev_polynomial_w_forward(T x, T n) { + return chebyshev_polynomial_w_forward(x, static_cast(n)); + } // chebyshev_polynomial_w_forward(T x, T n) +); // chebyshev_polynomial_w_string + +const auto hermite_polynomial_h_string = jiterator_stringify( + template + T hermite_polynomial_h_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x; + } + + T p = T(1.0); + T q = x + x; + T r = T(0.0); + + for (int64_t k = 2; k < n + n; k += 2) { + r = (x + x) * q - k * p; + p = q; + q = r; + } + + return r; + } // hermite_polynomial_h_forward(T x, int64_t n) + + template + T hermite_polynomial_h_forward(T x, T n) { + return hermite_polynomial_h_forward(x, static_cast(n)); + } // hermite_polynomial_h_forward(T x, T n) +); 
// hermite_polynomial_h_string + +const auto hermite_polynomial_he_string = jiterator_stringify( + template + T hermite_polynomial_he_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x; + } + + T p = T(1.0); + T q = x; + T r; + + for (int64_t k = 1; k < n; k++) { + r = x * q - k * p; + p = q; + q = r; + } + + return r; + } // hermite_polynomial_he_forward(T x, int64_t n) + + template + T hermite_polynomial_he_forward(T x, T n) { + return hermite_polynomial_he_forward(x, static_cast(n)); + } // hermite_polynomial_he_forward(T x, T n) +); // hermite_polynomial_he_string + +const auto laguerre_polynomial_l_string = jiterator_stringify( + template + T laguerre_polynomial_l_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(0.0)) { + return T(1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return T(1.0) - x; + } + + T p = T(1.0); + T q = T(1.0) - x; + T r; + + for (int64_t k = 1; k < n; k++) { + r = (((k + k) + (T(1.0) - x)) * q - k * p) / (k + 1); + p = q; + q = r; + } + + return r; + } // laguerre_polynomial_l_forward(T x, int64_t n) + + template + T laguerre_polynomial_l_forward(T x, T n) { + return laguerre_polynomial_l_forward(x, static_cast(n)); + } // laguerre_polynomial_l_forward(T x, T n) +); // laguerre_polynomial_l_string + +const auto legendre_polynomial_p_string = jiterator_stringify( + template + T legendre_polynomial_p_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0) || n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x; + } + + T p = T(1.0); + T q = x; + T r; + + for (int64_t k = 1; k < n; k++) { + r = ((k + k + 1) * x * q - k * p) / (k + 1); + p = q; + q = r; + } + + return r; + } // legendre_polynomial_p_forward(T x, int64_t n) + + template + T legendre_polynomial_p_forward(T x, T n) { + return legendre_polynomial_p_forward(x, static_cast(n)); + } // legendre_polynomial_p_forward(T x, T n) +); // legendre_polynomial_p_string + +const auto modified_bessel_i0_string = jiterator_stringify( + template + T modified_bessel_i0_forward(T x) { + static const T A[] = { + -4.41534164647933937950e-18, + +3.33079451882223809783e-17, + -2.43127984654795469359e-16, + +1.71539128555513303061e-15, + -1.16853328779934516808e-14, + +7.67618549860493561688e-14, + -4.85644678311192946090e-13, + +2.95505266312963983461e-12, + -1.72682629144155570723e-11, + +9.67580903537323691224e-11, + -5.18979560163526290666e-10, + +2.65982372468238665035e-09, + -1.30002500998624804212e-08, + +6.04699502254191894932e-08, + -2.67079385394061173391e-07, + +1.11738753912010371815e-06, + -4.41673835845875056359e-06, + +1.64484480707288970893e-05, + -5.75419501008210370398e-05, + +1.88502885095841655729e-04, + -5.76375574538582365885e-04, + +1.63947561694133579842e-03, + -4.32430999505057594430e-03, + +1.05464603945949983183e-02, + -2.37374148058994688156e-02, + +4.93052842396707084878e-02, + -9.49010970480476444210e-02, + +1.71620901522208775349e-01, + -3.04682672343198398683e-01, + +6.76795274409476084995e-01, + }; + + static const T B[] = { + -7.23318048787475395456e-18, + -4.83050448594418207126e-18, + +4.46562142029675999901e-17, + +3.46122286769746109310e-17, + -2.82762398051658348494e-16, + -3.42548561967721913462e-16, + +1.77256013305652638360e-15, + +3.81168066935262242075e-15, + -9.55484669882830764870e-15, + -4.15056934728722208663e-14, + 
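The Hermite (probabilists') and Legendre strings above use their own three-term recurrences: He_{k+1}(x) = x*He_k(x) - k*He_{k-1}(x), and Bonnet's recurrence (k+1)*P_{k+1}(x) = (2k+1)*x*P_k(x) - k*P_{k-1}(x). A short host-side sketch mirroring those loops, checked against the degree-3 closed forms He_3(x) = x^3 - 3x and P_3(x) = (5x^3 - 3x)/2 (illustrative only, not part of the patch):

#include <cstdio>

double hermite_he(double x, int n) {
  if (n == 0) return 1.0;
  if (n == 1) return x;
  double p = 1.0, q = x, r = 0.0;
  for (int k = 1; k < n; ++k) {  // He_{k+1} = x*He_k - k*He_{k-1}
    r = x * q - k * p;
    p = q;
    q = r;
  }
  return r;
}

double legendre_p(double x, int n) {
  if (n == 0) return 1.0;
  if (n == 1) return x;
  double p = 1.0, q = x, r = 0.0;
  for (int k = 1; k < n; ++k) {  // (k+1)P_{k+1} = (2k+1)x*P_k - k*P_{k-1}
    r = ((2.0 * k + 1.0) * x * q - k * p) / (k + 1.0);
    p = q;
    q = r;
  }
  return r;
}

int main() {
  const double x = 0.7;
  std::printf("He_3(%.1f): recurrence=%.12f closed form=%.12f\n",
              x, hermite_he(x, 3), x * x * x - 3.0 * x);
  std::printf("P_3(%.1f):  recurrence=%.12f closed form=%.12f\n",
              x, legendre_p(x, 3), 0.5 * (5.0 * x * x * x - 3.0 * x));
  return 0;
}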
+1.54008621752140982691e-14, + +3.85277838274214270114e-13, + +7.18012445138366623367e-13, + -1.79417853150680611778e-12, + -1.32158118404477131188e-11, + -3.14991652796324136454e-11, + +1.18891471078464383424e-11, + +4.94060238822496958910e-10, + +3.39623202570838634515e-09, + +2.26666899049817806459e-08, + +2.04891858946906374183e-07, + +2.89137052083475648297e-06, + +6.88975834691682398426e-05, + +3.36911647825569408990e-03, + +8.04490411014108831608e-01, + }; + + T p; + T q = 0.0; + + if (abs(x) <= T(8.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 30; index++) { + p = q; + q = a; + a = ((abs(x) / T(2.0)) - T(2.0)) * q - p + A[index]; + } + + return exp(abs(x)) * (T(0.5) * (a - p)); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(32.0) / abs(x) - T(2.0)) * q - p + B[index]; + } + + return exp(abs(x)) * (T(0.5) * (b - p)) / sqrt(abs(x)); + } // modified_bessel_i0_forward(T x) +); // modified_bessel_i0_string + +const auto modified_bessel_i1_string = jiterator_stringify( + template + T modified_bessel_i1_forward(T x) { + static const T A[] = { + +2.77791411276104639959e-18, + -2.11142121435816608115e-17, + +1.55363195773620046921e-16, + -1.10559694773538630805e-15, + +7.60068429473540693410e-15, + -5.04218550472791168711e-14, + +3.22379336594557470981e-13, + -1.98397439776494371520e-12, + +1.17361862988909016308e-11, + -6.66348972350202774223e-11, + +3.62559028155211703701e-10, + -1.88724975172282928790e-09, + +9.38153738649577178388e-09, + -4.44505912879632808065e-08, + +2.00329475355213526229e-07, + -8.56872026469545474066e-07, + +3.47025130813767847674e-06, + -1.32731636560394358279e-05, + +4.78156510755005422638e-05, + -1.61760815825896745588e-04, + +5.12285956168575772895e-04, + -1.51357245063125314899e-03, + +4.15642294431288815669e-03, + -1.05640848946261981558e-02, + +2.47264490306265168283e-02, + -5.29459812080949914269e-02, + +1.02643658689847095384e-01, + -1.76416518357834055153e-01, + +2.52587186443633654823e-01, + }; + + static const T B[] = { + +7.51729631084210481353e-18, + +4.41434832307170791151e-18, + -4.65030536848935832153e-17, + -3.20952592199342395980e-17, + +2.96262899764595013876e-16, + +3.30820231092092828324e-16, + -1.88035477551078244854e-15, + -3.81440307243700780478e-15, + +1.04202769841288027642e-14, + +4.27244001671195135429e-14, + -2.10154184277266431302e-14, + -4.08355111109219731823e-13, + -7.19855177624590851209e-13, + +2.03562854414708950722e-12, + +1.41258074366137813316e-11, + +3.25260358301548823856e-11, + -1.89749581235054123450e-11, + -5.58974346219658380687e-10, + -3.83538038596423702205e-09, + -2.63146884688951950684e-08, + -2.51223623787020892529e-07, + -3.88256480887769039346e-06, + -1.10588938762623716291e-04, + -9.76109749136146840777e-03, + +7.78576235018280120474e-01, + }; + + T p; + T q = 0.0; + + if (abs(x) <= T(8.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 29; index++) { + p = q; + q = a; + a = ((abs(x) / T(2.0)) - T(2.0)) * q - p + A[index]; + } + + if (x < T(0.0)) { + return -(T(0.5) * (a - p) * abs(x) * exp(abs(x))); + } + + return T(0.5) * (a - p) * abs(x) * exp(abs(x)); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(32.0) / abs(x) - T(2.0)) * q - p + B[index]; + } + + if (x < T(0.0)) { + return -(exp(abs(x)) * (T(0.5) * (b - p)) / sqrt(abs(x))); + } + + return exp(abs(x)) * (T(0.5) * (b - p)) / sqrt(abs(x)); + } // modified_bessel_i1_forward(T x) +); // modified_bessel_i1_string + +const auto 
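The modified Bessel strings above inline the Cephes-style Chebyshev series evaluation: a backward Clenshaw recurrence b_k = t*b_{k+1} - b_{k+2} + c_k over coefficients stored from highest to lowest order, returning 0.5*(b_0 - b_2). The argument mapping in the kernels is t = |x|/2 - 2 for |x| <= 8 and t = 32/|x| - 2 (or 8/x - 2 for the K functions) beyond that, with exp(|x|) and 1/sqrt(|x|) scaling applied outside the loop. A generic host-side version of just the inner loop, for reference (names and the demo coefficients are illustrative, not from the patch):

#include <cstdio>

// Cephes-style "chbevl" loop, as inlined in the Bessel strings above.
// Expects n >= 2; coefficients run from highest to lowest order.
template <typename T>
T chbevl(T t, const T* coeffs, int n) {
  T p = T(0);       // b_{k+2}
  T q = T(0);       // b_{k+1}
  T a = coeffs[0];  // b_k
  for (int i = 1; i < n; ++i) {
    p = q;
    q = a;
    a = t * q - p + coeffs[i];
  }
  return T(0.5) * (a - p);  // 0.5 * (b_0 - b_2)
}

int main() {
  const double demo[] = {0.25, -0.5, 1.0};   // arbitrary demo coefficients
  double t = 1.5 / 2.0 - 2.0;                // small-argument mapping at x = 1.5
  std::printf("chbevl(t, demo, 3) = %.12f\n", chbevl(t, demo, 3));
  return 0;
}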
modified_bessel_k0_string = modified_bessel_i0_string + jiterator_stringify( + template + T modified_bessel_k0_forward(T x) { + static const T A[] = { + +1.37446543561352307156e-16, + +4.25981614279661018399e-14, + +1.03496952576338420167e-11, + +1.90451637722020886025e-09, + +2.53479107902614945675e-07, + +2.28621210311945178607e-05, + +1.26461541144692592338e-03, + +3.59799365153615016266e-02, + +3.44289899924628486886e-01, + -5.35327393233902768720e-01, + }; + + static const T B[] = { + +5.30043377268626276149e-18, + -1.64758043015242134646e-17, + +5.21039150503902756861e-17, + -1.67823109680541210385e-16, + +5.51205597852431940784e-16, + -1.84859337734377901440e-15, + +6.34007647740507060557e-15, + -2.22751332699166985548e-14, + +8.03289077536357521100e-14, + -2.98009692317273043925e-13, + +1.14034058820847496303e-12, + -4.51459788337394416547e-12, + +1.85594911495471785253e-11, + -7.95748924447710747776e-11, + +3.57739728140030116597e-10, + -1.69753450938905987466e-09, + +8.57403401741422608519e-09, + -4.66048989768794782956e-08, + +2.76681363944501510342e-07, + -1.83175552271911948767e-06, + +1.39498137188764993662e-05, + -1.28495495816278026384e-04, + +1.56988388573005337491e-03, + -3.14481013119645005427e-02, + +2.44030308206595545468e+00, + }; + + if (x == T(0.0)) { + return INFINITY; + } + + if (x < T(0.0)) { + return NAN; + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 10; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return T(0.5) * (a - p) - log(0.5 * x) * modified_bessel_i0_forward(x); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return exp(-x) * (T(0.5) * (b - p)) / sqrt(x); + } // modified_bessel_k0_forward(T x) +); // modified_bessel_k0_string + +const auto scaled_modified_bessel_k0_string = modified_bessel_i0_string + jiterator_stringify( + template + T scaled_modified_bessel_k0_forward(T x) { + static const T A[] = { + +1.37446543561352307156e-16, + +4.25981614279661018399e-14, + +1.03496952576338420167e-11, + +1.90451637722020886025e-09, + +2.53479107902614945675e-07, + +2.28621210311945178607e-05, + +1.26461541144692592338e-03, + +3.59799365153615016266e-02, + +3.44289899924628486886e-01, + -5.35327393233902768720e-01, + }; + + static const T B[] = { + +5.30043377268626276149e-18, + -1.64758043015242134646e-17, + +5.21039150503902756861e-17, + -1.67823109680541210385e-16, + +5.51205597852431940784e-16, + -1.84859337734377901440e-15, + +6.34007647740507060557e-15, + -2.22751332699166985548e-14, + +8.03289077536357521100e-14, + -2.98009692317273043925e-13, + +1.14034058820847496303e-12, + -4.51459788337394416547e-12, + +1.85594911495471785253e-11, + -7.95748924447710747776e-11, + +3.57739728140030116597e-10, + -1.69753450938905987466e-09, + +8.57403401741422608519e-09, + -4.66048989768794782956e-08, + +2.76681363944501510342e-07, + -1.83175552271911948767e-06, + +1.39498137188764993662e-05, + -1.28495495816278026384e-04, + +1.56988388573005337491e-03, + -3.14481013119645005427e-02, + +2.44030308206595545468e+00, + }; + + if (x == T(0.0)) { + return INFINITY; + } + + if (x < T(0.0)) { + return NAN; + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 10; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return (T(0.5) * (a - p) - log(T(0.5) * x) * modified_bessel_i0_forward(x)) * exp(x); + } + + T b = B[0]; + + for 
(uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return T(0.5) * (b - p) / sqrt(x); + } // T scaled_modified_bessel_k0_forward(T x) +); // scaled_modified_bessel_k0_string + +const auto modified_bessel_k1_string = modified_bessel_i1_string + jiterator_stringify( + template + T modified_bessel_k1_forward(T x) { + static const T A[] = { + -7.02386347938628759343e-18, + -2.42744985051936593393e-15, + -6.66690169419932900609e-13, + -1.41148839263352776110e-10, + -2.21338763073472585583e-08, + -2.43340614156596823496e-06, + -1.73028895751305206302e-04, + -6.97572385963986435018e-03, + -1.22611180822657148235e-01, + -3.53155960776544875667e-01, + +1.52530022733894777053e+00, + }; + + static const T B[] = { + -5.75674448366501715755e-18, + +1.79405087314755922667e-17, + -5.68946255844285935196e-17, + +1.83809354436663880070e-16, + -6.05704724837331885336e-16, + +2.03870316562433424052e-15, + -7.01983709041831346144e-15, + +2.47715442448130437068e-14, + -8.97670518232499435011e-14, + +3.34841966607842919884e-13, + -1.28917396095102890680e-12, + +5.13963967348173025100e-12, + -2.12996783842756842877e-11, + +9.21831518760500529508e-11, + -4.19035475934189648750e-10, + +2.01504975519703286596e-09, + -1.03457624656780970260e-08, + +5.74108412545004946722e-08, + -3.50196060308781257119e-07, + +2.40648494783721712015e-06, + -1.93619797416608296024e-05, + +1.95215518471351631108e-04, + -2.85781685962277938680e-03, + +1.03923736576817238437e-01, + +2.72062619048444266945e+00, + }; + + if (x == T(0.0)) { + return INFINITY; + } + + if (x < T(0.0)) { + return NAN; + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 11; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return log(T(0.5) * x) * modified_bessel_i1_forward(x) + T(0.5) * (a - p) / x; + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return exp(-x) * (T(0.5) * (b - p)) / sqrt(x); + } // modified_bessel_k1_forward(T x) +); // modified_bessel_k1_string + +const auto scaled_modified_bessel_k1_string = modified_bessel_i1_string + jiterator_stringify( + template + T scaled_modified_bessel_k1_forward(T x) { + static const T A[] = { + -7.02386347938628759343e-18, + -2.42744985051936593393e-15, + -6.66690169419932900609e-13, + -1.41148839263352776110e-10, + -2.21338763073472585583e-08, + -2.43340614156596823496e-06, + -1.73028895751305206302e-04, + -6.97572385963986435018e-03, + -1.22611180822657148235e-01, + -3.53155960776544875667e-01, + +1.52530022733894777053e+00, + }; + + static const T B[] = { + -5.75674448366501715755e-18, + +1.79405087314755922667e-17, + -5.68946255844285935196e-17, + +1.83809354436663880070e-16, + -6.05704724837331885336e-16, + +2.03870316562433424052e-15, + -7.01983709041831346144e-15, + +2.47715442448130437068e-14, + -8.97670518232499435011e-14, + +3.34841966607842919884e-13, + -1.28917396095102890680e-12, + +5.13963967348173025100e-12, + -2.12996783842756842877e-11, + +9.21831518760500529508e-11, + -4.19035475934189648750e-10, + +2.01504975519703286596e-09, + -1.03457624656780970260e-08, + +5.74108412545004946722e-08, + -3.50196060308781257119e-07, + +2.40648494783721712015e-06, + -1.93619797416608296024e-05, + +1.95215518471351631108e-04, + -2.85781685962277938680e-03, + +1.03923736576817238437e-01, + +2.72062619048444266945e+00, + }; + + if (x == T(0.0)) { + return INFINITY; + } + + if (x 
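The "scaled" K0/K1 variants above return exp(x)*K_nu(x) rather than K_nu(x) itself; the point is numerical range, since K_nu(x) decays like exp(-x) and underflows in double precision well before x reaches the values these kernels accept. A tiny host-side illustration using only the leading asymptotic term K0(x) ~ sqrt(pi/(2x))*exp(-x) (a sketch, not part of the patch):

#include <cmath>
#include <cstdio>

int main() {
  const double pi = 3.14159265358979323846;
  for (double x : {50.0, 500.0, 800.0}) {
    // Leading asymptotic term of K0(x); the scaled form drops the exp(-x) factor.
    double k0_leading        = std::sqrt(pi / (2.0 * x)) * std::exp(-x);
    double scaled_k0_leading = std::sqrt(pi / (2.0 * x));
    std::printf("x=%5g  K0~%g  exp(x)*K0~%g\n", x, k0_leading, scaled_k0_leading);
  }
  return 0;
}

At x = 800 the unscaled value underflows to zero while the scaled value remains a perfectly ordinary double, which is exactly the situation the scaled kernels are for.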
< T(0.0)) { + return NAN; + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 11; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return (log(T(0.5) * x) * modified_bessel_i1_forward(x) + T(0.5) * (a - p) / x) * exp(x); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return (T(0.5) * (b - p) / sqrt(x)); + } // T scaled_modified_bessel_k1_forward(T x) +); // scaled_modified_bessel_k1_string + +const auto shifted_chebyshev_polynomial_t_string = jiterator_stringify( + template + T shifted_chebyshev_polynomial_t_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return T(1.0); + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 6) && (abs(x + x - T(1.0)) < T(1.0))) { + return cos(n * acos(x + x - T(1.0))); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0); + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; + } // shifted_chebyshev_polynomial_t_forward(T x, int64_t n) + + template + T shifted_chebyshev_polynomial_t_forward(T x, T n) { + return shifted_chebyshev_polynomial_t_forward(x, static_cast(n)); + } // shifted_chebyshev_polynomial_t_forward(T x, T n) +); // shifted_chebyshev_polynomial_t_string + +const auto shifted_chebyshev_polynomial_u_string = jiterator_stringify( + template + T shifted_chebyshev_polynomial_u_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return n + 1; + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return n + 1; + } + + return -(n + 1); + } + + if ((n > 6) && (abs(x + x - T(1.0)) < T(1.0))) { + if (sin(acos(x + x - T(1.0))) != T(0.0)) { + return sin((n + 1) * acos(x + x - T(1.0))) / sin(acos(x + x - T(1.0))); + } + + return (n + 1) * cos((n + 1) * acos(x + x - T(1.0))) / (x + x - T(1.0)); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0) + (x + x - T(1.0)); + } + + T p = T(1.0); + T q = x + x - T(1.0) + (x + x - T(1.0)); + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; + } // shifted_chebyshev_polynomial_u_forward(T x, int64_t n) + + template + T shifted_chebyshev_polynomial_u_forward(T x, T n) { + return shifted_chebyshev_polynomial_u_forward(x, static_cast(n)); + } // shifted_chebyshev_polynomial_u_forward(T x, T n) +); // shifted_chebyshev_polynomial_u_string + +const auto shifted_chebyshev_polynomial_v_string = jiterator_stringify( + template + T shifted_chebyshev_polynomial_v_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return T(1.0); + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return (n + n + 1); + } + + return -(n + n + 1); + } + + if ((n > 6) && (abs(x + x - T(1.0)) < T(1.0))) { + if (sin(acos(x + x - T(1.0)) / T(2.0)) != T(1.0)) { + return cos(((n) + T(0.5)) * acos(x + x - T(1.0))) / cos(acos(x + x - T(1.0)) / T(2.0)); + } + + if (n % 2 == 0) { + return n + n + 1; + } + + return -(n + n + 1); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0) + (x + x - T(1.0)) - T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0) + (x + x - T(1.0)) - T(1.0); + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x 
- T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; + } // shifted_chebyshev_polynomial_v_forward(T x, int64_t n) + + template + T shifted_chebyshev_polynomial_v_forward(T x, T n) { + return shifted_chebyshev_polynomial_v_forward(x, static_cast(n)); + } // shifted_chebyshev_polynomial_v_forward(T x, T n) +); // shifted_chebyshev_polynomial_v_string + +const auto shifted_chebyshev_polynomial_w_string = jiterator_stringify( + template + T shifted_chebyshev_polynomial_w_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return n + n + 1; + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 4) && (abs(x + x - T(1.0)) < T(1.0))) { + if (cos(acos(x + x - T(1.0)) / T(2.0)) != T(1.0)) { + return sin((n + T(0.5)) * acos(x + x - T(1.0))) / sin(acos(x + x - T(1.0)) / T(2.0)); + } + + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0) + (x + x - T(1.0)) + T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0) + (x + x - T(1.0)) + T(1.0); + T r; + + for (int64_t k = 2; k <= n; k++) { + r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; + } // shifted_chebyshev_polynomial_w_forward(T x, int64_t n) + + template + T shifted_chebyshev_polynomial_w_forward(T x, T n) { + return shifted_chebyshev_polynomial_w_forward(x, static_cast(n)); + } // shifted_chebyshev_polynomial_w_forward(T x, T n) +); // shifted_chebyshev_polynomial_w_string + +const auto spherical_bessel_j0_string = jiterator_stringify( + template + T spherical_bessel_j0_forward(T x) { + if (isinf(x)) { + return T(0.0); + } + + if (abs(x) < T(0.5)) { + return T(1.0) + x * x * (T(-1.0) / T(6.0) + x * x * (T(1.0) / T(120.0) + x * x * (T(-1.0) / T(5040.0) + x * x * (T(1.0) / T(362880.0) + x * x * (T(-1.0) / T(39916800.0) + x * x * (T(1.0) / T(6227020800.0))))))); + } + + return sin(x) / x; + } // T spherical_bessel_j0_forward(T x) +); // spherical_bessel_j0_string + +} // namespace native +} // namespace at \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/PersistentSoftmax.cuh b/aten/src/ATen/native/zoom/PersistentSoftmax.cuh new file mode 100644 index 00000000000000..64919d846c41eb --- /dev/null +++ b/aten/src/ATen/native/zoom/PersistentSoftmax.cuh @@ -0,0 +1,402 @@ +#include +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace { + +int log2_ceil(int value) { + int log2_value = 0; + while ((1 << log2_value) < value) ++log2_value; + return log2_value; +} + +template +struct Add { + __device__ __forceinline__ T operator()(T a, T b) const { + return a + b; + } +}; + +template +struct Max { + __device__ __forceinline__ T operator()(T a, T b) const { + return a < b ? b : a; + } +}; + +template class ReduceOp> +__device__ __forceinline__ void warp_reduce(acc_t* sum) { + ReduceOp r; + #pragma unroll + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + acc_t b = WARP_SHFL_XOR(sum[i], offset, WARP_SIZE); + sum[i] = r(sum[i], b); + } + } +} + +// The softmax_warp_* methods perform softmax forward and backward propagation on samples spanning the fast dimension. +// Each sample contains element_count scalar elements. element_count can be any integer value <= 1024. +// The template arguments have the following meaning: +// One "WARP" works on one "BATCH". One "BATCH" contains "WARP_BATCH" samples. 
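The warp_reduce helper above performs an XOR-butterfly all-reduce: for offsets WARP_SIZE/2, WARP_SIZE/4, ..., 1, each lane combines its value with the value held by lane (i ^ offset), so after log2(WARP_SIZE) rounds every lane holds the reduction over the whole warp. A minimal host-side simulation of that access pattern (illustrative only; the real kernel exchanges values with WARP_SHFL_XOR on the device):

#include <cstdio>
#include <vector>

int main() {
  constexpr int kWarpSize = 64;  // ROCm wavefront width; use 32 to model a CUDA warp
  std::vector<double> lane(kWarpSize);
  for (int i = 0; i < kWarpSize; ++i) lane[i] = i + 1;  // expected sum: 64*65/2 = 2080

  for (int offset = kWarpSize / 2; offset > 0; offset /= 2) {
    std::vector<double> next(kWarpSize);
    for (int i = 0; i < kWarpSize; ++i) {
      next[i] = lane[i] + lane[i ^ offset];  // what the shuffle exchanges each round
    }
    lane.swap(next);
  }
  // Every "lane" now holds the full warp sum.
  std::printf("lane[0]=%g lane[63]=%g\n", lane[0], lane[kWarpSize - 1]);
  return 0;
}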
+// WARP_BATCH is equal to 1 when element_count is large, and > 1 when element_count is small. +// A "WARP" contains "C10_WARPS_SIZE" threads, these treads are guaranteed to belong to the same warp. +// This is important because it means only __shfl_ instructions are required for reductions. +// Note that this means WARP_SIZE must be a power of two and <= architecture warp size. +// CUDA warp size is 32 for all existing GPU architectures, but there is no guarantee this will not change for future arch. +// ROCm warp size is 64 for all currently ROCm-supported GPU architectures, but this may change for future archs. +// is_log_softmax is a flag indicating whether SoftMax or LogSoftMax should be computed. +// is_masked is a flag indicating whether SoftMax or MaskedSoftMax should be computed. +// The template can be instantiated with any floating point type for the type arguments input_t, output_t and acc_t. +// This allows SoftMax to be fused with a cast immediately following the SoftMax. +// The mask should have the same shape as input, with a boolean indicate if the value is masked. +// The head_chunk_size is only used for transformer mask softmax, equals to H * D * D. +// For instance: +// input_t=half, acc_t=float, output_t=half => read half tensor, float accumulators, write half tensor. +// input_t=half, acc_t=float, output_t=float => read half tensor, float accumulators, write float tensor. +// input_t_float, acc_t=float, output_t=half => read float tensor, float accumulators, write half tensor. + +template +__global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batch_size, int stride, int element_count, const bool *mask = nullptr, const int head_chunk_size = -1, bool is_transformer_mask = false) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_forward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + + int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; + + // batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = batch_size - first_batch; + if (local_batches > WARP_BATCH) + local_batches = WARP_BATCH; + + // there might be multiple batches per warp. compute the index within the batch + int local_idx = threadIdx.x; + int idx_offset = first_batch * stride + local_idx; + + src += idx_offset; + dst += idx_offset; + + if (is_transformer_mask) { + mask += ((first_batch * stride) / head_chunk_size) * stride + local_idx; + } else { + mask += idx_offset; + } + // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified to one loop, + // but I think doing so would obfuscate the logic of the algorithm, thus I chose to keep + // the nested loops. + // This should have no impact on performance because the loops are unrolled anyway. + + // load data from global memory + acc_t elements[WARP_BATCH][WARP_ITERATIONS]; + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 
0 : element_count; + for (int it = 0; it < WARP_ITERATIONS; ++it) { + int element_index = local_idx + it * WARP_SIZE; + if (element_index < batch_element_count) { + elements[i][it] = src[i*element_count+it*WARP_SIZE]; + } else { + elements[i][it] = -std::numeric_limits::infinity(); + } + } + } + + // compute max_value + acc_t max_value[WARP_BATCH]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + bool is_meaningful_max = false; + max_value[i] = elements[i][0]; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + if (is_masked) { + int idx = it*WARP_SIZE; + if ((idx + local_idx) < batch_element_count) { + if (!is_transformer_mask) { + idx += i*element_count; + } + if (!mask[idx]) { + max_value[i] = (is_meaningful_max && max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it]; + is_meaningful_max = true; + } + } + } else { + max_value[i] = max_value[i] > elements[i][it] ? max_value[i] : elements[i][it]; + } + } + if (is_masked) { + if (!is_meaningful_max) { + max_value[i] = -std::numeric_limits::infinity(); + } + } + } + warp_reduce(max_value); + + acc_t sum[WARP_BATCH] { 0.0f }; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + if (!is_masked) { + if (is_log_softmax) { + sum[i] += ::exp(elements[i][it] - max_value[i]); + } else { + elements[i][it] = ::exp(elements[i][it] - max_value[i]); + sum[i] += elements[i][it]; + } + } else { + int idx = it*WARP_SIZE; + bool valid = (idx + local_idx) < batch_element_count; + if (!is_transformer_mask) { + idx += i*element_count; + } + if (valid) { + if (!mask[idx]) { + if (is_log_softmax) { + sum[i] += ::exp(elements[i][it] - max_value[i]); + } else { + elements[i][it] = ::exp(elements[i][it] - max_value[i]); + sum[i] += elements[i][it]; + } + } else { + if (!is_log_softmax) { + // Masked values are treated as -infinity, and ::exp(-infinity) is 0. + elements[i][it] = 0; + } + } + } else { + if (!is_log_softmax) { + elements[i][it] = 0.; + } + } + } + } + } + warp_reduce(sum); + + // store result + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) + break; + if (is_log_softmax) sum[i] = ::log(sum[i]); + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + int element_index = local_idx + it * WARP_SIZE; + if (element_index < element_count) { + if (is_log_softmax) { + dst[i*element_count+it*WARP_SIZE] = elements[i][it] - max_value[i] - sum[i]; + } else if (sum[i] == 0) { + dst[i*element_count+it*WARP_SIZE] = std::numeric_limits::quiet_NaN(); + } else { + dst[i*element_count+it*WARP_SIZE] = elements[i][it] / sum[i]; + } + } else { + break; + } + } + } +} + +template +__global__ void softmax_warp_backward(output_t *gradInput, const input_t *grad, const input_t *output, int batch_size, int stride, int element_count, const bool *mask = nullptr) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_backward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 
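Numerically, softmax_warp_forward above is the standard max-shifted formulation carried out per row in registers: find the row maximum, exponentiate the shifted values, reduce the sum, then either divide (softmax) or subtract max and log(sum) from the raw inputs (log-softmax). A single-row host sketch of the same arithmetic, assuming nothing beyond the standard library (not part of the patch):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  std::vector<double> row = {1.0, 2.0, 3.0, 1000.0};  // the large value shows why the max shift matters
  double max_value = row[0];
  for (double v : row) max_value = std::max(max_value, v);

  double sum = 0.0;
  std::vector<double> shifted_exp(row.size());
  for (std::size_t i = 0; i < row.size(); ++i) {
    shifted_exp[i] = std::exp(row[i] - max_value);  // exponent <= 0, so no overflow
    sum += shifted_exp[i];
  }

  for (std::size_t i = 0; i < row.size(); ++i) {
    double softmax = shifted_exp[i] / sum;
    double log_softmax = (row[i] - max_value) - std::log(sum);  // same ordering as the kernel's store
    std::printf("softmax=%-12g log_softmax=%g\n", softmax, log_softmax);
  }
  return 0;
}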
2 : 1; + + int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; + + // batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = batch_size - first_batch; + if (local_batches > WARP_BATCH) + local_batches = WARP_BATCH; + + // there might be multiple batches per warp. compute the index within the batch + int local_idx = threadIdx.x % WARP_SIZE; + + // the first element to process by the current thread + int thread_offset = first_batch * stride + local_idx; + grad += thread_offset; + output += thread_offset; + gradInput += thread_offset; + if (is_masked) { + mask += thread_offset; + } + + // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified to one loop, + // but I think doing so would obfuscate the logic of the algorithm, thus I chose to keep + // the nested loops. + // This should have no impact on performance because the loops are unrolled anyway. + + // load data from global memory + acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS]; + acc_t output_reg[WARP_BATCH][WARP_ITERATIONS]; + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + for (int it = 0; it < WARP_ITERATIONS; ++it) { + int element_index = local_idx + it * WARP_SIZE; + if (element_index < batch_element_count) { + grad_reg[i][it] = grad[i*element_count+it*WARP_SIZE]; + output_reg[i][it] = output[i*element_count+it*WARP_SIZE]; + } else { + grad_reg[i][it] = acc_t(0); + output_reg[i][it] = acc_t(0); + } + } + } + + acc_t sum[WARP_BATCH] { 0.0f }; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + if (!is_masked || !mask[i*element_count+it*WARP_SIZE]) { + sum[i] += grad_reg[i][it]; + } + } + } + warp_reduce(sum); + + // store result + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) + break; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + int element_index = local_idx + it * WARP_SIZE; + if (element_index < element_count) { + if (is_masked && mask[i*element_count+it*WARP_SIZE]) { + gradInput[i*element_count+it*WARP_SIZE] = 0; + } + // compute gradients + else if (is_log_softmax) { + gradInput[i*element_count+it*WARP_SIZE] = (grad_reg[i][it] - ::exp(output_reg[i][it]) * sum[i]); + } else { + gradInput[i*element_count+it*WARP_SIZE] = (grad_reg[i][it] - output_reg[i][it] * sum[i]); + } + } + } + } +} + +} // end of anonymous namespace + +template +void dispatch_softmax_forward(output_t *dst, const input_t *src, int softmax_elements, int softmax_elements_stride, int batch_count, const bool *mask = nullptr, int chunk_size = -1, bool is_transformer_mask = false) +{ + TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 1024 ); + if (softmax_elements == 0) { + return; + } else { + int log2_elements = log2_ceil(softmax_elements); + const int next_power_of_two = 1 << log2_elements; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward. + int warp_size = at::zoom::warp_size(); + warp_size = (next_power_of_two < warp_size) ? next_power_of_two : warp_size; + + // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward. + int batches_per_warp = (next_power_of_two <= 128) ? 
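For the is_log_softmax path, softmax_warp_backward above applies the identity dx_i = dy_i - exp(y_i) * sum_j dy_j, where y is the log-softmax output and dy the incoming gradient. A small host-side finite-difference check of that formula for a single unmasked row (a sketch with illustrative names, not part of the patch):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static std::vector<double> log_softmax(const std::vector<double>& x) {
  double m = x[0];
  for (double v : x) m = std::max(m, v);
  double sum = 0.0;
  for (double v : x) sum += std::exp(v - m);
  std::vector<double> y(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) y[i] = x[i] - m - std::log(sum);
  return y;
}

int main() {
  std::vector<double> x  = {0.2, -1.3, 2.4, 0.7};
  std::vector<double> dy = {0.5, -0.1, 0.3, 0.8};  // upstream gradient
  std::vector<double> y  = log_softmax(x);

  double dy_sum = 0.0;
  for (double g : dy) dy_sum += g;

  const double eps = 1e-6;
  for (std::size_t i = 0; i < x.size(); ++i) {
    double analytic = dy[i] - std::exp(y[i]) * dy_sum;  // the kernel's formula

    // central finite difference of L(x) = sum_j dy_j * log_softmax(x)_j
    std::vector<double> xp = x, xm = x;
    xp[i] += eps;
    xm[i] -= eps;
    std::vector<double> yp = log_softmax(xp), ym = log_softmax(xm);
    double lp = 0.0, lm = 0.0;
    for (std::size_t j = 0; j < x.size(); ++j) { lp += dy[j] * yp[j]; lm += dy[j] * ym[j]; }
    double numeric = (lp - lm) / (2.0 * eps);

    std::printf("i=%zu analytic=%+.9f numeric=%+.9f\n", i, analytic, numeric);
  }
  return 0;
}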
2 : 1; + + // use 128 threads per block to maximize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + int blocks = (batch_count + batches_per_block - 1) / batches_per_block; + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + #define LAUNCH_SOFTMAX_WARP_FORWARD(L2E) case L2E: \ + hipLaunchKernelGGL(( softmax_warp_forward) \ + , dim3(blocks), dim3(threads), 0, c10::zoom::getCurrentZoomStream(), dst, \ + src, batch_count, softmax_elements_stride, softmax_elements, mask, chunk_size, is_transformer_mask); \ + C10_ZOOM_KERNEL_LAUNCH_CHECK(); \ + break; + + LAUNCH_SOFTMAX_WARP_FORWARD(0); // 1 + LAUNCH_SOFTMAX_WARP_FORWARD(1); // 2 + LAUNCH_SOFTMAX_WARP_FORWARD(2); // 4 + LAUNCH_SOFTMAX_WARP_FORWARD(3); // 8 + LAUNCH_SOFTMAX_WARP_FORWARD(4); // 16 + LAUNCH_SOFTMAX_WARP_FORWARD(5); // 32 + LAUNCH_SOFTMAX_WARP_FORWARD(6); // 64 + LAUNCH_SOFTMAX_WARP_FORWARD(7); // 128 + LAUNCH_SOFTMAX_WARP_FORWARD(8); // 256 + LAUNCH_SOFTMAX_WARP_FORWARD(9); // 512 + LAUNCH_SOFTMAX_WARP_FORWARD(10); ; // 1024 + default: + break; + } + } +} + +template +void dispatch_softmax_backward(output_t *grad_input, const input_t *grad, const input_t *output, int softmax_elements, int softmax_elements_stride, int batch_count, const bool *mask = nullptr) +{ + TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 1024 ); + if (softmax_elements == 0) { + return; + } else { + int log2_elements = log2_ceil(softmax_elements); + const int next_power_of_two = 1 << log2_elements; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward. + int warp_size = at::zoom::warp_size(); + warp_size = (next_power_of_two < warp_size) ? next_power_of_two : warp_size; + + // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward. + int batches_per_warp = (next_power_of_two <= 128) ? 
2 : 1; + + // use 128 threads per block to maximize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + int blocks = (batch_count + batches_per_block - 1) / batches_per_block; + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + #define LAUNCH_SOFTMAX_WARP_BACKWARD(L2E) case L2E: \ + hipLaunchKernelGGL(( softmax_warp_backward) \ + , dim3(blocks), dim3(threads), 0, c10::zoom::getCurrentZoomStream(), \ + grad_input, grad, output, batch_count, softmax_elements_stride, \ + softmax_elements, mask); \ + C10_ZOOM_KERNEL_LAUNCH_CHECK(); \ + break; + + LAUNCH_SOFTMAX_WARP_BACKWARD(0); // 1 + LAUNCH_SOFTMAX_WARP_BACKWARD(1); // 2 + LAUNCH_SOFTMAX_WARP_BACKWARD(2); // 4 + LAUNCH_SOFTMAX_WARP_BACKWARD(3); // 8 + LAUNCH_SOFTMAX_WARP_BACKWARD(4); // 16 + LAUNCH_SOFTMAX_WARP_BACKWARD(5); // 32 + LAUNCH_SOFTMAX_WARP_BACKWARD(6); // 64 + LAUNCH_SOFTMAX_WARP_BACKWARD(7); // 128 + LAUNCH_SOFTMAX_WARP_BACKWARD(8); // 256 + LAUNCH_SOFTMAX_WARP_BACKWARD(9); // 512 + LAUNCH_SOFTMAX_WARP_BACKWARD(10); // 1024 + default: + break; + } + } +} diff --git a/aten/src/ATen/native/zoom/Reduce.cuh b/aten/src/ATen/native/zoom/Reduce.cuh new file mode 100644 index 00000000000000..c22e4bd53f020d --- /dev/null +++ b/aten/src/ATen/native/zoom/Reduce.cuh @@ -0,0 +1,1354 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at { namespace native { + +using at::detail::Array; + +static inline int64_t div_up(int64_t a, int64_t b) { + return (a + b - 1) / b; +} + +// returns floor(log2(n)) +static inline int last_pow2(int n) { + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); + return std::max(1, n - (n >> 1)); +} + +// returns reduced fraction numerator & denominator +C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominator) { + // get GCD of num and denom using Euclid's algorithm. + // Can replace this with std::gcd if we ever support c++17. + size_t a = denominator; + size_t b = numerator; + while (b != 0) { + a %= b; + // swap(a,b) + size_t tmp = a; + a = b; + b = tmp; + } + + // a is now the GCD + numerator /= a; + denominator /= a; +} + +//template for changing MAX_NUM_THREADS based on op dtype +template +struct mnt_wrapper { + static constexpr int MAX_NUM_THREADS = 512; +}; + +template <> +struct mnt_wrapper >{ + static constexpr int MAX_NUM_THREADS = 256; +}; + +constexpr int max_reduce_threads(c10::ScalarType type) { + return type == kComplexDouble ? 
256 : 512; +} + +struct ReduceConfig { + static constexpr int BLOCK_X = 0; + static constexpr int BLOCK_Y = 1; + static constexpr int CTA = 2; + + static constexpr int input_vec_size = 4; + + ReduceConfig(int element_size_bytes, int num_outputs, int num_inputs) + : element_size_bytes(element_size_bytes) + , num_inputs(num_inputs) + , num_outputs(num_outputs) {} + int element_size_bytes; + int num_inputs; + int num_outputs; + int step_input = 1; + int step_output = 1; + int ctas_per_output = 1; + int input_mult[3] = {0, 0, 0}; + int output_mult[2] = {0, 0}; + + int block_width; + int block_height; + int num_threads; + + bool vectorize_input = false; + int output_vec_size = 1; + + template + void set_block_dimension(int64_t dim0, int64_t dim1) { + const int max_num_threads = mnt_wrapper::MAX_NUM_THREADS / output_vec_size; + int dim0_pow2 = dim0 < max_num_threads ? static_cast(last_pow2(dim0)) : max_num_threads; + int dim1_pow2 = dim1 < max_num_threads ? static_cast(last_pow2(dim1)) : max_num_threads; + block_width = std::min(dim0_pow2, int(at::zoom::warp_size())); + block_height = std::min(dim1_pow2, int(max_num_threads / block_width)); + block_width = std::min(dim0_pow2, int(max_num_threads / block_height)); + num_threads = block_width * block_height; + } + + int split_input(int parallelism) { + int step = step_input; + step_input *= parallelism; + return step; + } + + int split_output(int parallelism) { + int step = step_output; + step_output *= parallelism; + return step; + } + + dim3 block() const { + return dim3(block_width, block_height); + } + + dim3 grid() const { + return dim3(div_up(num_outputs / output_vec_size, step_output), ctas_per_output); + } + + C10_HOST_DEVICE bool should_block_x_reduce() const { + return input_mult[BLOCK_X] != 0; + } + + C10_HOST_DEVICE bool should_block_y_reduce() const { + return input_mult[BLOCK_Y] != 0; + } + + C10_HOST_DEVICE bool should_global_reduce() const { + return input_mult[CTA] != 0; + } + + C10_DEVICE bool should_store(int output_idx) const { + return output_idx < num_outputs && + (!should_block_x_reduce() || threadIdx.x == 0) && + (!should_block_y_reduce() || threadIdx.y == 0); + } + + C10_DEVICE bool should_reduce_tail() const { + return (!should_block_y_reduce() || threadIdx.y == 0) && + (!should_global_reduce() || blockIdx.y == 0); + } + + C10_HOST_DEVICE int input_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta2 = blockIdx.y; + return (lane * input_mult[BLOCK_X] + + warp * input_mult[BLOCK_Y] + + cta2 * input_mult[CTA]); + } + + template + C10_HOST_DEVICE int output_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta1 = blockIdx.x; + return (lane * output_mult[BLOCK_X] + + warp * output_mult[BLOCK_Y] + + cta1 * step_output) * output_vec_size; + } + + C10_DEVICE int shared_memory_offset(int offset) const { + return threadIdx.x + (threadIdx.y + offset) * blockDim.x; + } + + C10_DEVICE int staging_memory_offset(int cta2) const { + int offset = cta2 + blockIdx.x * gridDim.y; + if (!should_block_x_reduce()) { + offset = threadIdx.x + offset * blockDim.x; + } + return offset; + } + + int shared_memory_size() const { + if (!should_block_y_reduce() && + (!should_block_x_reduce() || + block_width <= at::zoom::warp_size())) { + return 0; + } + return element_size_bytes * num_threads * output_vec_size; + } + + int64_t global_memory_size() const { + if (!should_global_reduce()) { + return 0; + } + auto size = (int64_t)element_size_bytes * num_outputs * ctas_per_output; + if 
(!should_block_x_reduce()) { + size *= block().x * output_vec_size; + } + return size; + } + + int semaphore_size() const { + if (!should_global_reduce()) { + return 0; + } + return sizeof(int) * grid().x; + } + + int values_per_thread() const { + return div_up(num_inputs, step_input); + } +}; + +std::ostream& operator<<(std::ostream& out, const ReduceConfig& config); + +template +C10_LAUNCH_BOUNDS_2(nt, 4) +__global__ void reduce_kernel(R reduction) { + reduction.template run(); +} + +template +static OffsetCalculator<2, index_t> make_output_calculator(const TensorIterator& iter) { + int num_reduce_dims = iter.num_reduce_dims(); + int num_output_dims = iter.ndim() - num_reduce_dims; + int input_index = iter.ntensors() - 1; + int output_index = 0; + std::array strides = { + iter.strides(output_index).data() + num_reduce_dims, + iter.strides(input_index).data() + num_reduce_dims, + }; + auto shape = iter.shape().data() + num_reduce_dims; + return OffsetCalculator<2, index_t>(num_output_dims, shape, strides.data()); +} + +template +static OffsetCalculator<1, index_t> make_input_calculator(const TensorIterator& iter) { + int num_reduce_dims = iter.num_reduce_dims(); + int input_index = iter.ntensors() - 1; + std::array strides = { + iter.strides(input_index).data(), + }; + return OffsetCalculator<1, index_t>(num_reduce_dims, iter.shape().data(), strides.data()); +} + +template +struct func_wrapper_t { + using arg_t = typename binary_function_traits::arg1_t; + using scalar_t = typename binary_function_traits::arg2_t; + + func_t combine; + static inline __device__ out_scalar_t project(arg_t arg) { + return (out_scalar_t) arg; + } + static inline __device__ arg_t warp_shfl_down(arg_t arg, int offset) { + return WARP_SHFL_DOWN(arg, offset); + } + + static __device__ arg_t translate_idx(arg_t acc, int64_t /*idx*/) { + return acc; + } + + func_wrapper_t(const func_t& op) : combine(op) { + } + + // wrap a normal reduction that ignores the index + __device__ arg_t reduce(arg_t acc, scalar_t val, int64_t idx) const { + return combine(acc, val); + } +}; + +template +func_wrapper_t func_wrapper(const func_t& op) { + return func_wrapper_t { op }; +} + +template +struct ReduceJitOp { +//ReduceJitOp is almost like ReduceOp, but it doesn't have ops functor that specifies reduction operations +//Maybe we can find a way to unify ReduceOp and ReduceJitOp + using InputCalculator = OffsetCalculator<1, uint32_t>; + using OutputCalculator = OffsetCalculator<2, uint32_t>; + //TODO for now arg_t is always opmath_t of the input, later we'll need to change it + using arg_t = at::opmath_type; + + static constexpr int input_vec_size = ReduceConfig::input_vec_size; + //TODO - ReduceJitOp will probably need to be changed for reductions that need full functor, + //not just wrapper + arg_t ident; + ReduceConfig config; + InputCalculator input_calc; + OutputCalculator output_calc; + const void* src; + const char* dst[2]; //it accepts at most two destinations + // acc_buf used for accumulation among sub Tensor Iterator when accumulation on + // output is not permissible + void* acc_buf; + // cta_buf used for accumulation between blocks during global reduction + void* cta_buf; + int* semaphores; + int64_t base_idx; + bool accumulate; + bool final_output; + int noutputs; + + ReduceJitOp( + ReduceConfig config, + InputCalculator input_calc, + OutputCalculator output_calc, + const void* src, + char* dst0, + optional dst1, + void* acc_buf, + void* cta_buf, + int* semaphores, + arg_t ident, + int noutputs, + int64_t base_idx) + : 
ident(ident), + config(config), + input_calc(input_calc), + output_calc(output_calc), + src(src), + acc_buf(acc_buf), + cta_buf(cta_buf), + semaphores(semaphores), + base_idx(base_idx), + noutputs(noutputs) { + dst[0] = dst0; + if (dst1.has_value()) { + dst[1] = dst1.value(); + } + } +}; + +template +struct ReduceOp { + using traits = function_traits; + using arg_t = typename std::decay::type>::type; + + using InputCalculator = OffsetCalculator<1, index_t>; + using OutputCalculator = OffsetCalculator<2, index_t>; + + static constexpr bool can_accumulate_in_output = + std::is_convertible::value + && std::is_convertible::value; + + static constexpr int input_vec_size = ReduceConfig::input_vec_size; + + ops_t ops; + arg_t ident; + ReduceConfig config; + InputCalculator input_calc; + OutputCalculator output_calc; + const void* src; + const char* dst[2]; //it accepts at most two destinations + // acc_buf used for accumulation among sub Tensor Iterator when accumulation on + // output is not permissible + void* acc_buf; + // cta_buf used for accumulation between blocks during global reduction + void* cta_buf; + int* semaphores; + int64_t base_idx; + bool accumulate; + bool final_output; + int noutputs; + + ReduceOp( + ops_t ops, + ReduceConfig config, + InputCalculator input_calc, + OutputCalculator output_calc, + const void* src, + char* dst0, + optional dst1, + void* acc_buf, + void* cta_buf, + int* semaphores, + arg_t ident, + int noutputs, + int64_t base_idx) + : ops(ops), + ident(ident), + config(config), + input_calc(input_calc), + output_calc(output_calc), + src(src), + acc_buf(acc_buf), + cta_buf(cta_buf), + semaphores(semaphores), + base_idx(base_idx), + noutputs(noutputs) { + dst[0] = dst0; + if (dst1.has_value()) { + dst[1] = dst1.value(); + } + } + + template + C10_DEVICE void run() const { + extern __shared__ char shared_memory[]; + index_t output_idx = config.output_idx(); + index_t input_idx = config.input_idx(); + auto base_offsets1 = output_calc.get(output_idx)[1]; + + using arg_vec_t = at::detail::Array; + arg_vec_t value; + + if (output_idx < config.num_outputs && input_idx < config.num_inputs) { + const scalar_t* input_slice = (const scalar_t*)((const char*)src + base_offsets1); + value = thread_reduce(input_slice); + } + + if (config.should_block_y_reduce()) { + value = block_y_reduce(value, shared_memory); + } + if (config.should_block_x_reduce()) { + value = block_x_reduce(value, shared_memory); + } + + using out_ptr_vec_t = at::detail::Array; + using offset_vec_t = at::detail::Array; + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + arg_vec_t* acc = nullptr; + if (acc_buf != nullptr) { + size_t numerator = sizeof(arg_t); + size_t denominator = sizeof(out_scalar_t); + reduce_fraction(numerator, denominator); + acc = (arg_vec_t*)((char*)acc_buf + (base_offsets[0] * numerator / denominator)); + } + + if (config.should_global_reduce()) { + value = global_reduce(value, acc, shared_memory); + } else if (config.should_store(output_idx)) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output(out, value); + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < 
output_vec_size; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + template + C10_DEVICE at::detail::Array thread_reduce(const scalar_t* data) const { + if (config.vectorize_input) { + ZOOM_KERNEL_ASSERT(output_vec_size == 1); + // reduce at the header of input_slice where memory is not aligned, + // so that thread_reduce will have an aligned memory to work on. + return {input_vectorized_thread_reduce_impl(data)}; + } else { + index_t element_stride = input_calc.strides_[0][0] / sizeof(scalar_t); + bool is_contiguous = (input_calc.dims == 1 && element_stride == 1); + if (is_contiguous) { + return thread_reduce_impl(data, [](index_t idx) { return idx; }); + } else if (input_calc.dims == 1) { + return thread_reduce_impl(data, [&](index_t idx) { return idx * element_stride; }); + } else { + return thread_reduce_impl(data, [&](index_t idx) { return input_calc.get(idx)[0] / sizeof(scalar_t); }); + } + } + } + + C10_DEVICE arg_t input_vectorized_thread_reduce_impl(const scalar_t* data) const { + index_t end = config.num_inputs; + + // Handle the head of input slice where data is not aligned + arg_t value = ident; + constexpr int align_bytes = alignof(at::native::memory::aligned_vector); + constexpr int align_elements = align_bytes / sizeof(scalar_t); + int shift = ((uint64_t)data) % align_bytes / sizeof(scalar_t); + if (shift > 0) { + data -= shift; + end += shift; + if(threadIdx.x >= shift && threadIdx.x < align_elements && config.should_reduce_tail()){ + value = ops.reduce(value, c10::load(data + threadIdx.x), threadIdx.x - shift); + } + end -= align_elements; + data += align_elements; + shift = align_elements - shift; + } + + // Do the vectorized reduction + using load_t = at::native::memory::aligned_vector; + + index_t idx = config.input_idx(); + const index_t stride = config.step_input; + + // Multiple accumulators to remove dependency between unrolled loops. + arg_t value_list[input_vec_size]; + value_list[0] = value; + + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[i] = ident; + } + + while (idx * input_vec_size + input_vec_size - 1 < end) { + const auto values_vec = memory::load_vector(data, idx); + #pragma unroll + for (index_t i = 0; i < input_vec_size; i++) { + value_list[i] = ops.reduce(value_list[i], values_vec.val[i], shift + idx * input_vec_size + i); + } + idx += stride; + } + + // tail + index_t tail_start = end - end % input_vec_size; + if (config.should_reduce_tail()) { + int idx = tail_start + threadIdx.x; + if (idx < end) { + const auto value = c10::load(data + idx); + value_list[0] = ops.reduce(value_list[0], value, idx + shift); + } + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[0] = ops.combine(value_list[0], value_list[i]); + } + return value_list[0]; + } + + template + C10_DEVICE at::detail::Array thread_reduce_impl(const scalar_t* data_, offset_calc_t calc) const { + index_t idx = config.input_idx(); + const index_t end = config.num_inputs; + const index_t stride = config.step_input; + + using arg_vec_t = at::detail::Array; + using load_t = at::native::memory::aligned_vector; + + // Multiple accumulators to remove dependency between unrolled loops. 
+ arg_vec_t value_list[vt0]; + + #pragma unroll + for (int i = 0; i < vt0; i++) { + #pragma unroll + for (int j = 0; j < output_vec_size; j++) { + value_list[i][j] = ident; + } + } + + load_t values[vt0]; + + while (idx + (vt0 - 1) * stride < end) { + #pragma unroll + for (index_t i = 0; i < vt0; i++) { + const auto offset = calc(idx + i * stride) / output_vec_size; + values[i] = memory::load_vector(data_, offset); + } + #pragma unroll + for (index_t i = 0; i < vt0; i++) { + #pragma unroll + for (index_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = ops.reduce(value_list[i][j], values[i].val[j], idx + i * stride); + } + } + idx += stride * vt0; + } + + // tail + int idx_ = idx; + #pragma unroll + for (index_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + const auto offset = calc(idx) / output_vec_size; + values[i] = memory::load_vector(data_, offset); + idx += stride; + } + idx = idx_; + #pragma unroll + for (index_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + #pragma unroll + for (index_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = ops.reduce(value_list[i][j], values[i].val[j], idx); + } + idx += stride; + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < vt0; i++) { + #pragma unroll + for (index_t j = 0; j < output_vec_size; j++) { + value_list[0][j] = ops.combine(value_list[0][j], value_list[i][j]); + } + } + return value_list[0]; + } + + template + C10_DEVICE at::detail::Array block_x_reduce(at::detail::Array value, char* shared_memory) const { + using args_vec_t = at::detail::Array; + int dim_x = blockDim.x; + args_vec_t* shared = (args_vec_t*)shared_memory; + if (dim_x > warpSize) { + int address_base = threadIdx.x + threadIdx.y*blockDim.x; + shared[address_base] = value; + for (int offset = dim_x/2; offset >= warpSize; offset >>= 1) { + __syncthreads(); + if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) { + args_vec_t other = shared[address_base + offset]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine(value[i], other[i]); + } + shared[address_base] = value; + } + } + dim_x = warpSize; + } + + __syncthreads(); + + for (int offset = 1; offset < dim_x; offset <<= 1) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + arg_t other = ops.warp_shfl_down(value[i], offset); + value[i] = ops.combine(value[i], other); + } + } + return value; + } + + template + C10_DEVICE at::detail::Array block_y_reduce(at::detail::Array value, char* shared_memory) const { + using args_vec_t = at::detail::Array; + args_vec_t* shared = (args_vec_t*)shared_memory; + shared[config.shared_memory_offset(0)] = value; + for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { + __syncthreads(); + if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { + args_vec_t other = shared[config.shared_memory_offset(offset)]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine(value[i], other[i]); + } + shared[config.shared_memory_offset(0)] = value; + } + } + return value; + } + + C10_DEVICE bool mark_block_finished() const { + __shared__ bool is_last_block_done_shared; + + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0) { + int prev_blocks_finished = atomicAdd(&semaphores[blockIdx.x], 1); + is_last_block_done_shared = (prev_blocks_finished == gridDim.y - 1); + } + + __syncthreads(); + + return is_last_block_done_shared; + } + + template + C10_DEVICE at::detail::Array accumulate_in_output( + at::detail::Array out, + 
at::detail::Array value, + typename std::enable_if::type* = nullptr + ) const { + at::detail::Array ret; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + ret[i] = ops.combine(*(out[i]), value[i]); + } + return ret; + } + + template + C10_DEVICE out_scalar_t get_accumulated_output( + out_scalar_t* out, arg_t value, + typename std::enable_if::type* = nullptr + ) const { + ZOOM_KERNEL_ASSERT(!final_output); + return (out_scalar_t)value; + } + + // This function should never be called -- + // it's the version of `accumulate_in_output` + // when accumulation in the output is not possible. + template + C10_DEVICE at::detail::Array accumulate_in_output( + at::detail::Array, + at::detail::Array, + typename std::enable_if::type* = nullptr + ) const { + ZOOM_KERNEL_ASSERT(false); + return arg_t {}; + } + + // This function should never be called -- + // it's the version of `get_accumulated_output` + // when accumulation in the output is not possible. + template + C10_DEVICE out_scalar_t get_accumulated_output( + out_scalar_t* out, arg_t value, + typename std::enable_if::type* = nullptr + ) const { + ZOOM_KERNEL_ASSERT(false); + return *out; + } + + template + C10_DEVICE void set_results(const T x, const index_t base_offset) const { + ZOOM_KERNEL_ASSERT(noutputs == 1); + auto res = (out_scalar_t*)((char*)dst[0] + base_offset); + *res = x; + } + + //Currently implemented for max of two outputs + template + C10_DEVICE void set_results(const thrust::pair x, const index_t base_offset) const { + if (noutputs >= 1) { + auto res0 = (T1*)((char*)dst[0] + base_offset); + *res0 = x.first; + } + if (noutputs >= 2) { + // base offset is computed assuming element size being sizeof(T1), so we need to make a + // correction to obtain the correct base offset + auto res1 = (T2*) ((char *) dst[1] + base_offset / sizeof(T1) * sizeof(T2)); + *res1 = x.second; + } + } + + template + C10_DEVICE void set_results_to_output(at::detail::Array value, at::detail::Array base_offset) const { + ZOOM_KERNEL_ASSERT(final_output); + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + set_results(ops.project(value[i]), base_offset[i]); + } + } + + template + C10_DEVICE at::detail::Array global_reduce(at::detail::Array value, at::detail::Array *acc, char* shared_memory) const { + using arg_vec_t = at::detail::Array; + using out_ptr_vec_t = at::detail::Array; + using offset_vec_t = at::detail::Array; + + arg_vec_t* reduce_buffer = (arg_vec_t*)cta_buf; + index_t output_idx = config.output_idx(); + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + bool should_store = config.should_store(output_idx); + if (should_store) { + index_t offset = config.staging_memory_offset(blockIdx.y); + reduce_buffer[offset] = value; + } + + __threadfence(); // make sure writes are globally visible + __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done + bool is_last_block_done = mark_block_finished(); + + if (is_last_block_done) { + value = ident; + if (config.should_block_x_reduce()) { + index_t input_offset = threadIdx.x + threadIdx.y * blockDim.x; + index_t step = blockDim.x * blockDim.y; + for (; input_offset < config.ctas_per_output; input_offset += step) { + index_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < 
output_vec_size; i++) { + value[i] = ops.combine(value[i], next[i]); + } + } + } else { + index_t input_offset = threadIdx.y; + index_t step = blockDim.y; + for (; input_offset < config.ctas_per_output; input_offset += step) { + index_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine(value[i], next[i]); + } + } + } + value = block_y_reduce(value, shared_memory); + if (config.should_block_x_reduce()) { + value = block_x_reduce(value, shared_memory); + } + if (should_store) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output(out, value); + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + return value; + } +}; + +template +static void launch_reduce_kernel(const ReduceConfig& config, const R& reduction) { + dim3 block = config.block(); + dim3 grid = config.grid(); + + auto stream = c10::zoom::getCurrentZoomStream(); + int shared_memory = config.shared_memory_size(); + + switch(config.output_vec_size) { + case 4: + reduce_kernel<<>>(reduction); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + break; + case 2: + reduce_kernel<<>>(reduction); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + break; + default: + reduce_kernel<<>>(reduction); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } +} + +inline void launch_jitted_reduce_kernel( + std::mutex &jiterator_mutex, + std::array &fn_cache, + const at::zoom::jit::KernelDescriptor &desc, + int vt0, const ReduceConfig& config, void *reduction) { + dim3 block = config.block(); + dim3 grid = config.grid(); + + int shared_memory = config.shared_memory_size(); + at::zoom::jit::hiprtcFunction* fn_ptr; + switch(config.output_vec_size) { + case 4: + fn_ptr = &fn_cache[0]; + break; + case 2: + fn_ptr = &fn_cache[1]; + break; + default: + fn_ptr = &fn_cache[2]; + } + if (!fn_ptr->function) { + int max_threads_codegen = + max_reduce_threads(desc.f_inputs_type) / config.output_vec_size; + auto code = at::zoom::jit::generate_reduction_code( + desc, vt0, true, false, config.output_vec_size, max_threads_codegen); + + *fn_ptr = at::zoom::jit::jit_pwise_function(code, "reduction_" + desc.name); + } + constexpr int kernel_args = 1; + void* args[kernel_args]; + args[0] = reduction; + at::zoom::jit::launch_jitted_pwise_function(*fn_ptr, args, grid, block, shared_memory); +} + + +class AccumulationBuffer { + public: + AccumulationBuffer() {} + + AccumulationBuffer(size_t acc_t_size, size_t out_t_size, char* out_ptr, int64_t size) { + out_ptr_ = (char*)out_ptr; + if (out_t_size >= acc_t_size) { + // reusing output buffer for accumulation. 
+ acc_ptr_ = (char*)out_ptr; + numerator_ = 1; + denominator_ = 1; + } else { + auto& allocator = *c10::zoom::ZoomCachingAllocator::get(); + buffer_ = allocator.allocate(size); + acc_ptr_ = (char*)buffer_.get(); + numerator_ = acc_t_size; + denominator_ = out_t_size; + reduce_fraction(numerator_, denominator_); + } + } + + char* get_acc_slice(char* out_ptr) { + if (acc_ptr_ == nullptr) { + return nullptr; + } + return acc_ptr_ + ((out_ptr - out_ptr_) * numerator_ / denominator_); + } + + private: + char* acc_ptr_ = nullptr; + char* out_ptr_ = nullptr; + size_t numerator_; + size_t denominator_; + at::DataPtr buffer_; +}; + +template +int get_output_vec_size(const TensorIterator &iter) { + int vec_size = 4; + auto update_vec_size = [&vec_size](uint64_t n) { + while(n % vec_size != 0) { + vec_size /= 2; + } + }; + + uint64_t base_address = reinterpret_cast(iter.data_ptr(iter.noutputs())) / sizeof(scalar_t); + update_vec_size(base_address); + + const int output_index = iter.num_reduce_dims(); + update_vec_size(iter.shape()[output_index]); + + int j = 0; + for(auto i : iter.strides(iter.noutputs())) { + if (j != output_index) { + update_vec_size(i / sizeof(scalar_t)); + } + j++; + } + return vec_size; +} + +template +ReduceConfig setReduceConfig(const TensorIterator& iter){ + // Start by assuming that each thread handles a single output and all + // the inputs for that output. + int64_t num_outputs = iter.num_output_elements(); + int64_t inputs_per_output = iter.numel() / num_outputs; + int input_index = iter.ntensors() - 1; + + auto config = ReduceConfig(sizeof(arg_t), num_outputs, inputs_per_output); + + int64_t dim0; + int64_t dim1; + int64_t fastest_moving_stride; + bool reduction_on_fastest_striding_dimension; + + if (iter.ndim() > 0) { + // Adjust block size to map block width to fastest changing dimension of input + // tensor. This grants the best possible memory accessing pattern, given that + // for non-contiguous tensor with space in between, we cannot have perfect + // memory coalescing. + reduction_on_fastest_striding_dimension = + (iter.num_reduce_dims() == iter.ndim()) || + (iter.strides(/*arg=*/input_index)[0] < + iter.strides(/*arg=*/input_index)[iter.num_reduce_dims()]); + // Notice that dim0 & dim1 does NOT guarantee any launch configuration here! + // dim0 & dim1 are more like the upper bound of the block dimension. The + // actual launch config and reduction scheme is determined by setting values + // to `config.input_mult` and `config.output_mult`. + // We try to max out dim1 so that we have enough threads per CTA to deliver + // performance for larger problem size. + if (reduction_on_fastest_striding_dimension) { + // Map block.x to the fastest reducing dimension. It implies: + // 1. block_x_reduce is required. + // 2. block.y now max out to num_outputs. + dim0 = inputs_per_output; + dim1 = num_outputs; + fastest_moving_stride = iter.strides(/*arg=*/input_index)[0]; + } else { + // Map block.x to the fastest non reducing dimension. It implies: + // 1. block_x_reduce is turned off. + // 2. block.y now max out to inputs_per_output. + dim0 = num_outputs; + dim1 = inputs_per_output; + fastest_moving_stride = iter.strides(/*arg=*/input_index)[iter.num_reduce_dims()]; + } + } else { + reduction_on_fastest_striding_dimension = true; + fastest_moving_stride = sizeof(scalar_t); + dim0 = 1; + dim1 = 1; + } + + // We do vectorization to gain better memory access, there are two cases which we call + // "vectorize along input" and "vectorize along output". 
Note that the "input/output" + // here does not mean we are vectorizing load/store instructions. We always only vectorize + // load instructions. + // + // Case 1: "vectorize along input" + // This case happens when we are reducing along fastest moving dimesion. In such case, threads + // with the same threadIdx.y works on the same reduction cooperatively and will produce results + // for the same output. In such case, values in each loaded vector always correspond to the same output. + // + // Case 2: "vectorize along output" + // This case happens when the fastest moving dimesion is not the dimension of reduction. In such case, + // threads with different threadIdx.x are independent and will produce results for different outputs. + // In such case, values in each loaded vector always correspond to different outputs. + if (fastest_moving_stride == sizeof(scalar_t)) { + if (reduction_on_fastest_striding_dimension && dim0 > 128 && iter.num_reduce_dims() == 1 && vt0 >= ReduceConfig::input_vec_size) { + // Case 1: "vectorize along input" + // Note that if vt0 < ReduceConfig::vec_size, then this means the register pressure could be high, in such case, + // we should avoid vectorization. + config.vectorize_input = true; + dim0 /= config.input_vec_size; + } else if (!reduction_on_fastest_striding_dimension) { + // Case 2: "vectorize along output" + config.output_vec_size = get_output_vec_size(iter); + dim0 /= config.output_vec_size; + } + } + + // Adjust block_width and block_height + config.set_block_dimension(dim0, dim1); + + int block_width = config.block_width; + int block_height = config.block_height; + + if (iter.ndim() == 0 || reduction_on_fastest_striding_dimension) { + // Split the input across lanes if the input is contiguous in the reduced + // dimension. This will require reduction between threads using warp + // shuffle instructions and shared memory (if block_width > warpSize). + config.input_mult[0] = config.split_input(block_width); + } else { + // Otherwise split the output across lanes in a warp. + config.output_mult[0] = config.split_output(block_width); + } + + constexpr int min_values_per_thread = 16; + constexpr int max_values_per_thread = 256; + + if (config.values_per_thread() >= block_height * 16 || config.values_per_thread() >= max_values_per_thread) { + // Divide the input across warps in a thread-block, if that leaves at least + // 16 elements to be summed by each thread. This will require inter-warp + // reduction using shared memory. + config.input_mult[1] = config.split_input(block_height); + } else { + // Otherwise, each warp handles a separate output. + config.output_mult[1] = config.split_output(block_height); + } + + const int blocks_per_sm = at::zoom::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / config.num_threads; + const int num_mp = at::zoom::getCurrentDeviceProperties()->multiProcessorCount; + const int target_grid_size = num_mp * blocks_per_sm; + int grid = config.grid().x; + if (config.input_mult[1] != 0 && config.values_per_thread() >= max_values_per_thread && grid <= target_grid_size) { + // Divide the input across thread-blocks if the amount of work per-thread + // is large enough and the size of the output is small enough. This will + // require a reduction using global memory. + // If we decide to split input across blocks, as long as we can get enough + // number of blocks (`target_grid_size`) to balance SM, we should still + // make the number of values per thread large for best performance. 
+ int ctas_per_output1 = div_up(target_grid_size, grid); + int ctas_per_output2 = div_up(config.values_per_thread(), min_values_per_thread); + int ctas_per_output3 = div_up(config.values_per_thread(), max_values_per_thread); + // We want the minimum of ctas_per_output1 and ctas_per_output2, so that each thread can have + // a large number of values to deal with. But we don't want values_per_thread to be larger than + // max_values_per_thread + config.ctas_per_output = std::max(std::min(ctas_per_output1, ctas_per_output2), ctas_per_output3); + if (config.ctas_per_output > 1) { + config.input_mult[2] = config.split_input(config.ctas_per_output); + } + } + return config; +}; + +template +inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t ident=0, + AccumulationBuffer* acc_buf_ptr=nullptr, int64_t base_idx=0) { + AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1); + + using traits = function_traits; + using arg_t = typename traits::template arg<0>::type; + // at::Half/at::ComplexHalf overflow easily because their range is very small. + // So when scalar_t and out_scalar_t are at::Half/at::ComplexHalf, we + // set can_accumulate_in_output to False. + static constexpr bool is_inp_out_type_half_or_chalf = + (std::is_same::value && + std::is_same::value) || + (std::is_same, scalar_t>::value && + std::is_same, out_scalar_t>::value); + // at::BFloat16 has lower precision and can lead to rounding errors. + // So when scalar_t and out_scalar_t are at::BFloat16, we + // set can_accumulate_in_output to False. + static constexpr bool is_inp_out_type_bfloat16 = + (std::is_same::value && + std::is_same::value); + static constexpr bool can_accumulate_in_output = + std::is_convertible::value && + !(is_inp_out_type_half_or_chalf || is_inp_out_type_bfloat16); + + bool can_use_32bit_indexing = iter.can_use_32bit_indexing(); + std::unique_ptr owned_buf_ptr; + // The acc_buf_ptr is a shared pointer. It is created on the first call and + // reused by all recursive function calls. + if (acc_buf_ptr == NULL) { + // acc_buf_ptr holds the buffer used for accumulation across multiple sub_iters + // when accumulation in the output is not possible.
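+ // The temporary is sized to cover the furthest-reachable output element (max over dims of + // shape * stride, converted from bytes to an element count) times sizeof(arg_t), so each + // sub-iterator accumulates in arg_t precision and only the final result is written as out_scalar_t.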
+ if (!can_accumulate_in_output && !can_use_32bit_indexing) { + int64_t output_memory_size = iter.element_size(0); + for (int dim = 0; dim < iter.ndim(); dim++) { + output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]); + } + output_memory_size /= iter.element_size(0); //iter.strides is in bytes + owned_buf_ptr.reset(new AccumulationBuffer(sizeof(arg_t), + sizeof(out_scalar_t), + (char*) iter.data_ptr(0), + output_memory_size * sizeof(arg_t))); + } else { + owned_buf_ptr.reset(new AccumulationBuffer()); + } + acc_buf_ptr = owned_buf_ptr.get(); + } + + if (!can_use_32bit_indexing) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + int64_t sub_iter_base_idx = sub_iter.view_offsets()[0]; + + gpu_reduce_kernel(sub_iter, ops, ident, + acc_buf_ptr, sub_iter_base_idx); + } + return; + } + + const char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1); + char* out_data = (char*)iter.data_ptr(0); + const auto noutputs = iter.noutputs(); + optional out_data_extra; + if (noutputs > 1) { + out_data_extra = (char*)iter.data_ptr(1); + } else { + out_data_extra = nullopt; + } + char* acc_data = acc_buf_ptr->get_acc_slice(out_data); + + ReduceConfig config = setReduceConfig(iter); + at::DataPtr buffer; + at::DataPtr semaphores; + if (config.should_global_reduce()) { + auto& allocator = *c10::zoom::ZoomCachingAllocator::get(); + buffer = allocator.allocate(config.global_memory_size()); + semaphores = allocator.allocate(config.semaphore_size()); + + auto stream = c10::zoom::getCurrentZoomStream(); + C10_ZOOM_CHECK(hipMemsetAsync(semaphores.get(), 0, config.semaphore_size(), stream)); + } + + AT_ASSERT(can_use_32bit_indexing); + auto output_calc = make_output_calculator(iter); + auto input_calc = make_input_calculator(iter); + auto reduce = ReduceOp( + ops, + config, + input_calc, + output_calc, + in_data, + out_data, + out_data_extra, + acc_data, + buffer.get(), + (int*)semaphores.get(), + ident, + noutputs, + base_idx); + reduce.accumulate = iter.should_accumulate(); + reduce.final_output = iter.is_final_output(); + + launch_reduce_kernel::MAX_NUM_THREADS>(config, reduce); +} + +//TODO this is 100 lines of almost-copy-paste, because we have to have different template args for this function +//try unifying with gpu_reduce_kernel +template +inline void jitted_gpu_reduce_kernel(TensorIterator& iter, const std::string& func, ident_t ident=0, + AccumulationBuffer* acc_buf_ptr=nullptr, int64_t base_idx=0) { + AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1); + + //TODO - this will be different for more complicated reductions, but for now reductions using + //func_wrapper all have arg_t = opmath + using arg_t = at::opmath_type; + // at::Half/at::ComplexHalf overflows easily as it's range is very small. + // So when scalar_t and out_scalar_t are at::Half/at::ComplexHalf, we + // set can_accumulate_in_output to False. + static constexpr bool is_inp_out_type_half_or_chalf = + (std::is_same::value && + std::is_same::value) || + (std::is_same, scalar_t>::value && + std::is_same, out_scalar_t>::value); + // at::BFloat16 has lower precision and can lead to rounding errors. + // So when scalar_t and out_scalar_t are at::BFloat16, we + // set can_accumulate_in_output to False. 
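+ // (bfloat16 keeps only an 8-bit significand, so accumulating partial results through a bfloat16 + // output would compound rounding error, which is why it is excluded here as well.)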
+ static constexpr bool is_inp_out_type_bfloat16 = + (std::is_same::value && + std::is_same::value); + static constexpr bool can_accumulate_in_output = + std::is_convertible::value && + !(is_inp_out_type_half_or_chalf || is_inp_out_type_bfloat16); + + bool can_use_32bit_indexing = iter.can_use_32bit_indexing(); + std::unique_ptr owned_buf_ptr; + + // The acc_buf_ptr is a shared pointer. It is create at the first entrance and + // reused by all recursive function calls. + if (acc_buf_ptr == NULL) { + // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter + // when accumulation in output is not possible. + if (!can_accumulate_in_output && !can_use_32bit_indexing) { + int64_t output_memory_size = iter.element_size(0); + for (int dim = 0; dim < iter.ndim(); dim++) { + output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]); + } + output_memory_size /= iter.element_size(0); //iter.strides is in bytes + owned_buf_ptr.reset(new AccumulationBuffer(sizeof(out_scalar_t), //TODO + sizeof(out_scalar_t), + (char*) iter.data_ptr(0), + output_memory_size * sizeof(out_scalar_t))); //TODO + } else { + owned_buf_ptr.reset(new AccumulationBuffer()); + } + acc_buf_ptr = owned_buf_ptr.get(); + } + + if (!can_use_32bit_indexing) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + int64_t sub_iter_base_idx = sub_iter.view_offsets()[0]; + + jitted_gpu_reduce_kernel(sub_iter, func, ident, + acc_buf_ptr, sub_iter_base_idx); + } + return; + } + + //TODO - for now we support a single input, we may be able to relax this constraint + const char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1); + char* out_data = (char*)iter.data_ptr(0); + const auto noutputs = iter.noutputs(); + optional out_data_extra; + if (noutputs > 1) { + out_data_extra = (char*)iter.data_ptr(1); + } else { + out_data_extra = nullopt; + } + char* acc_data = acc_buf_ptr->get_acc_slice(out_data); + + ReduceConfig config = setReduceConfig(iter); + + at::DataPtr buffer; + at::DataPtr semaphores; + if (config.should_global_reduce()) { + auto& allocator = *c10::zoom::ZoomCachingAllocator::get(); + buffer = allocator.allocate(config.global_memory_size()); + semaphores = allocator.allocate(config.semaphore_size()); + + auto stream = c10::zoom::getCurrentZoomStream(); + C10_ZOOM_CHECK(hipMemsetAsync(semaphores.get(), 0, config.semaphore_size(), stream)); + } + + AT_ASSERT(can_use_32bit_indexing); + auto output_calc = make_output_calculator(iter); + auto input_calc = make_input_calculator(iter); + auto reduce = ReduceJitOp( + config, + input_calc, + output_calc, + in_data, + out_data, + out_data_extra, + acc_data, + buffer.get(), + (int*)semaphores.get(), + ident, + noutputs, + base_idx); + reduce.accumulate = iter.should_accumulate(); + reduce.final_output = iter.is_final_output(); + + constexpr int nInputs = 1; + constexpr int nOutputs = 1; + static auto desc = at::zoom::jit::make_kernel_descriptor< + out_scalar_t, scalar_t>(name, func, nInputs, nOutputs); + + static std::mutex jiterator_mutex; + static std::vector> fn_cache(c10::zoom::device_count()); + auto &cache = fn_cache[iter.device().index()]; + + launch_jitted_reduce_kernel( + jiterator_mutex, cache, desc, vt0, config, &reduce); +} + +}} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/ReduceLogicKernel.cu b/aten/src/ATen/native/zoom/ReduceLogicKernel.cu new file mode 100644 index 00000000000000..fb6bb731781358 --- /dev/null +++ b/aten/src/ATen/native/zoom/ReduceLogicKernel.cu @@ -0,0 +1,38 
@@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include + +namespace at::native { + +void and_kernel_zoom(TensorIterator& iter) { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + kHalf, kBFloat16, kBool, iter.common_dtype(), "and_zoom", [&]() { + gpu_reduce_kernel( + iter, + func_wrapper([] GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { + return (static_cast(a) && static_cast(b)); + }), + true); + }); +} + +void or_kernel_zoom(TensorIterator& iter) { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + kHalf, kBFloat16, kBool, iter.common_dtype(), "or_zoom", [&]() { + gpu_reduce_kernel( + iter, + func_wrapper([] GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { + return (static_cast(a) || static_cast(b)); + }), + false); + }); +} + +REGISTER_PRIVATEUSE1_DISPATCH(and_stub, &and_kernel_zoom); +REGISTER_PRIVATEUSE1_DISPATCH(or_stub, &or_kernel_zoom); + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/ScanKernels.cpp b/aten/src/ATen/native/zoom/ScanKernels.cpp new file mode 100644 index 00000000000000..3bd21f18615d6a --- /dev/null +++ b/aten/src/ATen/native/zoom/ScanKernels.cpp @@ -0,0 +1,115 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include + +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + +namespace at::native { + +static c10::MaybeOwned contiguous_out_arg(const Tensor &tensor) { + if (tensor.is_contiguous()) { + return c10::MaybeOwned::borrowed(tensor); + } + return c10::MaybeOwned::owned(at::empty(tensor.sizes(), tensor.options())); +} + +void cummax_helper_zoom(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim) { + TensorArg output_arg{ values, "output", 1 }; + TensorArg indices_arg{ indices, "indices", 2 }; + TensorArg input_arg{ self, "input", 3 }; + checkAllSameGPU(__func__, {output_arg, indices_arg, input_arg}); + + auto values_ = contiguous_out_arg(values); + auto indices_ = contiguous_out_arg(indices); + launch_cummax_zoom_kernel(self, *values_, *indices_, dim); + if (!values.is_same(*values_)) { + values.copy_(*values_); + } + if (!indices.is_same(*indices_)) { + indices.copy_(*indices_); + } +} + +void cummin_helper_zoom(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim) { + TensorArg output_arg{ values, "output", 1 }; + TensorArg indices_arg{ indices, "indices", 2 }; + TensorArg input_arg{ self, "input", 3 }; + checkAllSameGPU(__func__, {output_arg, indices_arg, input_arg}); + + auto values_ = contiguous_out_arg(values); + auto indices_ = contiguous_out_arg(indices); + launch_cummin_zoom_kernel(self, *values_, *indices_, dim); + if (!values.is_same(*values_)) { + values.copy_(*values_); + } + if (!indices.is_same(*indices_)) { + indices.copy_(*indices_); + } +} + +Tensor& _logcumsumexp_out_zoom(const Tensor& self, int64_t dim, Tensor& result) { + const auto wrap_dim = maybe_wrap_dim(dim, self.dim()); + result.resize_(self.sizes()); + if (self.dim() == 0) { + result.fill_(self); + return result; + } + if (self.numel() == 0) { + result.zero_(); + return result; + } + + TensorArg output_arg{ result, "output", 1 }; + TensorArg input_arg{ self, "input", 2 }; + checkAllSameGPU(__func__, {output_arg, input_arg}); + + auto result_ = contiguous_out_arg(result); + launch_logcumsumexp_zoom_kernel(*result_, self, wrap_dim); + if (!result.is_same(*result_)) { + result.copy_(*result_); + } + return result; +} + +Tensor _logcumsumexp_zoom(const Tensor& self, int64_t dim) { + Tensor result = 
at::empty_like(self, MemoryFormat::Contiguous); + return _logcumsumexp_out_zoom(self, dim, result); +} + +void cumsum_zoom_kernel(const Tensor& result, const Tensor& self, int64_t dim) { + if (self.is_floating_point() || self.is_complex()) { + // See Note [Writing Nondeterministic Operations] + // Issue reporting nondeterministic behavior: https://github.com/pytorch/pytorch/issues/75240 + globalContext().alertNotDeterministic("cumsum_zoom_kernel"); + } + auto result_ = contiguous_out_arg(result); + launch_cumsum_zoom_kernel(*result_, self, dim); + if (!result.is_same(*result_)) { + result.copy_(*result_); + } +} + +void cumprod_zoom_kernel(const Tensor& result, const Tensor& self, int64_t dim) { + auto result_ = contiguous_out_arg(result); + launch_cumprod_zoom_kernel(*result_, self, dim); + if (!result.is_same(*result_)) { + result.copy_(*result_); + } +} + +REGISTER_PRIVATEUSE1_DISPATCH(cumsum_stub, &cumsum_zoom_kernel); +REGISTER_PRIVATEUSE1_DISPATCH(cumprod_stub, &cumprod_zoom_kernel); + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/ScanKernels.h b/aten/src/ATen/native/zoom/ScanKernels.h new file mode 100644 index 00000000000000..f9a6f86f2c6ebe --- /dev/null +++ b/aten/src/ATen/native/zoom/ScanKernels.h @@ -0,0 +1,18 @@ +#pragma once +#include + +namespace at { +class TensorBase; + +namespace native { + +// NOTE: these functions require output tensors to be contiguous +void launch_cummax_zoom_kernel(const TensorBase& self, const TensorBase& values, + const TensorBase& indices, int64_t dim); +void launch_cummin_zoom_kernel(const TensorBase& self, const TensorBase& values, + const TensorBase& indices, int64_t dim); +void launch_logcumsumexp_zoom_kernel(const TensorBase& result, const TensorBase& self, int64_t dim); +void launch_cumsum_zoom_kernel(const TensorBase& result, const TensorBase& self, int64_t dim); +void launch_cumprod_zoom_kernel(const TensorBase& result, const TensorBase& self, int64_t dim); + +}} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/ScanUtils.cuh b/aten/src/ATen/native/zoom/ScanUtils.cuh new file mode 100644 index 00000000000000..2ff2970dce4dcd --- /dev/null +++ b/aten/src/ATen/native/zoom/ScanUtils.cuh @@ -0,0 +1,459 @@ +#pragma once +#include +#include +#include +#include + +#include +#include +#include + +namespace at { +namespace native { + +template +constexpr inline integer ceil_div(integer n, integer m) { + return (n + m - 1) / m; +} + +template +constexpr inline integer get_log_num_threads_x_inner_scan(integer num_rows, integer row_size) { + integer log_num_threads_x = 0; + integer log_num_threads_y = 0; + while (((integer)1 << log_num_threads_x) < row_size) { + ++log_num_threads_x; + } + while (((integer)1 << log_num_threads_y) < num_rows) { + ++log_num_threads_y; + } + // we want to keep the ratio between the x-threads and y-threads about the same as + // the ratio between the row_size and num_rows, but the total number of threads in + // a block should be about 512 + integer diff = log_num_threads_x - log_num_threads_y; + // 9 is from log2(512) + log_num_threads_x = ((integer)9 + diff) / (integer)2; + // I found that in having larger log_num_threads_x can give significant speed up in some cases, + // but detrimental in another case, so just keep the lower bound to be log2(16) == 4 to make it + // similar to the previous implementation + // Keeping the upper bound to be log2(512) == 9 as the maximum number of threads in a block. 
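+ // e.g. num_rows = 8, row_size = 1024: the while loops above give log_num_threads_x = 10 and + // log_num_threads_y = 3, so diff = 7 and (9 + 7) / 2 = 8; the clamp below keeps the result in [4, 9].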
+ log_num_threads_x = std::min(std::max((integer)4, log_num_threads_x), (integer)9); + return log_num_threads_x; +} + +template +__device__ void binary_op_update(const scalar_t lhs, scalar_t& rhs, const idx_t lhs_idx, idx_t& rhs_idx, BinaryOperation binary_op) { + if(!at::_isnan(rhs) && (at::_isnan(lhs) || !binary_op(rhs, lhs))) { + rhs = lhs; + rhs_idx = lhs_idx; + } +} +/* Perform an inclusive scan along the innermost dimension of a tensor. + * + * - num_rows is the size of the flattened outer dimensions; + * - row_size is the size of the innermost dimension; + * + * The outer dimensions of the tensor are considered as a single dimension, i.e. the tensor is + * considered as having 'num_rows' rows of size 'row_size'. + * Each thread block processes one or more sets of contiguous rows (processing multiple rows + * per thread block is quicker than processing a single row, especially for short rows). + */ +template +__global__ void tensor_kernel_scan_innermost_dim_with_indices(const scalar_t *self_, scalar_t *values_, int64_t *indices_, + int num_rows, int row_size, + const uint32_t num_threads, const uint32_t log_num_threads_x, + scalar_t init, BinaryFunction binary_op) { + // dynamic memory allocation for vbuf and ibuf + alignas(sizeof(double)) extern __shared__ char buf[]; + scalar_t* vbuf = reinterpret_cast(buf); // the size is num_threads * 2 + int64_t* ibuf = reinterpret_cast(vbuf + num_threads * 2); + const uint32_t num_threads_x = 1 << log_num_threads_x; + scalar_t* row_buf = vbuf + 2 * num_threads_x * threadIdx.y; + int64_t* row_idx_buf = ibuf + 2 * num_threads_x * threadIdx.y; + + for (int block_row = blockIdx.x * blockDim.y; + block_row < num_rows; + block_row += blockDim.y * gridDim.x) { + int row = block_row + threadIdx.y; + const scalar_t *row_self = self_ + row * row_size; + scalar_t *row_values = values_ + row * row_size; + int64_t *row_indices = indices_ + row * row_size; + scalar_t block_total = init; + int64_t block_idx_final = 0; + const bool row_exists = row < num_rows; + // Perform scan on one block at a time, keeping track of the total value of + // all blocks processed so far. + for (int block_col = 0; block_col < row_size; block_col += 2 * num_threads_x) { + // Load data into shared memory (two values per thread). + int col1 = block_col + threadIdx.x; + int col2 = block_col + num_threads_x + threadIdx.x; + if (row_exists) { + if (col1 < row_size) { + row_buf[threadIdx.x] = c10::load(&row_self[col1]); + row_idx_buf[threadIdx.x] = col1; + } else { + row_buf[threadIdx.x] = init; + // No need to set the index here as the value in init will never be selected + } + + if (col2 < row_size) { + row_buf[num_threads_x + threadIdx.x] = c10::load(&row_self[col2]); + row_idx_buf[num_threads_x + threadIdx.x] = col2; + } else { + row_buf[num_threads_x + threadIdx.x] = init; + // No need to set the index here as the value in init will never be selected + } + + // Add the total value of all previous blocks to the first value of this block. + if (threadIdx.x == 0) { + binary_op_update(block_total, row_buf[0], block_idx_final, row_idx_buf[0], binary_op); + } + } + __syncthreads(); + + // Parallel reduction with Sklansky method. 
The diagram can be seen on this paper: + // https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + for (uint32_t s = 1; s <= num_threads_x; s <<= 1) { + if (row_exists) { + uint32_t a = (threadIdx.x / s) * (2 * s) + s; + uint32_t ti = a + (threadIdx.x % s); + uint32_t si = a - 1; + binary_op_update(row_buf[si], row_buf[ti], row_idx_buf[si], row_idx_buf[ti], binary_op); + } + __syncthreads(); + } + + // Write back to output. + if (row_exists) { + if (col1 < row_size){ + row_values[col1] = row_buf[threadIdx.x]; + row_indices[col1] = row_idx_buf[threadIdx.x]; + } + if (col2 < row_size) { + row_values[col2] = row_buf[num_threads_x + threadIdx.x]; + row_indices[col2] = row_idx_buf[num_threads_x + threadIdx.x]; + } + } + block_total = row_buf[2 * num_threads_x - 1]; + block_idx_final = row_idx_buf[2 * num_threads_x - 1]; + __syncthreads(); + } + } +} + +/* Perform an inclusive scan along an outer dimension of a tensor. + * + * - num_orows is the size of the flattened outer dimensions; + * - num_irows is the size of the flattened inner dimensions; + * - row_size is the size of the dimension along which to compute the variance; + * + * The dimensions to the outside and inside of the specified dimension are considered as flattened. + * Thread blocks with the same blockIdx.y process an "outer row" (i.e. an element of the flattened + * outer dimensions, which contains several "inner rows"). + * Each thread processes a single inner row at a time. + */ +template +__global__ void tensor_kernel_scan_outer_dim_with_indices(const scalar_t *self_, scalar_t *values_, int64_t *indices_, + const uint32_t num_orows, const uint32_t num_irows, const uint32_t row_size, scalar_t init, BinaryFunction binary_op) { + for (uint32_t orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { + for (uint32_t irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) { + const scalar_t *self = self_ + orow * row_size * num_irows + irow; + scalar_t *values = values_ + orow * row_size * num_irows + irow; + int64_t *indices = indices_ + orow * row_size * num_irows + irow; + scalar_t out = init; + int64_t out_idx = 0; + + for (auto col = decltype(row_size){0}; col < row_size; ++col) { + const auto val = c10::load(self); + if(at::_isnan(val) || (!at::_isnan(out) && binary_op(val, out))) { + out = val; + out_idx = col; + } + *values = out; + *indices = out_idx; + self += num_irows; + values += num_irows; + indices += num_irows; + } + } + } +} + +inline void check_fits_in_unsigned(int64_t val, const char* name) { + constexpr auto umax = std::numeric_limits::max(); + TORCH_CHECK( + val >= 0 && val <= umax, name, " must fit in a 32-bit uint32_t value"); +} + + +template +__host__ void scan_outer_dim_with_indices( + const TensorBase& self, const TensorBase& values, const TensorBase& indices, + int dim, scalar_t init, BinaryFunction binary_op) { + int64_t row_size = self.size(dim); + auto sizes = self.sizes(); + + // Treat all outer dimensions (i.e. dim_ < dim) as one. + const int64_t num_orows = c10::multiply_integers(sizes.begin(), sizes.begin() + dim); + + // Treat all inner dimensions (i.e. dim > dimension) as one. 
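+ // e.g. a (2, 3, 4, 5) tensor scanned along dim = 1 gives num_orows = 2, row_size = 3, num_irows = 20.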
+ const int64_t num_irows = c10::multiply_integers(sizes.begin() + dim + 1, sizes.end()); + //for performance reasons, cuda kernels use uint32_t for loops over irows, orows and row, + //make sure that input is not bigger than supported by uint32_t + check_fits_in_unsigned(num_irows, "num_irows"); + check_fits_in_unsigned(num_orows, "num_orows"); + check_fits_in_unsigned(row_size, "row_size"); + + + dim3 threads(std::min(512, int(num_irows))); + int64_t maxGridDim = at::zoom::getCurrentDeviceProperties()->maxGridSize[1]; + dim3 grid(std::min(maxGridDim, num_orows), std::min(maxGridDim, ceil_div(num_irows, int64_t{threads.x}))); + tensor_kernel_scan_outer_dim_with_indices<<>>( + self.const_data_ptr(), values.mutable_data_ptr(), indices.mutable_data_ptr(), + num_orows, num_irows, row_size, init, binary_op); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); +} + +template +__host__ void scan_innermost_dim_with_indices( + const TensorBase& self, const TensorBase& values, const TensorBase& indices, + scalar_t init, BinaryFunction binary_op) { + int ndim = self.dim(); + // Treat all outer dimensions as a single dimension. + int row_size = self.size(ndim - 1); + int num_rows = self.numel() / row_size; + + // assuming max_num_threads per block is 512 + const uint32_t num_threads = 512; + const uint32_t log_num_threads_x = get_log_num_threads_x_inner_scan(num_rows, row_size); + const uint32_t num_threads_x = (1 << log_num_threads_x); + const uint32_t num_threads_y = num_threads / num_threads_x; + dim3 threads(num_threads_x, num_threads_y); + dim3 grid(std::min(at::zoom::getCurrentDeviceProperties()->maxGridSize[0], ceil_div(num_rows, int(threads.y)))); + + const uint32_t mem_size = 2 * num_threads * (sizeof(scalar_t) + sizeof(int64_t)); + tensor_kernel_scan_innermost_dim_with_indices<<>>( + self.const_data_ptr(), values.mutable_data_ptr(), indices.mutable_data_ptr(), + num_rows, row_size, num_threads, log_num_threads_x, init, binary_op); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); +} + +template +void scan_dim_with_indices(const TensorBase& self, const TensorBase& values, const TensorBase& indices, //int64_t dim) { + int64_t dim, scalar_t init, BinaryFunction binary_op) { + int ndim = self.dim(); + auto self_ = self.expect_contiguous(); + TORCH_INTERNAL_ASSERT(values.is_contiguous() && indices.is_contiguous()); + if (dim == ndim - 1) { + scan_innermost_dim_with_indices(*self_, values, indices, init, binary_op); + } else { + scan_outer_dim_with_indices(*self_, values, indices, dim, init, binary_op); + } +} + +// TODO: The implementation of `tensor_kernel_scan_outer_dim` and +// `tensor_kernel_scan_innermost_dim` is similar to +// `tensor_kernel_scan_outer_dim_with_indices` +// `tensor_kernel_scan_outer_dim_with_indices` and should be refactored to +// remove the duplication. + +/* Perform an inclusive scan along an outer dimension of a tensor. + * + * - num_orows is the size of the flattened outer dimensions; + * - num_irows is the size of the flattened inner dimensions; + * - row_size is the size of the dimension along which to scan; + * + * The dimensions to the outside and inside of the specified dimension are considered as flattened. + * Thread blocks with the same blockIdx.y process an "outer row" (i.e. an element of the flattened + * outer dimensions, which contains several "inner rows"). + * Each thread processes a single inner row at a time. 
+ */ +template +__global__ void tensor_kernel_scan_outer_dim(scalar_t *tgt_, const scalar_t *src_, + const uint32_t num_orows, const uint32_t num_irows, const uint32_t row_size, + const scalar_t init, BinaryOp binary_op) +{ + for (uint32_t orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { + for (uint32_t irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) { + const scalar_t *src = src_ + orow * row_size * num_irows + irow; + scalar_t *tgt = tgt_ + orow * row_size * num_irows + irow; + scalar_t acc = init; + + for (uint32_t col = 0; col < row_size; ++col) { + acc = binary_op(acc, c10::load(src)); + *tgt = acc; + + src += num_irows; + tgt += num_irows; + } + } + } +} + +/* Perform an inclusive scan along the innermost dimension of a tensor. + * + * - num_rows is the size of the flattened outer dimensions; + * - row_size is the size of the innermost dimension; + * + * The outer dimensions of the tensor are considered as a single dimension, i.e. the tensor is + * considered as having 'num_rows' rows of size 'row_size'. + * Each thread block processes one or more sets of contiguous rows (processing multiple rows + * per thread block is quicker than processing a single row, especially for short rows). + */ +template +__device__ void tensor_kernel_scan_innermost_dim_impl(T* row_buf, T *tgt_, const T *src_, + const uint32_t num_rows, const uint32_t row_size, + const uint32_t log_num_threads_x, + T init, BinaryFunction binary_op){ + const uint32_t num_threads_x = 1 << log_num_threads_x; + for (uint32_t block_row = blockIdx.x * blockDim.y; + block_row < num_rows; + block_row += blockDim.y * gridDim.x) { + uint32_t row = block_row + threadIdx.y; + T block_total = init; + + const T *row_src = src_ + row * row_size; + T *row_tgt = tgt_ + row * row_size; + const bool row_exists = row < num_rows; + + // Perform scan on one block at a time, keeping track of the total value of + // all blocks processed so far. + for (uint32_t block_col = 0; block_col < row_size; block_col += 2 * num_threads_x) { + // Load data into shared memory (two values per thread). + uint32_t col1 = block_col + threadIdx.x; + uint32_t col2 = block_col + num_threads_x + threadIdx.x; + if (row_exists) { + if (col1 < row_size) { + row_buf[threadIdx.x] = row_src[col1]; + } else { + row_buf[threadIdx.x] = init; + } + + if (col2 < row_size) { + row_buf[num_threads_x + threadIdx.x] = row_src[col2]; + } else { + row_buf[num_threads_x + threadIdx.x] = init; + } + + // Add the total value of all previous blocks to the first value of this block. + if (threadIdx.x == 0) { + row_buf[0] = binary_op(row_buf[0], block_total); + } + } + __syncthreads(); + + // Parallel reduction with Sklansky method. The diagram can be seen on this paper: + // https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + for (uint32_t m = 0; m <= log_num_threads_x; ++m) { + if (row_exists) { + uint32_t s = 1 << m; // s = 2 ^ m + uint32_t a = ((threadIdx.x >> m) << (m + 1)) | s; // a = (threadIdx.x / s) * (2 * s) + s + uint32_t ti = a + (threadIdx.x % s); + uint32_t si = a - 1; + row_buf[ti] = binary_op(row_buf[ti], row_buf[si]); + } + __syncthreads(); + } + + // Write back to output. 
+ if (row_exists) { + if (col1 < row_size) row_tgt[col1] = row_buf[threadIdx.x]; + if (col2 < row_size) row_tgt[col2] = row_buf[num_threads_x + threadIdx.x]; + } + block_total = row_buf[2 * num_threads_x - 1]; + __syncthreads(); + } + } +} + +template < + typename T, + class BinaryFunction> +__global__ void tensor_kernel_scan_innermost_dim( + T* tgt_, + const T* src_, + const uint32_t num_rows, + const uint32_t row_size, + const uint32_t log_num_threads_x, + T init, + BinaryFunction binary_op) { + alignas(sizeof(double)) extern __shared__ char sbuf[]; + T* sbuf2 = reinterpret_cast(sbuf); + const uint32_t num_threads_x = 1 << log_num_threads_x; + T* row_buf = reinterpret_cast(sbuf2 + num_threads_x * 2 * threadIdx.y); + + tensor_kernel_scan_innermost_dim_impl( + row_buf, tgt_, src_, num_rows, row_size, log_num_threads_x, init, binary_op); +} + + +template +__host__ void scan_outer_dim(const TensorBase& self, const TensorBase& result, + int dim, scalar_t init, BinaryFunction binary_op) { + const int64_t row_size = self.size(dim); + auto sizes = self.sizes(); + + // Treat all outer dimensions (i.e. dim_ < dim) as one. + const int64_t num_orows = c10::multiply_integers(sizes.begin(), sizes.begin() + dim); + + // Treat all inner dimensions (i.e. dim > dimension) as one. + const int64_t num_irows = c10::multiply_integers(sizes.begin() + dim + 1, sizes.end()); + + dim3 threads(std::min(512, int(num_irows))); + int64_t maxGridDim = at::zoom::getCurrentDeviceProperties()->maxGridSize[1]; + dim3 grid(std::min(maxGridDim, num_orows), std::min(maxGridDim, ceil_div(num_irows, int64_t{threads.x}))); + + check_fits_in_unsigned(num_irows, "num_irows"); + check_fits_in_unsigned(num_orows, "num_orows"); + check_fits_in_unsigned(row_size, "row_size"); + + tensor_kernel_scan_outer_dim<<>>( + result.mutable_data_ptr(), self.const_data_ptr(), + num_orows, num_irows, row_size, init, binary_op); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); +} + +template +void scan_innermost_dim(const TensorBase& self, const TensorBase& result, + scalar_t init, BinaryFunction binary_op) { + int64_t ndim = self.dim(); + // Treat all outer dimensions as a single dimension. 
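+  // Resulting launch shape, sketched under the 512-thread budget used below: since
+  // get_log_num_threads_x_inner_scan clamps log_num_threads_x to [4, 9], the block ranges
+  // from 16x32 threads (32 short rows scanned per block) to 512x1 threads (one long row
+  // scanned by the whole block). For a hypothetical input of shape [4, 5, 6] scanned along
+  // the last dimension, row_size = 6 and num_rows = 4 * 5 = 20.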
+ int64_t row_size = self.size(ndim - 1); + int64_t num_rows = self.numel() / row_size; + + // assuming max_num_threads per block is 512 + const uint32_t num_threads = 512; + const uint32_t log_num_threads_x = get_log_num_threads_x_inner_scan(num_rows, row_size); + const uint32_t num_threads_x = (1 << log_num_threads_x); + const uint32_t num_threads_y = num_threads / num_threads_x; + dim3 threads(num_threads_x, num_threads_y); + int64_t maxGridDim = at::zoom::getCurrentDeviceProperties()->maxGridSize[0]; + dim3 grid(std::min(maxGridDim, ceil_div(num_rows, int64_t{threads.y}))); + + check_fits_in_unsigned(num_rows, "Number of rows (self.numel()/self.size(self.dim()-1))"); + check_fits_in_unsigned(row_size, "row_size"); + + tensor_kernel_scan_innermost_dim<<>>( + result.mutable_data_ptr(), self.const_data_ptr(), + num_rows, row_size, log_num_threads_x, init, binary_op); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); +} + +template +void scan_dim(const TensorBase& self, const TensorBase& result, + int64_t dim, scalar_t init, BinaryFunction binary_op) { + int ndim = self.dim(); + auto self_ = self.expect_contiguous(); + TORCH_INTERNAL_ASSERT(result.is_contiguous()); + + if (self.numel() == self.size(dim)) { + zoom::hipcub::inclusive_scan(self_->const_data_ptr(), result.mutable_data_ptr(), binary_op, self.numel()); + } else if (dim == ndim - 1) { + scan_innermost_dim(*self_, result, init, binary_op); + } else { + scan_outer_dim(*self_, result, dim, init, binary_op); + } +} + +}} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/Shape.cu b/aten/src/ATen/native/zoom/Shape.cu new file mode 100644 index 00000000000000..58ed638ee6bf87 --- /dev/null +++ b/aten/src/ATen/native/zoom/Shape.cu @@ -0,0 +1,521 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + +namespace at::native { + +constexpr int CAT_ARRAY_BATCH_SIZE = 128; +constexpr int CAT_ARRAY_MAX_INPUT_DIMS = 4; +constexpr int ALIGNED_VEC_LOAD_BYTES = 16; + +namespace { + +inline bool is_aligned_vec4(const void* ptr) { + auto iptr = reinterpret_cast(ptr); + return !(iptr % alignof(int4)); +} + +inline bool getCatGrid(ptrdiff_t nTensors, dim3& grid) { + const int numSM = at::zoom::getCurrentDeviceProperties()->multiProcessorCount; + + // X dim of grid for cat array cooperates on a single tensor in the cat. + // Given half of the GPU, full utilization will always occur. + + // This will have cating two tensors fill the entire grid, but prevent + // many threads from needlessly load meta data if their sizes is small. + + grid = dim3( 2LL * numSM, (long long) nTensors ); + + return true; +} + +template +inline std::tuple getCatGridRocm(unsigned int max_elements_per_tensor, + ptrdiff_t nTensors) { + constexpr unsigned int threads_per_block = 256; + constexpr unsigned int elements_per_thread = 8; + constexpr unsigned int max_tb_per_sm = 32; + + unsigned int max_threads = ceil_div(max_elements_per_tensor, elements_per_thread); + unsigned int thread_blocks = ceil_div(max_threads, threads_per_block); + + // Limit the number of thread blocks to prevent too many threads to load the metadata + // if they operate on very small tensors. 
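+  // Worked example with illustrative numbers only: for max_elements_per_tensor = 1 << 20,
+  //   max_threads   = ceil_div(1048576, 8)  = 131072
+  //   thread_blocks = ceil_div(131072, 256) = 512
+  // The cap below then limits this to num_sm * 32, so a device with, say, 120 CUs would
+  // launch min(120 * 32, 512) = 512 blocks of 256 threads, with grid.y covering the
+  // tensors in the current batch.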
+ + const unsigned int num_sm = at::zoom::getCurrentDeviceProperties()->multiProcessorCount; + thread_blocks = std::min(num_sm * max_tb_per_sm, thread_blocks); + + dim3 block = dim3(threads_per_block); + dim3 grid = dim3(thread_blocks, (long long)nTensors); + + return std::make_tuple(grid, block); +} + +template +inline std::tuple getCatGridContig(unsigned int max_elements_per_tensor, + ptrdiff_t nTensors) { + constexpr unsigned int threads_per_block = 128; + constexpr unsigned int min_aligned_vec_per_thread = 1; + constexpr unsigned int max_tb_per_sm = 32; + + unsigned int elements_per_thread = ALIGNED_VEC_LOAD_BYTES / sizeof(T) * + min_aligned_vec_per_thread; + unsigned int max_threads = ceil_div(max_elements_per_tensor, elements_per_thread); + unsigned int thread_blocks = ceil_div(max_threads, threads_per_block); + + // Limit the number of thread blocks to prevent too many threads to load the metadata + // if they operate on very small tensors. + + const unsigned int num_sm = at::zoom::getCurrentDeviceProperties()->multiProcessorCount; + thread_blocks = std::min(num_sm * max_tb_per_sm, thread_blocks); + + dim3 block = dim3(threads_per_block); + dim3 grid = dim3(thread_blocks, (long long)nTensors); + + return std::make_tuple(grid, block); +} + +// Similar to any other IndexToOffset calculation for copying along a given +// dimension. +template +struct CatArrIndexToOffset { + static inline __device__ IndexType compute( + const IndexType tensorSize[Dims], + const IndexType tensorStride[Dims], + const IndexType dimSize, + const unsigned int concatDim, + IndexType linearIndex) { + // linearIndex is not really linear index, but instead the offset in + // input tensor. If the input tensor is contiguous, then this offset + // is the linear index, but if the input tensor is channels last, then + // it is the linear index of the permuted contiguous tensor + IndexType offset = 0; + + #pragma unroll + for (int i = Dims - 1; i >= 1; --i) { + IndexType curDimSize = i == concatDim ? dimSize : tensorSize[i]; + IndexType nextDimIndex = linearIndex / curDimSize; + IndexType curDimIndex = linearIndex - curDimSize * nextDimIndex; + IndexType curDimOffset = curDimIndex * tensorStride[i]; + offset += curDimOffset; + linearIndex = nextDimIndex; + } + + return offset + linearIndex * tensorStride[0]; + } +}; + +template +struct TensorSizeStride { + IndexType tensorSize[MaxDims]; + IndexType tensorStride[MaxDims]; +}; + +/** + * Kernel used to concatenated grimDim.y tensors into an output tensor. Uses a + * grid-stride loop based off of the blockIdx.x, threadIdx.x for each input to + * copy each element from each input tensor into the output. + * + * output: base pointer to the storage associated with the output tensor + * inputs: GPU-allocated array of input metadata for each input to concatenate + * in the kernel + * os: the size/stride vectors for the output tensor + * concatDim: dimension along which we are concatenating + * dimStride: the stride of the output tensor at the concatDim + * + * The most important assumption made is that the input tensors are contiguous. + */ + + +// pass meta data directly through kernel argument instead of pin memory +// In contiguous case, we will not need stride_size, setting it as 1 as placeholder +// to pass compile. 
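+// Rough size sketch (illustrative, assuming IndexType = unsigned int and a batch of
+// CAT_ARRAY_BATCH_SIZE = 128 inputs): the per-input arrays below occupy about
+//   128 * (8 + 4 + 4 + 4 + 1) = 2688 bytes ~ 2.7 KB
+// plus one TensorSizeStride entry in the contiguous case. That has to fit in the limited
+// kernel-argument space, which is why cat processes its inputs in batches instead of
+// passing metadata for every tensor in a single launch.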
+template +struct CatArrInputTensorMetadata { + const T* input[n]; + IndexType offset[n]; + IndexType dimSize[n]; + IndexType nElements[n]; + bool isContiguous[n]; + TensorSizeStride tensorStride[stride_size]; +}; + +template +__global__ void CatArrayBatchedCopy( + T* output, + CatArrInputTensorMetadata inputs, + TensorSizeStride os, + const int concatDim, + IndexType dimStride) { + + IndexType tid = blockIdx.x * blockDim.x + threadIdx.x; + IndexType nElements = inputs.nElements[blockIdx.y]; + TensorSizeStride ins = stride_size > 1 ? inputs.tensorStride[blockIdx.y] : inputs.tensorStride[0]; + bool isContig = inputs.isContiguous[blockIdx.y]; + + if(tid >= nElements) return; + + const T* data = inputs.input[blockIdx.y]; + IndexType offset = inputs.offset[blockIdx.y]; + IndexType dimSize = inputs.dimSize[blockIdx.y]; + IndexType dataOffset = offset * dimStride; + + IndexType stride = gridDim.x * blockDim.x; + + while( tid < nElements){ + IndexType elementOffset = CatArrIndexToOffset::compute( + os.tensorSize, os.tensorStride, dimSize, concatDim, tid); + if (isContig) { + output[dataOffset + elementOffset] = data[tid]; + } else { + IndexType inElementOffset = CatArrIndexToOffset::compute( + ins.tensorSize, ins.tensorStride, dimSize, concatDim, tid); + output[dataOffset + elementOffset] = data[inElementOffset]; + } + tid += stride; + } +} + +template +__global__ void CatArrayBatchedCopy_contig( + T* output, + CatArrInputTensorMetadata inputs, + TensorSizeStride os, + const int concatDim, + IndexType dimStride) { + + IndexType tid = blockIdx.x * blockDim.x + threadIdx.x; + IndexType nElements = inputs.nElements[blockIdx.y]; + + if(tid >= nElements) return; + + const T* data = inputs.input[blockIdx.y]; + IndexType offset = inputs.offset[blockIdx.y]; + IndexType dimSize = inputs.dimSize[blockIdx.y]; + IndexType dataOffset = offset * dimStride; + + IndexType stride = gridDim.x * blockDim.x; + + while( tid < nElements){ + IndexType elementOffset = CatArrIndexToOffset::compute( + os.tensorSize, os.tensorStride, dimSize, concatDim, tid); + output[dataOffset + elementOffset] = data[tid]; + tid += stride; + } +} + +/* + Specialized implementation of the CatArrayBatchedCopy written to generate wide memory loads + to improve memory bandwidth throughput. 
+*/ + +template +__global__ void CatArrayBatchedCopy_aligned16_contig( + T* output, + CatArrInputTensorMetadata inputs, + TensorSizeStride os, + const int concatDim, + IndexType dimStride) { + + // This kernel tries to use 128 bit loads + constexpr int kILP = ALIGNED_VEC_LOAD_BYTES / sizeof(T); + IndexType inputOffset = (blockIdx.x * blockDim.x + threadIdx.x) * kILP; + IndexType inputStride = gridDim.x * blockDim.x * kILP; + + IndexType nElements = inputs.nElements[blockIdx.y]; + if (inputOffset >= nElements) { + return; + } + + const T* data = inputs.input[blockIdx.y]; + IndexType offset = inputs.offset[blockIdx.y]; + IndexType dimSize = inputs.dimSize[blockIdx.y]; + IndexType dataOffset = offset * dimStride; + + IndexType v_elementOffset[kILP]; + T reg_data[kILP]; + + while (inputOffset + kILP <= nElements) { + for (int i = 0; i < kILP; ++i) { + v_elementOffset[i] = CatArrIndexToOffset::compute(os.tensorSize, + os.tensorStride, dimSize, concatDim, inputOffset + i); + } + + using LT = at::native::memory::aligned_vector; + ((LT*)reg_data)[0] = const_cast((LT*)(data + inputOffset))[0]; + + #pragma unroll + for (int i = 0; i < kILP; ++i) { + output[dataOffset + v_elementOffset[i]] = reg_data[i]; + } + + inputOffset += inputStride; + } + + // Handle remaining tail in case nElements does not divide + // exactly to kILP + + while (inputOffset < nElements) { + v_elementOffset[0] = CatArrIndexToOffset::compute(os.tensorSize, + os.tensorStride, dimSize, concatDim, inputOffset); + output[dataOffset + v_elementOffset[0]] = data[inputOffset]; + inputOffset++; + } +} + +template +void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, int64_t dimension, + int nDims, c10::MemoryFormat memory_format) { + // First, let's set up our kernel parameters. We start with a raw pointer to + // the storage for the output Tensor. + scalar_t *data = (scalar_t *)(out.mutable_data_ptr()); + CatArrInputTensorMetadata catMetaData; + TensorSizeStride outputParam; + + // Next, let's initialize the size, stride arrays for the output Tensor. + if (memory_format == c10::MemoryFormat::Contiguous) { + for (int i = 0; i < nDims; ++i) { + outputParam.tensorSize[i] = out.size(i); + outputParam.tensorStride[i] = out.stride(i); + } + } else if (memory_format == c10::MemoryFormat::ChannelsLast || memory_format == c10::MemoryFormat::ChannelsLast3d) { + // permute the semantics of dims from NCHW to NHWC so that the input + // tensor is now contiguous + outputParam.tensorSize[0] = out.size(0); + outputParam.tensorStride[0] = out.stride(0); + for (int i = 1; i < nDims - 1; ++i) { + outputParam.tensorSize[i] = out.size(i + 1); + outputParam.tensorStride[i] = out.stride(i + 1); + } + outputParam.tensorSize[nDims - 1] = out.size(1); + outputParam.tensorStride[nDims - 1] = out.stride(1); + } else { + TORCH_CHECK(false, "unsupported memory format"); + } + + c10::zoom::ZoomStream stream = c10::zoom::getCurrentZoomStream(); + + // If all batches are contiguous we can call a specialized implementation + // which requires the input tensor addresses to be aligned to a + // 16 Byte boundary. 
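+  // Vector width used by that specialization, derived from ALIGNED_VEC_LOAD_BYTES = 16:
+  // kILP = 16 / sizeof(T), i.e. four elements per 128-bit load for 4-byte types and two
+  // for 8-byte types; the dispatch below additionally requires 4 <= sizeof(scalar_t) <= 8.
+  // Note that isAligned is pinned to false further down ("On ROCm,
+  // CatArrayBatchedCopy_contig is faster"), so in this port the aligned16 kernel is
+  // effectively never selected and contiguous batches take CatArrayBatchedCopy_contig.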
+ + bool isContig = true; + bool isAligned = true; + unsigned int max_elements_per_tensor = 0; + + // Now we loop + int batchCounter = 0; + int64_t offset = 0; + for (unsigned i = 0; i < inputs.size() ; i += batch_size) { + for (batchCounter = 0; + batchCounter < batch_size && + (i+batchCounter) < inputs.size(); + ++batchCounter) { + int64_t dimSize = 0; + // There is a legacy case where a 1-D empty tensor can be concat with + // high-dimensional tensor + if (inputs[i+batchCounter].get().numel() > 0) { + dimSize = inputs[i+batchCounter].get().size(dimension); + } + + catMetaData.input[batchCounter] = (scalar_t*)(inputs[i+batchCounter].get().const_data_ptr()); + catMetaData.offset[batchCounter] = offset; + catMetaData.dimSize[batchCounter] = dimSize; + catMetaData.nElements[batchCounter] = inputs[i+batchCounter].get().numel(); + + // On ROCm, CatArrayBatchedCopy_contig is faster + isAligned = false; + + if (stride_size > 1) { + auto strides = inputs[i+batchCounter].get().strides(); + auto sizes = inputs[i+batchCounter].get().sizes(); + for(int j = 0; j < nDims; j++){ + catMetaData.tensorStride[batchCounter].tensorSize[j] = sizes[j]; + catMetaData.tensorStride[batchCounter].tensorStride[j] = strides[j]; + } + catMetaData.isContiguous[batchCounter] = false; + isContig = false; + } else { + catMetaData.isContiguous[batchCounter] = true; + } + + // Update offset + offset += dimSize; + + // We need max elements per tensor to compute grid parameters + max_elements_per_tensor = std::max(max_elements_per_tensor, + catMetaData.nElements[batchCounter]); + } + + // Skip if the tensor is empty. Otherwise, the grid dim is invalid + if (max_elements_per_tensor == 0) + continue; + + dim3 applyBlock, catGrid; + + // always base grid size on max_elements_per_tensor + { + std::tuple launchParams = getCatGridRocm( + max_elements_per_tensor, batchCounter); + catGrid = std::get<0>(launchParams); + applyBlock = std::get<1>(launchParams); + } + + if (memory_format != c10::MemoryFormat::Contiguous) { + switch (dimension) { + case 0: + break; + case 1: + dimension = nDims - dimension; + break; + default: + dimension--; + } + } + // Template Declarations for dim = 1, 2, 3, 4 +#define HANDLE_CASE(DIMS) \ + if (isContig && isAligned && sizeof(scalar_t) >= 4 && sizeof(scalar_t) <= 8) {\ + CatArrayBatchedCopy_aligned16_contig<<<\ + catGrid, applyBlock, 0, stream.stream()>>>(\ + data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\ + } else if (isContig) {\ + CatArrayBatchedCopy_contig<<<\ + catGrid, applyBlock, 0, stream.stream()>>>(\ + data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\ + } else {\ + CatArrayBatchedCopy<<<\ + catGrid, applyBlock, 0, stream.stream()>>>(\ + data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\ + }\ + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + switch (nDims) { + case 1: + HANDLE_CASE(1); + break; + case 2: + HANDLE_CASE(2); + break; + case 3: + HANDLE_CASE(3); + break; + case 4: + HANDLE_CASE(4); + break; + } +#undef HANDLE_CASE + } +} +// The kernels are templated on an opaque, self-aligned type of the correct +// size to avoid redundant kernels for different types of the same size. 
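+// Example of the de-duplication this buys: float and int32_t (and any other 4-byte dtype)
+// all dispatch as OpaqueType<4>, while double, int64_t and complex<float> dispatch as
+// OpaqueType<8>, so only one copy kernel is instantiated per element size rather than one
+// per scalar type.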
+template struct alignas(N) OpaqueType { char data[N]; }; + +} // namespace + +TORCH_IMPL_FUNC(cat_out_zoom) +(const ITensorListRef& tensors, + int64_t dim, + int64_t valid, + bool all_contiguous, + bool all_same_dtype, + bool all_same_sizes_and_stride, + MemoryFormat memory_format, + const Tensor& result) { + if (result.numel() == 0) { + return; + } + + auto materialized = tensors.materialize(); + + // We parallelize the copy if all 6 conditions pass: + // + // 1. There is more than one input tensor + // 2. The out tensor is 32-bit indexable + // 3. The number of dimensions is <= 4 + // 4. All input tensors are contiguous (output tensor may be non-contig) + // 5. All input tensors can use 32-bit indexing + + const bool all32BitIndexable = std::all_of(materialized.begin(), materialized.end(), + [] (const Tensor& t) { + return at::zoom::detail::canUse32BitIndexMath(t); + }); + + int nDims = materialized[valid].get().dim(); + + // We support the contiguous inputs and non-contiguous input (<=4 dims) in different ways + // For contiguous input, we don't need to pass stride meta data to cuda kernel through constant + // memory. Therefore, we could pass more inputs to cuda threads. + // For non-contiguous, we reduce the number of inputs passed to cuda kernel due to the limitation + // of constant memory. + + + + if (materialized.size() > 1 && + result.dim() <= CAT_ARRAY_MAX_INPUT_DIMS && + at::zoom::detail::canUse32BitIndexMath(result) && + all_contiguous && + all32BitIndexable && + all_same_dtype) { + if (isBitsType(result.scalar_type())) { + AT_DISPATCH_BIT_TYPES(result.scalar_type(), "cat_zoom", [&]() { + using dtype = OpaqueType; + parallel_cat(result, materialized, dim, nDims, memory_format); + }); + } else { + AT_DISPATCH_V2(result.scalar_type(), "cat_zoom", AT_WRAP([&]() { + using dtype = OpaqueType; + parallel_cat(result, materialized, dim, nDims, memory_format); + }), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kHalf, kBool, kBFloat16, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); + } + } else if (materialized.size() > 1 && + result.dim() <= CAT_ARRAY_MAX_INPUT_DIMS && + at::zoom::detail::canUse32BitIndexMath(result) && + nDims <= CAT_ARRAY_MAX_INPUT_DIMS && + all32BitIndexable && + all_same_dtype && + memory_format == c10::MemoryFormat::Contiguous) { + if (isBitsType(result.scalar_type())) { + AT_DISPATCH_BIT_TYPES(result.scalar_type(), "cat_zoom", [&]() { + using dtype = OpaqueType; + parallel_cat(result, materialized, dim, nDims, memory_format); + }); + } else { + AT_DISPATCH_V2(result.scalar_type(), "cat_zoom", AT_WRAP([&]() { + using dtype = OpaqueType; + parallel_cat(result, materialized, dim, nDims, memory_format); + }), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kHalf, kBool, kBFloat16, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); + } + } else { + int64_t offset = 0; + for (const Tensor& t : materialized) { + if (cat_should_skip_tensor(t)) continue; + int64_t dimSize = t.size(dim); + Tensor nt = at::narrow(result, dim, offset, dimSize); + copy_(nt, t); + offset += dimSize; + } + } +} + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/SoftMax.cu b/aten/src/ATen/native/zoom/SoftMax.cu new file mode 100644 index 00000000000000..2101dd42bb2d7a --- /dev/null +++ b/aten/src/ATen/native/zoom/SoftMax.cu @@ -0,0 +1,1272 @@ +// !!! This is a file automatically generated by hipify!!! 
+#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#endif + +namespace at::native { + +namespace { + +constexpr int ALIGN_BYTES = 16; + +template +struct LogSoftMaxForwardEpilogue { + __device__ __forceinline__ LogSoftMaxForwardEpilogue(AccumT max_input, AccumT sum) + : max_input(max_input), logsum(::log(sum)) {} + + __device__ __forceinline__ OutT operator()(T input) const { + return static_cast(input - max_input - logsum); +} + + const AccumT max_input; + const AccumT logsum; +}; + +template +struct LogSoftMaxBackwardEpilogue { + __device__ __forceinline__ LogSoftMaxBackwardEpilogue(AccumT sum) + : sum(sum) {} + + __device__ __forceinline__ T operator()(OutT gradOutput, OutT output) const { + return static_cast(gradOutput - ::exp(static_cast(output)) * sum); + } + + const AccumT sum; +}; + +template +struct SoftMaxForwardEpilogue { + __device__ __forceinline__ SoftMaxForwardEpilogue(AccumT max_input, AccumT sum) + : max_input(max_input) + , sum(sum) {} + + __device__ __forceinline__ OutT operator()(T input) const { + return static_cast(::exp(input - max_input) / sum); + } + + const AccumT max_input; + const AccumT sum; +}; + +template +struct SoftMaxBackwardEpilogue { + __device__ __forceinline__ SoftMaxBackwardEpilogue(AccumT sum) + : sum(sum) {} + + // XXX: gradOutput that we get here is really gradOutput * output + // Look for cmul in SoftMax_updateGradInput + __device__ __forceinline__ T operator()(OutT gradOutput, OutT output) const { + return static_cast(gradOutput - output * sum); + } + + const AccumT sum; +}; + + + + +//////////////////////////////////////////////////////////////////////////////// +// Spatial kernel (fast with large inner_size and small dim_size) +//////////////////////////////////////////////////////////////////////////////// +// Let's assume that our input has been flattened to have only three dimension: +// outer x dim x inner +// The spatial algorithm tries to parallelize along all of them. +// Within a 2d block threadIdx.y parallelizes over dim slices, and threads that +// share it will speed up reductions over dim (along axis x). +// The 2d grid is used to parallelize inner dimension over y axis and outer over x. 
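+// Concrete mapping (illustrative): for a softmax over dim == 1 of an NCHW input of shape
+// [N, C, H, W], the flattening gives outer_size = N, dim_size = C and inner_size = H * W.
+// A block of shape (dim_threads, inner_threads) then cooperates along C with its x
+// threads while its y threads cover distinct (h, w) positions, and the grid of shape
+// (outer_blocks, inner_blocks) tiles N and H * W respectively.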
+inline dim3 SpatialSoftMax_getGridSize( + dim3 block, uint32_t max_active_blocks, + uint64_t outer_size, uint64_t inner_size) { + // First, tile as many blocks as we can over the y axis + uint32_t inner_blocks = (inner_size + block.y - 1) / block.y; + if (inner_blocks > max_active_blocks) + inner_blocks = max_active_blocks; + // Fill the x axis with as many blocks as we can fit (a little more is ok too) + uint32_t outer_blocks = (max_active_blocks + inner_blocks - 1) / inner_blocks; + if (outer_blocks > outer_size) + outer_blocks = outer_size; + return dim3(outer_blocks, inner_blocks); +} + +const int max_threads = 1024; + +inline dim3 SpatialSoftMax_getBlockSize( + uint64_t dim_size, uint64_t inner_size) { + uint32_t inner_threads = inner_size; + inner_threads = ::min(inner_threads, static_cast(max_threads)); + uint32_t dim_threads = 1; + if (inner_threads <= 64 && dim_size >= 64) { + while (inner_threads * dim_threads <= max_threads && dim_threads <= dim_size) + dim_threads *= 2; + dim_threads /= 2; + } + return dim3(dim_threads, inner_threads); +} + + +template +void SpatialSoftMax_getLaunchSizes( + Kernel k, + uint64_t outer_size, uint64_t dim_size, uint64_t inner_size, + dim3& grid, dim3& block, uint32_t& smem_size) { + block = SpatialSoftMax_getBlockSize(dim_size, inner_size); + uint32_t block_threads = block.x * block.y; + smem_size = block.x == 1 ? 0 : block_threads * sizeof(accscalar_t); + int max_active_blocks; +#if defined(TORCH_HIP_VERSION) && TORCH_HIP_VERSION < 305 + // HIP function signature is not compatible yet. + uint32_t max_blocks; + hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, + k, block_threads, smem_size); + max_active_blocks = max_blocks; +#else + hipOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, + k, block_threads, smem_size); +#endif + max_active_blocks *= at::zoom::getCurrentDeviceProperties()->multiProcessorCount; + grid = SpatialSoftMax_getGridSize(block, max_active_blocks, outer_size, inner_size); +} + +inline dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) { + uint64_t block_size = 1; + uint64_t max_block_size = ::min(dim_size / ILP, static_cast(max_threads)); + + // In the vectorized case we want to trade off allowing more of the buffers to be accessed + // in a vectorized way against wanting a larger block size to get better utilisation. + // In general with ILP you can have (ILP-1)/ILP of the buffer accessed vectorised, at the risk + // of having a very small block size. We choose to keep >= 1/2 of the buffer vectorised while + // allowing a larger block size. + if (ILP > 1) { + max_block_size /= 2; + } + + while (block_size < (max_block_size)) block_size *= 2; + // Launch at least a single warp - the kernel assumes that. + block_size = ::max(block_size, static_cast(at::zoom::warp_size())); + return dim3(block_size); +} + +inline dim3 SoftMaxForward_getBlockSize(uint64_t dim_size) { + uint64_t block_size = 1; + uint64_t max_block_size = ::min(dim_size, static_cast(max_threads)); + + // We need a block size that is a multiple of C10_WARP_SIZE in order + // to perform block size reductions using warp shuffle instructions. + // Since max_threads is also a multiple of C10_WARPS_SIZE we do not + // risk creating a block size larger than the limit. 
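+  // Worked example (illustrative): with dim_size = 1000 and max_threads = 1024,
+  // max_block_size = 1000. On a 64-wide wavefront (the usual C10_WARP_SIZE for AMD CDNA
+  // devices) 1000 is not a multiple of 64, so the block is rounded up to
+  // (1000 / 64 + 1) * 64 = 1024 threads; a dim_size of 512 would be used as-is.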
+ + if (max_block_size % C10_WARP_SIZE == 0) { + block_size = max_block_size; + } else { + block_size = (max_block_size / C10_WARP_SIZE + 1) * C10_WARP_SIZE; + } + + return dim3(block_size); +} + +template +struct Add { + __device__ __forceinline__ T operator()(T a, T b) const { + return a + b; + } + + __device__ __forceinline__ T combine(T a, T b) const { + return a + b; + } + + // Needed to allow warp level reduction as a first step in the + // thread block reduction + __device__ __forceinline__ T warp_shfl_down(T data, int offset) const { + return WARP_SHFL_DOWN(data, offset); + } +}; + +template +struct Max { + __device__ __forceinline__ T operator()(T a, T b) const { + return a < b ? b : a; + } + + __device__ __forceinline__ T combine(T a, T b) const { + return a < b ? b : a; + } + + // Needed to allow warp level reduction as a first step in the + // thread block reduction + __device__ __forceinline__ T warp_shfl_down(T data, int offset) const { + return WARP_SHFL_DOWN(data, offset); + } +}; + +// Note that it's not a complete block-wide reduction. +// Only threads that share threadIdx.y reduce values. +template class ReduceOp> +__forceinline__ __device__ +T spatialBlockReduceX(T *shared, T val) { + ReduceOp r; + shared += threadIdx.y * blockDim.x; + + __syncthreads(); + + shared[threadIdx.x] = val; + + // NOTE: loop starts with __syncthreads() + int offset = blockDim.x / 2; + while (offset > 0) { + __syncthreads(); + if (threadIdx.x < offset) + shared[threadIdx.x] = r(shared[threadIdx.x], shared[threadIdx.x + offset]); + offset /= 2; + } + + __syncthreads(); + + return shared[0]; +} + +template class Epilogue> +__global__ void cunn_SpatialSoftMaxForward( + outscalar_t *output, const scalar_t *input, + index_t outer_size, index_t dim_size, index_t inner_size) +{ + extern __shared__ unsigned char smem[]; + auto sdata = reinterpret_cast(smem); + const index_t outer_stride = inner_size * dim_size; + const index_t dim_stride = inner_size; + + for (index_t outer_index = blockIdx.x; outer_index < outer_size; outer_index += gridDim.x) { + const index_t outer_offset = outer_index * outer_stride; + for (index_t inner_index = blockIdx.y * blockDim.y + threadIdx.y; inner_index < inner_size; inner_index += blockDim.y * gridDim.y) { + const index_t data_offset = outer_offset + inner_index; + //////////////////////////////////////////////////////////// + // These two blocks are really equivalent, but specializing on + // blockDim.x == 1 makes the kernel faster when it's unused. + // I didn't want to thread an extra template parameter, and nvcc + // seems to be smart enough to hoist the if outside of the loops. 
+ //////////////////////////////////////////////////////////// + + if (blockDim.x > 1) { + accscalar_t max_input = at::numeric_limits::lowest(); + for (index_t d = threadIdx.x; d < dim_size; d += blockDim.x) { + const accscalar_t value = static_cast(input[data_offset + d * dim_stride]); + max_input = Max()(max_input, value); + } + max_input = spatialBlockReduceX(sdata,max_input); + + accscalar_t sum = 0; + for (index_t d = threadIdx.x; d < dim_size; d += blockDim.x) + sum += ::exp(static_cast(input[data_offset + d * dim_stride]) + - max_input); + sum = spatialBlockReduceX(sdata, sum); + + Epilogue epilogue(max_input, sum); + for (index_t d = threadIdx.x; d < dim_size; d += blockDim.x) + output[data_offset + d * dim_stride] = epilogue(input[data_offset + d * dim_stride]); + } else { + accscalar_t max_input = at::numeric_limits::lowest(); + for (index_t d = threadIdx.x; d < dim_size; d += blockDim.x) { + const accscalar_t value = static_cast(input[data_offset + d * dim_stride]); + max_input = Max()(max_input, value); + } + accscalar_t sum = 0; + for (index_t d = threadIdx.x; d < dim_size; d += blockDim.x) + sum += ::exp(static_cast(input[data_offset + d * dim_stride]) + - max_input); + Epilogue epilogue(max_input, sum); + for (index_t d = threadIdx.x; d < dim_size; d += blockDim.x) + output[data_offset + d * dim_stride] = epilogue(input[data_offset + d * dim_stride]); + } + } + } +} + + + +template class Epilogue> +__global__ void cunn_SpatialSoftMaxBackward( + scalar_t *gradInput, const outscalar_t *output, const outscalar_t *gradOutput, + uint32_t outer_size, uint32_t dim_size, uint32_t inner_size) +{ + extern __shared__ unsigned char smem[]; + auto sdata = reinterpret_cast(smem); + const uint32_t outer_stride = inner_size * dim_size; + const uint32_t dim_stride = inner_size; + + for (uint32_t outer_index = blockIdx.x; outer_index < outer_size; outer_index += gridDim.x) { + const uint32_t outer_offset = outer_index * outer_stride; + for (uint32_t inner_index = blockIdx.y * blockDim.y + threadIdx.y; inner_index < inner_size; inner_index += blockDim.y * gridDim.y) { + const uint32_t data_offset = outer_offset + inner_index; + // See the comment in forward kernel + if (blockDim.x > 1) { + accscalar_t sum = 0; + for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) + sum += gradOutput[data_offset + d * dim_stride]; + sum = spatialBlockReduceX(sdata, sum); + + Epilogue epilogue(sum); + for (uint32_t d = threadIdx.x; d < dim_size; d += blockDim.x) { + gradInput[data_offset + d * dim_stride] = + epilogue(gradOutput[data_offset + d * dim_stride], + output[data_offset + d * dim_stride]); + } + } else { + accscalar_t sum = 0; + for (uint32_t d = 0; d < dim_size; d++) + sum += gradOutput[data_offset + d * dim_stride]; + + Epilogue epilogue(sum); + for (uint32_t d = 0; d < dim_size; d++) { + gradInput[data_offset + d * dim_stride] = + epilogue(gradOutput[data_offset + d * dim_stride], + output[data_offset + d * dim_stride]); + } + } + } + } +} + + +//////////////////////////////////////////////////////////////////////////////// +// Regular kernel (fast when dim_size is large; requires inner_size == 1) +//////////////////////////////////////////////////////////////////////////////// + + +template +struct MaxFloat +{ + __device__ __forceinline__ AccumT operator()(AccumT max, T v) const { + return ::max(max, (AccumT)v); + } +}; + +template +struct AddFloat +{ + __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const { + return sum + v; + } +}; + +template +struct SumExpFloat +{ + 
__device__ __forceinline__ SumExpFloat(AccumT v) + : max_k(v) {} + + __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const { + return sum + ::exp(v - max_k); + } + + const AccumT max_k; +}; + +template class Reduction, typename AccumT> +__device__ __forceinline__ AccumT +blockReduce(AccumT* smem, AccumT val, + const Reduction& r, + AccumT defaultVal) +{ + // To avoid RaW races from chaining blockReduce calls together, we need a sync here + __syncthreads(); + + smem[threadIdx.x] = val; + + __syncthreads(); + + AccumT warpVal = defaultVal; + + // First warp will perform per-warp reductions for the remaining warps + uint32_t mask = (((uint64_t)1) << (blockDim.x / C10_WARP_SIZE)) - 1; + if (threadIdx.x < C10_WARP_SIZE) { + int lane = threadIdx.x % C10_WARP_SIZE; + if (lane < blockDim.x / C10_WARP_SIZE) { +#pragma unroll + for (int i = 0; i < C10_WARP_SIZE; ++i) { + warpVal = r(warpVal, smem[lane * C10_WARP_SIZE + i]); + } + smem[lane] = warpVal; + } + } + + __syncthreads(); + + // First thread will perform a reduction of the above per-warp reductions + AccumT blockVal = defaultVal; + + if (threadIdx.x == 0) { + for (int i = 0; i < blockDim.x / C10_WARP_SIZE; ++i) { + blockVal = r(blockVal, smem[i]); + } + smem[0] = blockVal; + } + + // Sync and broadcast + __syncthreads(); + return smem[0]; +} + +// Performs a thread block reduction with a given functor but uses +// warp shuffles as the first step in the reduction +template class Reduction, typename T> +__device__ __forceinline__ +T blockReduceWarp(T* smem_cache, T value, const Reduction& op, T defaultVal) +{ + T result = zoom_utils::BlockReduce>(value, op, defaultVal, smem_cache); + if (threadIdx.x == 0) { + smem_cache[0] = result; + } + __syncthreads(); + return smem_cache[0]; +} + +template class Reduction, int ILP, typename T, typename AccumT, typename index_t=int> +__device__ __forceinline__ AccumT +ilpReduce(index_t shift, + const T* data, + index_t size, + const Reduction& r, + AccumT defaultVal) +{ + using LoadT = at::native::memory::aligned_vector; + AccumT threadVal = defaultVal; + index_t offset = threadIdx.x; + + // shift and do 1 + if(shift > 0){ + data -= shift; + size += shift; + if(threadIdx.x >= shift){ + threadVal = r(threadVal, data[offset]); + } + size -= blockDim.x; + data += blockDim.x; + } + index_t last = size % (ILP * blockDim.x); + + T v[ILP]; + LoadT* value = reinterpret_cast(&v); + + for (; offset * ILP < (size - last); offset += blockDim.x) { + *value = reinterpret_cast(data)[offset]; + + #pragma unroll + for (int j = 0; j < ILP; ++j) { + threadVal = r(threadVal, v[j]); + } + } + + offset = size - last + threadIdx.x; + // Epilogue + for (; offset < size; offset += blockDim.x) + threadVal = r(threadVal, data[offset]); + + return threadVal; +} + +/** + * This will apply the Epilogue with vectorized reads & writes when input & output have the same shift + */ +template class Epilogue> +__device__ __forceinline__ void +WriteFpropResultsVectorized( + int size, + const int shift, + const scalar_t *input, + outscalar_t *output, + Epilogue epilogue) { + using LoadT = at::native::memory::aligned_vector; + using StoreT = at::native::memory::aligned_vector; + + int offset = threadIdx.x; + + // if unaligned, do one value / thread and move on, guaranteeing aligned reads/writes later + if (shift > 0) { + input -= shift; + output -= shift; + size += shift; + + if (threadIdx.x >= shift) { + output[offset] = epilogue(input[offset]); + } + size -= blockDim.x; + input += blockDim.x; + output += blockDim.x; + } + + const 
int last = size % (ILP * blockDim.x); + + scalar_t in_v[ILP]; + LoadT* in_value = reinterpret_cast(&in_v); + + outscalar_t out_v[ILP]; + const StoreT* out_value = reinterpret_cast(&out_v); + + for (; offset * ILP < (size - last); offset += blockDim.x) { + *in_value = reinterpret_cast(input)[offset]; + + #pragma unroll + for (int j = 0; j < ILP; ++j) { + out_v[j] = epilogue(in_v[j]); + } + + reinterpret_cast(output)[offset] = *out_value; + } + + offset = size - last + threadIdx.x; + // handle the tail + for (; offset < size; offset += blockDim.x) { + output[offset] = epilogue(input[offset]); + } +} + +template class Epilogue, typename index_t = int32_t> +__device__ __forceinline__ void +WriteBpropResultsVectorized( + index_t size, + const index_t shift, + scalar_t *gradInput, + const outscalar_t *output, + const outscalar_t *gradOutput, + Epilogue epilogue) { + using gradInputT = at::native::memory::aligned_vector; + using outputT = at::native::memory::aligned_vector; + + index_t offset = threadIdx.x; + + // if unaligned, do one value / thread and move on, guaranteeing aligned reads/writes later + if (shift > 0) { + gradInput -= shift; + output -= shift; + gradOutput -= shift; + size += shift; + + if (threadIdx.x >= shift) { + gradInput[offset] = epilogue(gradOutput[offset], output[offset]); + } + size -= blockDim.x; + gradInput += blockDim.x; + output += blockDim.x; + gradOutput += blockDim.x; + } + + const index_t last = size % (ILP * blockDim.x); + + scalar_t dX[ILP]; + gradInputT *dX_v = reinterpret_cast(&dX); + + outscalar_t Y[ILP]; + outputT *Y_v = reinterpret_cast(&Y); + + outscalar_t dY[ILP]; + outputT *dY_v = reinterpret_cast(&dY); + + for (; offset * ILP < (size - last); offset += blockDim.x) { + *Y_v = reinterpret_cast(output)[offset]; + *dY_v = reinterpret_cast(gradOutput)[offset]; + + #pragma unroll + for (int j = 0; j < ILP; ++j) { + dX[j] = epilogue(dY[j], Y[j]); + } + + reinterpret_cast(gradInput)[offset] = *dX_v; + } + + offset = size - last + threadIdx.x; + for (; offset < size; offset += blockDim.x) { + gradInput[offset] = epilogue(gradOutput[offset], output[offset]); + } +} + +/** + * This will apply the Epilogue with non-vectorized reads & writes for the general case + */ +template class Epilogue> +__device__ __forceinline__ void +WriteFpropResults( + int classes, + const scalar_t *input, + outscalar_t *output, + Epilogue epilogue) { + for (int offset = threadIdx.x; offset < classes; offset += blockDim.x) { + output[offset] = epilogue(input[offset]); + } +} + +template class Epilogue, typename index_t> +__device__ __forceinline__ void +WriteBpropResults( + int classes, + scalar_t *gradInput, + const outscalar_t *output, + const outscalar_t *gradOutput, + Epilogue epilogue) { + + index_t offset = threadIdx.x; + + index_t last = classes % (ILP * blockDim.x); + + for (; offset < classes - last; offset += blockDim.x * ILP) { + outscalar_t tmpOutput[ILP]; + outscalar_t tmpGradOutput[ILP]; + + #pragma unroll + for (int j = 0; j < ILP; ++j) { + tmpOutput[j] = output[offset + j * blockDim.x]; + tmpGradOutput[j] = gradOutput[offset + j * blockDim.x]; + } + + #pragma unroll + for (int j = 0; j < ILP; ++j) { + gradInput[offset + j * blockDim.x] = epilogue(tmpGradOutput[j], tmpOutput[j]); + } + } + + // Remainder - no ILP + for (; offset < classes; offset += blockDim.x) { + gradInput[offset] = epilogue(gradOutput[offset], output[offset]); + } +} + +template class Epilogue> +__global__ void +cunn_SoftMaxForward(outscalar_t *output, const scalar_t *input, int classes) +{ + extern 
__shared__ unsigned char smem[]; + auto sdata = reinterpret_cast(smem); + + // forward pointers to batch[blockIdx.x] + // each block handles a sample in the mini-batch + input += static_cast(blockIdx.x) * classes; + output += static_cast(blockIdx.x) * classes; + + const int shift = ((uint64_t)input) % ALIGN_BYTES / sizeof(scalar_t); + const int output_shift = ((uint64_t)output) % ALIGN_BYTES / sizeof(outscalar_t); + + // find the max + accscalar_t threadMax = ilpReduce( + shift, input, classes, MaxFloat(), -at::numeric_limits::max()); + accscalar_t max_k = blockReduceWarp(sdata, threadMax, + Max(), -at::numeric_limits::max()); + + // reduce all values + accscalar_t threadExp = ilpReduce( + shift, input, classes, SumExpFloat(max_k), static_cast(0)); + accscalar_t sumAll = blockReduceWarp(sdata, threadExp, + Add(), static_cast(0)); + + Epilogue epilogue(max_k, sumAll); + + if (shift == output_shift) { + WriteFpropResultsVectorized(classes, shift, input, output, epilogue); + } else { + WriteFpropResults(classes, input, output, epilogue); + } +} + +template class Epilogue, typename index_t = int32_t> +__global__ void +cunn_SoftMaxForwardSmem(outscalar_t *output, const scalar_t *input, index_t classes) +{ + // Each thread block processes a sample in the batch + input += static_cast(blockIdx.x) * classes; + output += static_cast(blockIdx.x) * classes; + + accscalar_t threadMax = -at::numeric_limits::max(); + accscalar_t threadExp = static_cast(0); + + // The first smem segment is used to cache input values and the last + // segment is used for thread block reductions + extern __shared__ unsigned char smem[]; + auto smem_input_cache = reinterpret_cast(smem); + auto smem_reduction_cache = reinterpret_cast(smem + + classes * sizeof(scalar_t)); + + using LoadT = at::native::memory::aligned_vector; + const LoadT* const input_vec_ptr = reinterpret_cast(input); + LoadT* const smem_input_cache_vec_ptr = reinterpret_cast(smem_input_cache); + + // Download inputs to shared memory while doing the first step + // in max calculation + MaxFloat maxFunc; + for (index_t offset = threadIdx.x; offset * ILP < classes; offset += blockDim.x) { + LoadT crnt_vec = input_vec_ptr[offset]; + smem_input_cache_vec_ptr[offset] = crnt_vec; + + #pragma unroll + for (int i = 0; i < ILP; ++i) { + threadMax = maxFunc(threadMax, crnt_vec.val[i]); + } + } + + accscalar_t max_k = blockReduceWarp(smem_reduction_cache, threadMax, + Max(), -at::numeric_limits::max()); + + // Reload input from shared memory to compute the sum. The previous + // reduce has performed a __syncthreads() so the smem contents are populated. 
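+  // Together the two passes implement the usual numerically stable softmax:
+  //   m = max_j x_j                  (first pass, blockReduceWarp over Max)
+  //   Z = sum_j exp(x_j - m)         (second pass below, blockReduceWarp over Add)
+  //   softmax_i     = exp(x_i - m) / Z
+  //   log_softmax_i = (x_i - m) - log(Z)
+  // Subtracting m before exponentiating keeps exp() in range for large-magnitude inputs.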
+ SumExpFloat sumExpFunc(max_k); + for (index_t offset = threadIdx.x; offset * ILP < classes; offset += blockDim.x) { + LoadT crnt_vec = smem_input_cache_vec_ptr[offset]; + + #pragma unroll + for (int i = 0; i < ILP; ++i) { + threadExp = sumExpFunc(threadExp, crnt_vec.val[i]); + } + } + + accscalar_t sumAll = blockReduceWarp(smem_reduction_cache, threadExp, + Add(), static_cast(0)); + + Epilogue epilogue(max_k, sumAll); + + // Use vectorized stores to save the output + using StoreT = at::native::memory::aligned_vector; + StoreT* output_vec_ptr = reinterpret_cast(output); + for (index_t offset = threadIdx.x; offset * ILP < classes; offset += blockDim.x) { + LoadT crnt_vec = smem_input_cache_vec_ptr[offset]; + StoreT out_vec; + + #pragma unroll + for (int i = 0; i < ILP; ++i) { + out_vec.val[i] = epilogue(crnt_vec.val[i]); + } + + output_vec_ptr[offset] = out_vec; + } +} + +C10_DEVICE bool inline is_32bit_representable(const int64_t value) { + return value < static_cast(std::numeric_limits::max()); +} + +template class Epilogue> +__global__ void +cunn_SoftMaxBackward(scalar_t *gradInput, const outscalar_t *output, const outscalar_t *gradOutput, int64_t classes) +{ + using LoadT = at::native::memory::aligned_vector; + using StoreT = at::native::memory::aligned_vector; + + extern __shared__ unsigned char smem[]; + auto sdata = reinterpret_cast(smem); + gradInput += static_cast(blockIdx.x) * classes; + output += static_cast(blockIdx.x) * classes; + gradOutput += static_cast(blockIdx.x) * classes; + + const int64_t shift = ((uint64_t)gradInput) % ALIGN_BYTES / sizeof(scalar_t); + const int64_t output_shift = ((uint64_t)output) % ALIGN_BYTES / sizeof(outscalar_t); + const int64_t grad_output_shift = ((uint64_t)gradOutput) % ALIGN_BYTES / sizeof(outscalar_t); + + const bool can_use_32bit_indexing = is_32bit_representable(shift) && is_32bit_representable(output_shift) && is_32bit_representable(grad_output_shift) && is_32bit_representable(classes); + accscalar_t threadSum; + if (can_use_32bit_indexing) { + threadSum = ilpReduce( + static_cast(grad_output_shift), gradOutput, classes, AddFloat(), accscalar_t(0)); + } else { + threadSum = ilpReduce( + grad_output_shift, gradOutput, classes, AddFloat(), accscalar_t(0)); + } + accscalar_t sum_k = blockReduce( + sdata, threadSum, Add(), accscalar_t(0)); + + Epilogue epilogue(sum_k); + + if (shift == output_shift && shift == grad_output_shift) { + if (can_use_32bit_indexing) { + WriteBpropResultsVectorized(classes, static_cast(shift), gradInput, output, gradOutput, epilogue); + } else { + WriteBpropResultsVectorized(classes, shift, gradInput, output, gradOutput, epilogue); + } + } else { + if (can_use_32bit_indexing) { + WriteBpropResults(classes, gradInput, output, gradOutput, epilogue); + } else { + WriteBpropResults(classes, gradInput, output, gradOutput, epilogue); + } + } +} + +template class Epilogue, bool is_log_softmax> +Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_to_float, const Tensor& output){ + if (half_to_float) { + TORCH_CHECK(input_.scalar_type() == ScalarType::Half, "conversion is supported for Half type only"); + } + auto input = input_.contiguous(); + static_assert(std::is_same, float>::value, "accscalar_t for half should be float"); + if (input.dim() == 0) input = input.view(1); + int64_t dim = maybe_wrap_dim(dim_, input.dim()); + TORCH_CHECK(dim >=0 && dim < input.dim(), "dim must be non-negative and less than input dimensions"); + int64_t outer_size = 1; + int64_t dim_size = input.size(dim); + + if 
(input.numel() > 0) { + int64_t inner_size = 1; + hipStream_t stream = c10::zoom::getCurrentZoomStream(); + for (int64_t i = 0; i < dim; ++i) + outer_size *= input.size(i); + for (int64_t i = dim + 1; i < input.dim(); ++i) + inner_size *= input.size(i); + // This kernel spawns a block per each element in the batch. + // XXX: it assumes that inner_size == 1 + + if (inner_size == 1) { + dim3 grid(outer_size); + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, input.scalar_type(), "host_softmax", [&] { + using accscalar_t = acc_type; + if (!half_to_float) { + auto output_ptr = output.mutable_data_ptr(); + auto input_ptr = input.const_data_ptr(); + if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { + int64_t remaining = outer_size; + int64_t chunk_size = (1L << 30L) / dim_size; + while(remaining > 0) { + dispatch_softmax_forward( + output_ptr, input_ptr, dim_size, dim_size, std::min(remaining, chunk_size), nullptr/* not masked */); + input_ptr += chunk_size * dim_size; + output_ptr += chunk_size * dim_size; + remaining -= chunk_size; + } + } else { + constexpr int ILP = sizeof(float4) / sizeof(scalar_t); + dim3 block = SoftMaxForward_getBlockSize(dim_size); + size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t); + auto max_elements_per_smem = (at::zoom::getCurrentDeviceProperties()->sharedMemPerBlock - + smem_reduction_sz) / sizeof(scalar_t); + + bool can_use_smem = dim_size < max_elements_per_smem; + can_use_smem &= !(reinterpret_cast(input_ptr) % ALIGN_BYTES); + can_use_smem &= (!(reinterpret_cast(output_ptr) % ALIGN_BYTES)); + can_use_smem &= !(dim_size % ILP); + + if (can_use_smem) { + size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; + hipLaunchKernelGGL(( cunn_SoftMaxForwardSmem) + , dim3(grid), dim3(block), smem_sz, stream, output_ptr, input_ptr, dim_size); + } else { + hipLaunchKernelGGL(( cunn_SoftMaxForward) + , dim3(grid), dim3(block), smem_reduction_sz, stream, output_ptr, input_ptr, dim_size); + } + + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } + } else { + auto output_ptr = output.mutable_data_ptr(); + auto input_ptr = input.const_data_ptr(); + if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { + int64_t remaining = outer_size; + int64_t chunk_size = (1<<30) / dim_size; + while(remaining > 0) { + dispatch_softmax_forward( + output_ptr, input_ptr, dim_size, dim_size, std::min(remaining, chunk_size), nullptr/* not masked */); + input_ptr += chunk_size * dim_size; + output_ptr += chunk_size * dim_size; + remaining -= chunk_size; + } + } else { + constexpr int ILP = sizeof(float4) / sizeof(scalar_t); + dim3 block = SoftMaxForward_getBlockSize(dim_size); + size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t); + auto max_elements_per_smem = (at::zoom::getCurrentDeviceProperties()->sharedMemPerBlock - + smem_reduction_sz) / sizeof(scalar_t); + + bool can_use_smem = dim_size < max_elements_per_smem; + can_use_smem &= !(reinterpret_cast(input_ptr) % ALIGN_BYTES); + can_use_smem &= (!(reinterpret_cast(output_ptr) % ALIGN_BYTES)); + can_use_smem &= !(dim_size % ILP); + + if (can_use_smem) { + size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; + hipLaunchKernelGGL(( cunn_SoftMaxForwardSmem) + , dim3(grid), dim3(block), smem_sz, stream, output_ptr, input_ptr, dim_size); + } else { + hipLaunchKernelGGL(( cunn_SoftMaxForward) + , dim3(grid), dim3(block), smem_reduction_sz, stream, output_ptr, input_ptr, dim_size); + } + + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } + } + }); + // This 
kernel runs in a 2D grid, where each application along y dimension has a fixed + // outer_size, and runs in parallel over inner_size. Dimension x is parallel over outer_size. + // Reductions over dim are done in a single-threaded manner. + } else { + uint32_t smem_size; + dim3 grid, block; + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, input.scalar_type(), "host_softmax", [&] { + using accscalar_t = acc_type; + AT_DISPATCH_INDEX_TYPES( + at::native::canUse32BitIndexMath(input, INT_MAX) ? ScalarType::Int : ScalarType::Long, + "host_softmax_launcher", [&] { + if (!half_to_float) { + SpatialSoftMax_getLaunchSizes( + &cunn_SpatialSoftMaxForward, + outer_size, dim_size, inner_size, + grid, block, smem_size); + hipLaunchKernelGGL(( cunn_SpatialSoftMaxForward) + , dim3(grid), dim3(block), smem_size, stream, + output.mutable_data_ptr(), input.const_data_ptr(), outer_size, dim_size, inner_size); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } else { + SpatialSoftMax_getLaunchSizes( + &cunn_SpatialSoftMaxForward, + outer_size, dim_size, inner_size, + grid, block, smem_size); + hipLaunchKernelGGL(( cunn_SpatialSoftMaxForward) + , dim3(grid), dim3(block), smem_size, stream, + output.mutable_data_ptr(), input.const_data_ptr(), outer_size, dim_size, inner_size); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } + }); + }); + } + } + return output; +} + +template class Epilogue, bool is_log_softmax> +void host_softmax_backward(const Tensor &grad_, const Tensor &output_, int64_t dim_, bool half_to_float, const Tensor &gI){ + int64_t dim = maybe_wrap_dim(dim_, grad_.dim()); + if (grad_.numel() == 0) { + return; + } + auto grad = grad_.contiguous(); + static_assert(std::is_same, float>::value, "accscalar_t for half should be float"); + if (grad.dim() == 0) grad = grad.view(1); + TORCH_CHECK(dim >=0 && dim < grad.dim(), "dim must be non-negative and less than input dimensions"); + auto output = output_.contiguous(); + if (output.dim() == 0) output = output.view(1); + int64_t outer_size = 1; + int64_t dim_size = output.size(dim); + int64_t inner_size = 1; + for (int64_t i = 0; i < dim; ++i) + outer_size *= output.size(i); + for (int64_t i = dim + 1; i < output.dim(); ++i) + inner_size *= output.size(i); +// See descriptions of kernels above. 
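+// The launch paths below mirror the forward: the persistent dispatch_softmax_backward
+// kernel for small rows (dim_size <= 1024 and at most 4 KB per row), cunn_SoftMaxBackward
+// for larger rows with inner_size == 1, and cunn_SpatialSoftMaxBackward otherwise. The
+// epilogues applied are the ones defined at the top of this file:
+//   softmax:     dX_i = g_i - Y_i * sum_j g_j      with g = grad * output
+//                (equivalently Y_i * (dY_i - sum_j Y_j * dY_j))
+//   log_softmax: dX_i = dY_i - exp(Y_i) * sum_j dY_j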
+ hipStream_t stream = c10::zoom::getCurrentZoomStream(); + if (inner_size == 1) { + dim3 grid(outer_size); + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, gI.scalar_type(), "host_softmax_backward", [&] { + using accscalar_t = acc_type; + if (!half_to_float) { + if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { + auto gI_ptr = gI.mutable_data_ptr(); + auto grad_ptr = grad.const_data_ptr(); + auto output_ptr = output.const_data_ptr(); + int64_t remaining = outer_size; + int64_t chunk_size = (1<<30) / dim_size; + while(remaining > 0) { + dispatch_softmax_backward( + gI_ptr, grad_ptr, output_ptr, dim_size, dim_size, std::min(remaining, chunk_size)); + gI_ptr += chunk_size * dim_size; + grad_ptr += chunk_size * dim_size; + output_ptr += chunk_size * dim_size; + remaining -= chunk_size; + } + } else { + constexpr int ILP = sizeof(float4) / sizeof(scalar_t); + dim3 block = SoftMax_getBlockSize(ILP, dim_size); + hipLaunchKernelGGL(( cunn_SoftMaxBackward) + , dim3(grid), dim3(block), block.x * sizeof(accscalar_t), stream, + gI.mutable_data_ptr(), output.const_data_ptr(), grad.const_data_ptr(), dim_size + ); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } + } else { + if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { + auto gI_ptr = gI.mutable_data_ptr(); + auto grad_ptr = grad.const_data_ptr(); + auto output_ptr = output.const_data_ptr(); + int64_t remaining = outer_size; + int64_t chunk_size = (1<<30) / dim_size; + while(remaining > 0) { + dispatch_softmax_backward( + gI_ptr, grad_ptr, output_ptr, dim_size, dim_size, std::min(remaining, chunk_size)); + gI_ptr += chunk_size * dim_size; + grad_ptr += chunk_size * dim_size; + output_ptr += chunk_size * dim_size; + remaining -= chunk_size; + } + } else { + constexpr int ILP = sizeof(float4) / sizeof(accscalar_t); + dim3 block = SoftMax_getBlockSize(ILP, dim_size); + hipLaunchKernelGGL(( cunn_SoftMaxBackward) + , dim3(grid), dim3(block), block.x * sizeof(accscalar_t), stream, + gI.mutable_data_ptr(), output.const_data_ptr(), grad.const_data_ptr(), dim_size + ); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } + } + }); + } else { + uint32_t smem_size; + dim3 grid, block; + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, gI.scalar_type(), "host_softmax_backward", [&] { + using accscalar_t = acc_type; + if (!half_to_float) { + SpatialSoftMax_getLaunchSizes( + &cunn_SpatialSoftMaxBackward, + outer_size, dim_size, inner_size, + grid, block, smem_size); + + hipLaunchKernelGGL(( cunn_SpatialSoftMaxBackward) + , dim3(grid), dim3(block), smem_size, stream, + gI.mutable_data_ptr(), output.const_data_ptr(), grad.const_data_ptr(), + outer_size, dim_size, inner_size + ); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } else { + SpatialSoftMax_getLaunchSizes( + &cunn_SpatialSoftMaxBackward, + outer_size, dim_size, inner_size, + grid, block, smem_size); + + hipLaunchKernelGGL(( cunn_SpatialSoftMaxBackward) + , dim3(grid), dim3(block), smem_size, stream, + gI.mutable_data_ptr(), output.const_data_ptr(), grad.const_data_ptr(), + outer_size, dim_size, inner_size + ); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } + }); + } +} +} + +TORCH_IMPL_FUNC(log_softmax_zoom_out) ( + const Tensor &input, + const int64_t dim, + const bool half_to_float, + const Tensor &output) { + host_softmax(input, dim, half_to_float, output); +} + +TORCH_IMPL_FUNC(log_softmax_backward_zoom_out) ( + const Tensor& grad, + const Tensor& output, + int64_t dim, + ScalarType input_dtype, + const Tensor& grad_input) { + bool half_to_float = 
grad.scalar_type() != input_dtype; + if (half_to_float) { + TORCH_CHECK( + (grad.scalar_type() == ScalarType::Float && + input_dtype == ScalarType::Half), + "expected input and grad types to match, or input to be at::Half and grad to be at::Float"); + } + host_softmax_backward(grad, output, dim, half_to_float, grad_input); +} + +TORCH_IMPL_FUNC(softmax_zoom_out) ( + const Tensor &input, + const int64_t dim, + const bool half_to_float, + const Tensor &output) { + host_softmax(input, dim, half_to_float, output); +} + +TORCH_IMPL_FUNC(softmax_backward_zoom_out) +(const Tensor& grad, + const Tensor& output, + int64_t dim, + ScalarType input_dtype, + const Tensor& grad_input) { + bool half_to_float = grad.scalar_type() != input_dtype; + if (half_to_float) { + TORCH_CHECK( + (grad.scalar_type() == ScalarType::Float && + input_dtype == ScalarType::Half), + "expected input and grad types to match, or input to be at::Half and grad to be at::Float"); + } + Tensor tmp = grad * output; + host_softmax_backward(tmp, output, dim, half_to_float, grad_input); +} + +Tensor masked_softmax_zoom(const Tensor& input_, const Tensor& mask_, const std::optional dim_, const c10::optional mask_type_) { + Tensor output = at::empty_like(input_, input_.options()); + TORCH_CHECK(mask_.scalar_type() == ScalarType::Bool, "Mask should be a boolean tensor"); + + TORCH_CHECK(mask_type_.has_value(), "Mask Type should be defined"); + int64_t mask_type = mask_type_.value(); + TORCH_CHECK((mask_type == 0) || (mask_type == 1) || (mask_type == 2), "Mask Type should be 0 (src_mask), 1 (src_key_padding_mask), or 2 (default_mask)"); + + // If input is [B, H, T, T] and mask is [B, T] + // we have special fast kernel + // mask_type == 1 => mask_ is a src_key_padding_mask + bool is_BxT_mask = (mask_type == 1) && (input_.dim() == 4 && mask_.dim() == 2 && input_.size(0) == mask_.size(0) && input_.size(2) == mask_.size(1) && input_.size(3) == mask_.size(1)); + + // If input is [B, H, T, T] and mask is [T, T] + // expand mask to [B, H, T, T] and treat it like regular mask + // TODO We should have special fast kernel for TxT mask as well + // mask_type == 0 => mask_ is a src_mask + bool is_TxT_mask = (mask_type == 0) && input_.dim() == 4 && mask_.dim() == 2 && input_.size(3) == mask_.size(1) && input_.size(2) == mask_.size(0) && mask_.size(0) == mask_.size(1); + // If mask_type == 2, then mask_.sizes() must equal input_.sizes() + TORCH_CHECK(mask_.sizes() == input_.sizes() || is_BxT_mask || is_TxT_mask, "Mask shape should match input. mask: ", mask_.sizes(), " input: ", input_.sizes()); + + auto input = input_.dim() == 0 ? input_.view(1) : input_; + auto mask = mask_.dim() == 0 ? mask_.view(1) : mask_; + if (is_TxT_mask) { + mask = mask.expand(input.sizes()); + } + int64_t dim = dim_.has_value() ? 
dim_.value() : input.dim() - 1; + + int softmax_elements = input.size(dim); + // Persistent softmax is only supported when all of the conditions are held: + // 1) softmax_elements <= 1024 + // 2) softmax_elements * input.element_size() <= 4096 + // 3) mask.is_contiguous() + // 4) dim == input.dim() - 1 + // Otherwise, we fallback to vanilla softmax (where we do not support transformer_mask since converting the mask is expensive) + if (softmax_elements > 1024 || softmax_elements * input.element_size() > 4096 || !mask.is_contiguous() || dim < input.dim()-1) { + if (is_BxT_mask) { + mask = mask.view({mask_.size(0), 1, 1, mask_.size(1)}).expand(input.sizes()); + } + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + input.scalar_type(), + "masked_softmax", + [&] { + output = at::softmax(input.masked_fill(mask, -std::numeric_limits::infinity()), dim); + }); + return output; + } + int batch_count = input.numel() / softmax_elements; + int chunk_size = input.numel() / input.size(0); + if (is_BxT_mask) { + // Only support when num_heads is even in transformer + TORCH_CHECK(input.size(1) % 2 == 0, "Only support when num_heads is even in transformer"); + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + input.scalar_type(), + "masked_softmax", + [&] { + using accscalar_t = acc_type; + dispatch_softmax_forward( + output.mutable_data_ptr(), // dst + input.const_data_ptr(), // src + softmax_elements, + softmax_elements, + batch_count, + mask.const_data_ptr(), + chunk_size, + true // is_transformer_mask + ); + }); + + } else { + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + input.scalar_type(), + "masked_softmax", + [&] { + using accscalar_t = acc_type; + dispatch_softmax_forward( + output.mutable_data_ptr(), // dst + input.const_data_ptr(), // src + softmax_elements, + softmax_elements, + batch_count, + mask.const_data_ptr() + ); + }); + } + return output; +} + +Tensor masked_softmax_backward_zoom( + const Tensor& grad_, + const Tensor& output_, + const Tensor& mask_, + const std::optional dim_) { + Tensor grad_input = at::empty_like(grad_, grad_.options()); + if (grad_.numel() == 0) { + return grad_input; + } + + auto grad = grad_.contiguous(); + auto output = output_.contiguous(); + auto mask = mask_.contiguous(); + int64_t dim = dim_.has_value() ? maybe_wrap_dim(dim_.value(), output.dim()) : output.dim() - 1; + + grad = grad.dim() == 0 ? grad.view(1) : grad; + mask = mask.dim() == 0 ? mask.view(1) : mask; + output = output.dim() == 0 ? 
output.view(1) : output; + + TORCH_CHECK(dim >=0 && dim < grad.dim(), "dim must be non-negative and less than input dimensions"); + TORCH_CHECK(grad.sizes() == mask.sizes(), "Mask shape should match grad shape"); + TORCH_CHECK(mask.scalar_type() == ScalarType::Bool, "Mask should be a boolean tensor"); + + int softmax_elements = output.size(dim); + int64_t batch_count = grad.numel() / softmax_elements; + + if (softmax_elements > 1024 || softmax_elements * grad.element_size() > 4096 || dim < grad.dim()-1) { + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + grad_input.scalar_type(), + "masked_softmax_backward", + [&] { + grad_input = at::_softmax_backward_data( + grad, + output.masked_fill(mask, 0), + dim, + grad.scalar_type() + ); + }); + } else { + grad = grad * output; + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + grad_input.scalar_type(), + "masked_softmax_backward", + [&] { + using accscalar_t = acc_type; + dispatch_softmax_backward( + grad_input.mutable_data_ptr(), // gI_ptr + grad.const_data_ptr(), // grad_ptr + output.const_data_ptr(), // output_ptr + softmax_elements, // softmax_elements + softmax_elements, // softmax_elements_stride + batch_count, // batch_count + mask.const_data_ptr() /* not masked */ + ); + }); + } + return grad_input; +} + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/Sort.cpp b/aten/src/ATen/native/zoom/Sort.cpp new file mode 100644 index 00000000000000..5f34f230c0edf2 --- /dev/null +++ b/aten/src/ATen/native/zoom/Sort.cpp @@ -0,0 +1,128 @@ +// !!! This is a file automatically generated by hipify!!! +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + +#include + +namespace at::native { + +std::vector infer_dense_strides_dim_last(const Tensor & self, int64_t dim); + +void fillSliceWithIndex(const Tensor& t, int dim) { + if (t.numel()) { + auto sizes = DimVector(t.dim(), 1); + sizes[dim] = t.sizes()[dim]; + auto range = at::arange(t.sizes()[dim], t.options()); + auto rangeview = range.view(sizes); + t.copy_(rangeview); + } +} + +// We perform a segmented sort in cub with inputs that have +// more than 1024/2048 elements along the selected dimension. +// Otherwise, we do an inplace bitonic sort (see sortKeyValueInplace). +void sort_zoom_kernel( + const TensorBase& self_base, + const TensorBase& values_base, + const TensorBase& indices_base, + int64_t dim, + bool descending, + bool stable) { + // this algorithm is always stable + + // Macro for converting `TensorBase` -> `Tensor` without + // reference count bumps. +#define TOTENSOR(BASE, VAR) \ + OptionalTensorRef opt_##BASE(BASE); \ + const Tensor& VAR = *opt_##BASE; + + // Converting TensorBase into Tensor. + // We will need Tensor's methods from this point onwards. 
+ TOTENSOR(self_base, self); + TOTENSOR(values_base, values); + TOTENSOR(indices_base, indices); + + TORCH_CHECK(self.sizes()[dim] <= std::numeric_limits::max(), + "The dimension being sorted can not have more than INT_MAX elements."); + + const auto self_dtype = self.dtype(); + // FIXME: remove this check once cub sort supports bool + TORCH_CHECK(self_dtype != ScalarType::Bool, + "Sort currently does not support bool dtype on Zoom."); + TORCH_CHECK(self_dtype != ScalarType::ComplexFloat && self_dtype != ScalarType::ComplexDouble, + "Sort currently does not support complex dtypes on Zoom."); + + // use inplace algorithm for smaller input sizes without stable=True + if (should_use_small_sort(self, dim)) { + // from thc: sorted->values, indices->indices, input->self + fillSliceWithIndex(indices, dim); + + // We sort k/v pairs in-place; copy unsorted input to output + values.copy_(self); + + // Sort using our in-place k/v kernel that supports arbitrary + // layout + sortKeyValueInplace(values, indices, dim, descending, stable); + return; + } + + Tensor self_; + bool newself = false; + if (self.is_non_overlapping_and_dense() && self.stride(dim) == 1) { + self_ = self; + } else { + auto new_strides_unsort = infer_dense_strides_dim_last(self, dim); + self_ = at::empty_strided(self.sizes(), new_strides_unsort, self.options()); + self_.copy_(self); + newself = true; + } + + c10::MaybeOwned values_tmp, indices_tmp; + if (values.strides() == self_.strides() && (newself || get_overlap_status(self, values) == MemOverlapStatus::No)) { + values_tmp = c10::MaybeOwned::borrowed(values); + } else { + values_tmp = c10::MaybeOwned::owned( + at::empty_strided(self_.sizes(), self_.strides(), self_.options())); + } + + if (indices.strides() != self_.strides()) { + indices_tmp = c10::MaybeOwned::owned( + at::empty_strided(self_.sizes(), self_.strides(), self_.options().dtype(kLong))); + } else { + indices_tmp = c10::MaybeOwned::borrowed(indices); + } + + launch_stable_sort_kernel(self_, dim, descending, *values_tmp, *indices_tmp); + + if (!values_tmp->is_same(values)) { + values.copy_(*values_tmp); + } + if (!indices_tmp->is_same(indices)) { + indices.copy_(*indices_tmp); + } +} + +// TODO: we should handle this accordingly when we start using REGISTER_HIP_DISPATCH, +// since REGISTER_PRIVATEUSE1_DISPATCH won't work in this cpp file. +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +REGISTER_PRIVATEUSE1_DISPATCH(sort_stub, &sort_zoom_kernel); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/Sort.cu b/aten/src/ATen/native/zoom/Sort.cu new file mode 100644 index 00000000000000..466c705ced9b5c --- /dev/null +++ b/aten/src/ATen/native/zoom/Sort.cu @@ -0,0 +1,384 @@ +// !!! This is a file automatically generated by hipify!!! 
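Before the Sort.cu kernels below, a hedged sketch (assuming a libtorch build) of the borrow-or-copy pattern that sort_zoom_kernel above applies to values_tmp and indices_tmp: borrow the caller's output when its strides already match the working layout, otherwise sort into an owned scratch tensor and copy back at the end. The function name sort_into is illustrative.

#include <ATen/ATen.h>
#include <c10/util/MaybeOwned.h>

void sort_into(const at::Tensor& src_dense, at::Tensor& out) {
  c10::MaybeOwned<at::Tensor> work;
  if (out.strides() == src_dense.strides()) {
    // Layout already matches: write directly into the caller's tensor.
    work = c10::MaybeOwned<at::Tensor>::borrowed(out);
  } else {
    // Otherwise allocate a scratch tensor with the working layout.
    work = c10::MaybeOwned<at::Tensor>::owned(
        at::empty_strided(src_dense.sizes(), src_dense.strides(), out.options()));
  }

  // ... run the kernel that writes into *work ...

  if (!work->is_same(out)) {  // only pay the copy when a scratch tensor was used
    out.copy_(*work);
  }
}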
+#include +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace at::native { + +template +static int minimum_grid_for_occupancy(T kernel, int max_block_size) { + int minGridSize = 0; + int blockSize; + C10_ZOOM_CHECK(hipOccupancyMaxPotentialBlockSize( + &minGridSize, + &blockSize, + kernel, + /*dynamicSMemSize=*/0, + max_block_size)); + return minGridSize; +} + +template +constexpr bool has_nan() { + if constexpr (std::numeric_limits::is_specialized) { + return std::numeric_limits::has_quiet_NaN; + } else if constexpr ( + c10::is_complex::value || + std::is_same_v || + std::is_same_v) { + return true; + } +} + +// For very small unstable sorts (n <= 32), use bitonicSortKVInPlace +// which can sort multiple arrays within the same block of threads, +// improving occupancy. +struct SmallBitonicSort { + template + void sort( + at::zoom::detail::TensorInfo keyInfo, + IndexType keySlices, + IndexType keySliceSize, + IndexType keySliceStride, + at::zoom::detail::TensorInfo valueInfo, + IndexType valueSliceStride, + bool descending) { + constexpr int sort_size = 32; + constexpr int max_block_y = 16; + constexpr int items_per_thread = 2; + static_assert(sort_size % items_per_thread == 0, ""); + constexpr int block_x = sort_size / items_per_thread; + + TORCH_INTERNAL_ASSERT(keySliceSize <= sort_size); + + // Scale batch size down if the grid would be too small + const auto min_grid = minimum_grid_for_occupancy( + bitonicSortKVInPlace< + A, -1, block_x, max_block_y, + K, V, LTOp, IndexType>, + block_x * max_block_y); + const auto max_batch = ::max(IndexType{1}, keySlices / min_grid); + const int block_y = ::min(IndexType(max_block_y), max_batch); + dim3 block(block_x, block_y); + + dim3 grid; + const int grid_count = (keySlices + block_y - 1) / block_y; + TORCH_INTERNAL_ASSERT(getGridFromTiles(grid_count, grid), + "Too many slices to sort"); + const auto stream = c10::zoom::getCurrentZoomStream(); + + if (descending) { + hipLaunchKernelGGL(( bitonicSortKVInPlace) + , dim3(grid), dim3(block), 0, stream, + keyInfo, + keySlices, + keySliceSize, + keySliceStride, + valueInfo, + valueSliceStride, + GTOp()); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } else { + hipLaunchKernelGGL(( bitonicSortKVInPlace) + , dim3(grid), dim3(block), 0, stream, + keyInfo, + keySlices, + keySliceSize, + keySliceStride, + valueInfo, + valueSliceStride, + LTOp()); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } + } +}; + +#if HAS_WARP_MERGE_SORT() + +// For small sorts (n <= 128) we use warpMergeSortKVInPlace which +// sorts one slice per warp and potentially multiple slices in the +// same block for improved occupancy with large batch sizes. 
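The small-sort paths above and below both size their grids with the same occupancy query. A hedged standalone HIP sketch of that calculation, using a hypothetical toy_kernel in place of the real sort kernel:

#include <hip/hip_runtime.h>
#include <algorithm>
#include <cstdint>
#include <cstdio>

__global__ void toy_kernel() {}

int main() {
  int min_grid = 0, block_size = 0;
  // Ask the runtime for the smallest grid that still reaches full occupancy.
  if (hipOccupancyMaxPotentialBlockSize(&min_grid, &block_size, toy_kernel,
                                        /*dynSharedMemPerBlk=*/0,
                                        /*blockSizeLimit=*/512) != hipSuccess) {
    return 1;
  }

  const int64_t key_slices = 10000;  // independent slices to sort
  const int max_block_y = 16;        // upper bound on slices packed per block

  // Scale slices-per-block (block.y) down if it would shrink the grid
  // below min_grid, exactly as SmallBitonicSort / WarpMergeSort do above.
  const int64_t max_batch =
      std::max<int64_t>(1, key_slices / std::max(min_grid, 1));
  const int block_y = static_cast<int>(std::min<int64_t>(max_block_y, max_batch));
  const int64_t grid_count = (key_slices + block_y - 1) / block_y;

  std::printf("min_grid=%d block_y=%d grid=%lld\n", min_grid, block_y,
              static_cast<long long>(grid_count));
}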
+template +struct WarpMergeSort { + + template + void sort( + at::zoom::detail::TensorInfo keyInfo, + IndexType keySlices, + IndexType keySliceSize, + IndexType keySliceStride, + at::zoom::detail::TensorInfo valueInfo, + IndexType valueSliceStride, + bool descending) { + constexpr int max_block_y = 16; + const int block_x = at::zoom::warp_size(); + + TORCH_INTERNAL_ASSERT(keySliceSize <= sort_size); + + // Scale batch size down if the grid would be too small + const auto min_grid = minimum_grid_for_occupancy( + warpMergeSortKVInPlace< + A, -1, sort_size, max_block_y, + K, V, LTOp, IndexType>, + block_x * max_block_y); + const auto max_batch = ::max(IndexType{1}, keySlices / min_grid); + const int block_y = ::min(IndexType(max_block_y), max_batch); + dim3 block(block_x, block_y); + + dim3 grid; + const int grid_count = (keySlices + block_y - 1) / block_y; + TORCH_INTERNAL_ASSERT(getGridFromTiles(grid_count, grid), + "Too many slices to sort"); + const auto stream = c10::zoom::getCurrentZoomStream(); + + if (descending) { + const K invalid_key = at::numeric_limits::lower_bound(); + hipLaunchKernelGGL(( warpMergeSortKVInPlace) + , dim3(grid), dim3(block), 0, stream, + keyInfo, + keySlices, + keySliceSize, + keySliceStride, + valueInfo, + valueSliceStride, + GTOp(), + invalid_key); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } else { + const K invalid_key = []{ + // NAN is sorted after inf + if constexpr(has_nan()) { + return K(NAN); + } + return at::numeric_limits::upper_bound(); + }(); + hipLaunchKernelGGL(( warpMergeSortKVInPlace) + , dim3(grid), dim3(block), 0, stream, + keyInfo, + keySlices, + keySliceSize, + keySliceStride, + valueInfo, + valueSliceStride, + LTOp(), + invalid_key); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } + } +}; + +#endif // !HAS_WARP_MERGE_SORT() + +// For medium sizes (128 < n <= 4096) use radixSortKVInplace. 
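Before the medium-size radix path below, a hedged sketch of the padding-key rule the warp merge sort above depends on, written against std::numeric_limits rather than at::numeric_limits; padding_key is an illustrative name. Out-of-range lanes are filled with a key that is guaranteed to sort to the end, so the merge network never moves real elements past padding.

#include <iostream>
#include <limits>

template <typename K>
K padding_key(bool descending) {
  if (descending) {
    // The smallest value sorts last in a descending sort.
    return std::numeric_limits<K>::lowest();
  }
  if constexpr (std::numeric_limits<K>::has_quiet_NaN) {
    // Ascending: NaN is ordered after +inf, so it pads the tail.
    return std::numeric_limits<K>::quiet_NaN();
  } else {
    return std::numeric_limits<K>::max();
  }
}

int main() {
  std::cout << padding_key<float>(false) << ' '   // nan
            << padding_key<float>(true)  << ' '   // lowest float
            << padding_key<int>(false)   << '\n'; // 2147483647
}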
+struct MediumRadixSort { + + template + void sort( + at::zoom::detail::TensorInfo keyInfo, + IndexType keySlices, + IndexType keySliceSize, + IndexType keySliceStride, + at::zoom::detail::TensorInfo valueInfo, + IndexType valueSliceStride, + bool descending) { + +#define HANDLE_CASE(SIZE, ITEMS_PER_THREAD) \ + fixed_size_sort( \ + keyInfo, \ + keySlices, \ + keySliceSize, \ + keySliceStride, \ + valueInfo, \ + valueSliceStride, \ + descending) + + int64_t ceilPowerOf2 = nextHighestPowerOf2(keySliceSize); + TORCH_INTERNAL_ASSERT(ceilPowerOf2 <= 4096); + switch (ceilPowerOf2) { + case 4096: + HANDLE_CASE(4096, 32); + break; + case 2048: + HANDLE_CASE(2048, 32); + break; + case 1024: + case 512: + case 256: + HANDLE_CASE(1024, 32); + break; + case 128: + case 64: +#if !HAS_WARP_MERGE_SORT() + HANDLE_CASE(128, 4); + break; +#endif + case 32: + case 16: + case 8: + case 4: + case 2: +#if HAS_WARP_MERGE_SORT() + TORCH_INTERNAL_ASSERT( + false, "Expected size <= 128 to be handled by a different algorithm"); +#else + HANDLE_CASE(32, 2); +#endif + break; + case 1: + /* Nothing to do, data already sorted */ + break; + default: + TORCH_INTERNAL_ASSERT(false); + } +#undef HANDLE_CASE + + } + + template + void fixed_size_sort( + at::zoom::detail::TensorInfo keyInfo, + IndexType keySlices, + IndexType keySliceSize, + IndexType keySliceStride, + at::zoom::detail::TensorInfo valueInfo, + IndexType valueSliceStride, + bool descending) { + static_assert(sort_size % items_per_thread == 0, ""); + constexpr int block = sort_size / items_per_thread; + dim3 grid; + TORCH_INTERNAL_ASSERT(getGridFromTiles(keySlices, grid), + "Too many slices to sort"); + + const auto stream = c10::zoom::getCurrentZoomStream(); + hipLaunchKernelGGL(( radixSortKVInPlace) + , dim3(grid), dim3(block), 0, stream, + keyInfo, + keySlices, + keySliceSize, + keySliceStride, + valueInfo, + valueSliceStride, + descending); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } +}; + +template +void sortCommon(Sorter sorter, const TensorBase &key, const TensorBase &value, + int dim, bool descending) { + TORCH_CHECK(key.sizes() == value.sizes(), + "Key tensor must have same size as value tensor"); + int dims = value.dim(); + TORCH_CHECK(dims <= MAX_DIMS, "value tensor has too many dimensions"); + // if key and value tensors have the same size, we do not need to check both + + ptrdiff_t inElements = key.numel(); + + if (inElements == 0) { + return; + } + + int64_t keySliceSize = key.size(dim); + ptrdiff_t keySlices = inElements / keySliceSize; + +#define HANDLE_SORT_CASE(TYPE, A) \ + sorter.template sort( \ + keyInfo, \ + (TYPE) keySlices, \ + (TYPE) keySliceSize, \ + (TYPE) keyInfo.strides[collapseKeyDim], \ + valueInfo, \ + (TYPE) valueInfo.strides[collapseValueDim], \ + descending) + + // The constructed key/value tensor info is used to select the slice + // we are sorting on a per-block basis + // The constructed key/value tensor info is used to select the slice + // we are sorting on a per-block basis + AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, key.scalar_type(), "sortKeyValueInplace", [&] { + if (at::zoom::detail::canUse32BitIndexMath(key)) { + at::zoom::detail::TensorInfo keyInfo = + at::zoom::detail::getTensorInfo(key); + at::zoom::detail::TensorInfo valueInfo = + at::zoom::detail::getTensorInfo(value); + + auto strideKey = keyInfo.strides[dim]; + keyInfo.sizes[dim] = 1; + int collapseKeyDim = keyInfo.collapseDims(dim); + keyInfo.strides[collapseKeyDim] = strideKey; + auto strideValue = 
valueInfo.strides[dim]; + valueInfo.sizes[dim]=1; + int collapseValueDim = valueInfo.collapseDims(dim); + valueInfo.strides[collapseValueDim] = strideValue; + + if (keyInfo.isContiguous()) { + HANDLE_SORT_CASE(unsigned int, -2); + } else { + switch (keyInfo.dims) { + case 2: + HANDLE_SORT_CASE(unsigned int, 2); + break; + default: + HANDLE_SORT_CASE(unsigned int, -1); + break; + } + } + + } else { + at::zoom::detail::TensorInfo keyInfo = + at::zoom::detail::getTensorInfo(key); + at::zoom::detail::TensorInfo valueInfo = + at::zoom::detail::getTensorInfo(value); + + auto strideKey = keyInfo.strides[dim]; + keyInfo.sizes[dim] = 1; + int collapseKeyDim = keyInfo.collapseDims(dim); + keyInfo.strides[collapseKeyDim] = strideKey; + auto strideValue = valueInfo.strides[dim]; + valueInfo.sizes[dim]=1; + int collapseValueDim = valueInfo.collapseDims(dim); + valueInfo.strides[collapseValueDim] = strideValue; + + // int64_t case is rare, just instantiate the generic version + HANDLE_SORT_CASE(uint64_t, -1); + } + }); +#undef HANDLE_SORT_CASE +} + +void sortKeyValueInplace( + const TensorBase& key, + const TensorBase& value, + int dim, + bool descending, + bool stable) { + const auto sort_size = key.size(dim); + if (sort_size <= 1) { + return; // Already sorted + } else if (!stable && sort_size <= 32) { + // NOTE: Bitonic sort is unstable + sortCommon(SmallBitonicSort{}, key, value, dim, descending); +#if HAS_WARP_MERGE_SORT() + } else if (sort_size <= 128) { + sortCommon(WarpMergeSort<128>{}, key, value, dim, descending); +#endif + } else { + sortCommon(MediumRadixSort{}, key, value, dim, descending); + } +} + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/Sort.h b/aten/src/ATen/native/zoom/Sort.h new file mode 100644 index 00000000000000..77f33a5b8d7634 --- /dev/null +++ b/aten/src/ATen/native/zoom/Sort.h @@ -0,0 +1,17 @@ +#pragma once +#include +#include +#include + +namespace at { +namespace native { + +inline bool should_use_small_sort(const TensorBase &self, int64_t dim) { + return self.size(dim) <= 4096; +} + +void sortKeyValueInplace( + const TensorBase &key, const TensorBase &value, int dim, + bool descending, bool stable=false); + +}} // namespace at::native diff --git a/aten/src/ATen/native/zoom/SortImpl.cu b/aten/src/ATen/native/zoom/SortImpl.cu new file mode 100644 index 00000000000000..5d779d0fd15ce5 --- /dev/null +++ b/aten/src/ATen/native/zoom/SortImpl.cu @@ -0,0 +1,37 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include + +namespace at::native { + +std::vector infer_dense_strides_dim_last(const Tensor & self, int64_t dim) { + int64_t ndim = self.dim(); + // sort the strides in descending order according to its value, + // keeping dim the last. 
+ std::vector strides = self.strides().vec(); + strides[dim] = -1; + std::vector original_dim(ndim); + for (int64_t i = 0; i < ndim; i++) { + original_dim[i] = i; + } + thrust::stable_sort_by_key( + thrust::host, strides.data(), strides.data() + ndim, original_dim.data(), + thrust::greater() + ); + // generate contiguous strides on permuted dims + std::vector new_strides(ndim); + std::vector new_strides_unsort(ndim); + int64_t cumprod = 1; + for (int64_t i = 0; i < ndim; i++) { + new_strides[ndim - 1 - i] = cumprod; + cumprod *= self.sizes()[original_dim[ndim - 1 - i]]; + } + // unsort new strides + for (int64_t i = 0; i < ndim; i++) { + new_strides_unsort[original_dim[i]] = new_strides[i]; + } + return new_strides_unsort; +} + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/SortStable.cu b/aten/src/ATen/native/zoom/SortStable.cu new file mode 100644 index 00000000000000..62df3c4379e8af --- /dev/null +++ b/aten/src/ATen/native/zoom/SortStable.cu @@ -0,0 +1,286 @@ +// !!! This is a file automatically generated by hipify!!! +#include + +#define TORCH_ASSERT_NO_OPERATORS +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace at::native { + +namespace { + +struct offset_t { + int stride; + int begin; + __device__ int operator[](int i) { + return stride * (begin + i); + } +}; +// Segmented sort by full sort algorithm:. +// Say we are sorting a (2, 3) tensor. We have in flattened form: +// values 0.4 1.2 5.3 6.2 1.3 2.3 +// indices 0 1 2 0 1 2 +// segment_id 0 0 0 1 1 1 + +// First we sort by values, globally: +// values 6.2 5.3 2.3 1.2 1.3 0.4 +// indices 0 2 2 1 1 0 +// segment_id 1 0 1 0 1 0 + +// Then we stable sort by segment id: +// values 5.3 1.2 0.4 6.2 2.3 1.3 +// indices 2 1 0 0 2 1 +// segment_id 0 0 0 1 1 1 + +// This method can only work if the slice we are sorting (`dim`) is +// innermost, and both values and indices are contiguous. We do this +// by re-arranging the input into this form as needed, which will +// unfortunately allocate memory if the request is not in this form. +// Vectorized sort is slower than iterated sort if the number of +// slices is small (since we're sorting twice, instead of invoking a +// smaller sort `numSlices` times), but the cub sort +// implementation here is a catch-all, so we're not looking for +// efficiency, but instead correctness. 
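Before the device kernels that implement it, here is a small host-side sketch of the two-pass idea just described, using std::sort and std::stable_sort on the flattened (2, 3) example from the comment: sort all (value, index) pairs globally, then stable-sort by segment id, which restores the per-segment grouping while keeping each segment internally sorted.

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> values = {0.4f, 1.2f, 5.3f, 6.2f, 1.3f, 2.3f};
  const int nsort = 3;  // slice (segment) length
  const int n = static_cast<int>(values.size());

  std::vector<int> order(n);  // permutation of flat indices
  for (int i = 0; i < n; ++i) order[i] = i;

  // Pass 1: global sort by value (descending, matching the example).
  std::sort(order.begin(), order.end(),
            [&](int a, int b) { return values[a] > values[b]; });

  // Pass 2: stable sort by segment id; stability preserves the value order
  // established in pass 1 within each segment.
  std::stable_sort(order.begin(), order.end(),
                   [&](int a, int b) { return a / nsort < b / nsort; });

  for (int i : order) {
    std::printf("value=%.1f  index-in-slice=%d  segment=%d\n",
                values[i], i % nsort, i / nsort);
  }
  // Prints segment 0 as 5.3 1.2 0.4 and segment 1 as 6.2 2.3 1.3,
  // matching the final state in the comment above.
}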
+ +template +__global__ void sort_postprocess_kernel( + const scalar_t* in, + scalar_t* out, + int64_t* index, + const int2* i_s_ptr, + int nsegments, + int nsort) { + HIP_KERNEL_LOOP(i, nsegments * nsort) { + int segment = i / nsort; + int j = i % nsort; + + int offset = segment * nsort; + const scalar_t* in_ = in + offset; + scalar_t* out_ = out + offset; + int64_t* index_ = index + offset; + const int2* i_s_ptr_ = i_s_ptr + offset; + + int idx = i_s_ptr_[j].y; + index_[j] = idx; + out_[j] = in_[idx]; + } +} + +C10_LAUNCH_BOUNDS_1(at::zoom::detail::HIP_NUM_THREADS) +__global__ void fill_index_and_segment_kernel( + int2* data, + int numel, + at::zoom::detail::IntDivider nsort_divider) { + HIP_KERNEL_LOOP(idx, numel) { + auto div_mod = nsort_divider.divmod(idx); + auto segment = static_cast(div_mod.div); + auto sort = static_cast(div_mod.mod); + data[idx] = int2{segment, sort}; + } +} + +C10_LAUNCH_BOUNDS_1(at::zoom::detail::HIP_NUM_THREADS) +__global__ void fill_reverse_indices_kernel( + int64_t* data, + int numel, + at::zoom::detail::IntDivider nsort_divider) { + HIP_KERNEL_LOOP(idx, numel) { + data[idx] = nsort_divider.mod(idx); + } +} + +template +inline void segmented_sort_large_segments( + const int64_t nsegments, + const int64_t nsort, + const int64_t n, + const bool descending, + const scalar_t* self_ptr, + scalar_t* values_ptr, + int64_t* indices_ptr) { + using namespace at::zoom::detail; + auto allocator = at::zoom::getZoomDeviceAllocator(); + auto stream = c10::zoom::getCurrentZoomStream(); + dim3 block = HIP_NUM_THREADS; + dim3 grid = GET_BLOCKS(nsort); + c10::DeviceArray indices(*allocator, nsort); + at::zoom::detail::IntDivider nsort_divider(nsort); + hipLaunchKernelGGL(( fill_reverse_indices_kernel), dim3(grid), dim3(block), 0, stream, + indices.get(), nsort, nsort_divider); + const int64_t* initial_indices = indices.get(); + + for (auto i : c10::irange(nsegments)) { + at::zoom::hipcub::radix_sort_pairs( + self_ptr, values_ptr, initial_indices, indices_ptr, nsort, descending); + indices_ptr += nsort; + self_ptr += nsort; + values_ptr += nsort; + } +} + +template +inline void segmented_sort_pairs_by_full_sort( + const int64_t nsegments, + const int64_t nsort, + const int64_t n, + const bool descending, + const scalar_t* const self_ptr, + scalar_t* const values_ptr, + int64_t* const indices_ptr) { + int64_t segment_bits = std::max( + 1L, static_cast(::ceil(std::log2(nsegments)))); + + const auto numel = nsort * nsegments; + auto zoom_allocator = at::zoom::getZoomDeviceAllocator(); + auto indices_and_segment = zoom_allocator->allocate(numel * sizeof(int2)); + auto i_s_ptr = static_cast(indices_and_segment.get()); + + using namespace at::zoom::detail; + dim3 block = HIP_NUM_THREADS; + dim3 grid = GET_BLOCKS(numel); + auto stream = c10::zoom::getCurrentZoomStream(); + at::zoom::detail::IntDivider nsort_divider(nsort); + hipLaunchKernelGGL(( fill_index_and_segment_kernel), dim3(grid), dim3(block), 0, stream, + i_s_ptr, numel, nsort_divider); + + auto indices_and_segment2 = + zoom_allocator->allocate(nsegments * nsort * sizeof(int2)); + auto i_s_ptr2 = static_cast(indices_and_segment2.get()); + + at::zoom::hipcub::radix_sort_pairs( + self_ptr, nullptr, i_s_ptr, i_s_ptr2, n, descending); + + TORCH_INTERNAL_ASSERT(segment_bits <= 32); + + // sort on lower 32bits, i.e. 
segment index + at::zoom::hipcub::radix_sort_keys( + reinterpret_cast(i_s_ptr2), + reinterpret_cast(i_s_ptr), + n, + false, + 0, + segment_bits); + + hipLaunchKernelGGL(( sort_postprocess_kernel), + dim3((n + 511) / 512), + dim3(512), + 0, + c10::zoom::getCurrentZoomStream(), + self_ptr, values_ptr, indices_ptr, i_s_ptr, nsegments, nsort); +} + +template +void segmented_sort_pairs( + int64_t nsegments, + int64_t nsort, + int64_t n, + bool descending, + const scalar_t* self_ptr, + scalar_t* values_ptr, + int64_t* indices_ptr) { + const auto numel = nsort * nsegments; + auto zoom_allocator = at::zoom::getZoomDeviceAllocator(); + auto reverse_indices = zoom_allocator->allocate(numel * sizeof(int64_t)); + int64_t* reverse_indices_ptr = static_cast(reverse_indices.get()); + + using namespace at::zoom::detail; + dim3 block = HIP_NUM_THREADS; + dim3 grid = GET_BLOCKS(numel); + auto stream = c10::zoom::getCurrentZoomStream(); + at::zoom::detail::IntDivider nsort_divider(nsort); + hipLaunchKernelGGL(( fill_reverse_indices_kernel), dim3(grid), dim3(block), 0, stream, + reverse_indices_ptr, numel, nsort_divider); + + at::zoom::hipcub::segmented_sort_pairs( + self_ptr, + values_ptr, + reverse_indices_ptr, + indices_ptr, + n, + nsegments, + offset_t{(int)nsort, 0}, + offset_t{(int)nsort, 1}, + descending); +} + +} // namespace + +void launch_stable_sort_kernel( + const TensorBase& self, + int64_t dim, + bool descending, + const TensorBase& values, + const TensorBase& indices) { + const auto numel = self.numel(); + if (numel == 0) { + return; + } + + int64_t numel_or_intmax = + ::min(numel, static_cast(std::numeric_limits::max())); + int64_t nsort = self.size(dim); + int64_t nbatch = (numel_or_intmax / nsort) * nsort; + TORCH_CHECK(nbatch > 0, "Cannot sort dimension of length ", nsort); + int64_t* indices_ptr = indices.mutable_data_ptr(); + + AT_DISPATCH_ALL_TYPES_AND3( + kBool, kHalf, kBFloat16, self.scalar_type(), "sort", [&] { + const scalar_t* self_ptr = self.const_data_ptr(); + scalar_t* values_ptr = values.mutable_data_ptr(); + int64_t remaining = numel; + while (remaining > 0) { + int64_t n = ::min(remaining, nbatch); + int64_t nsegments = n / nsort; + + if (nsegments == 1 || + nsort >= 1000000) { // rough heuristics where even a single + // sort occupies GPU + segmented_sort_large_segments( + nsegments, + nsort, + n, + descending, + self_ptr, + values_ptr, + indices_ptr); + } else if (nsegments < 128) { + segmented_sort_pairs_by_full_sort( + nsegments, + nsort, + n, + descending, + self_ptr, + values_ptr, + indices_ptr); + } else { + segmented_sort_pairs( + nsegments, + nsort, + n, + descending, + self_ptr, + values_ptr, + indices_ptr); + } + + remaining -= n; + self_ptr += n; + values_ptr += n; + indices_ptr += n; + } + }); +} + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/SortStable.h b/aten/src/ATen/native/zoom/SortStable.h new file mode 100644 index 00000000000000..039c4307c522c9 --- /dev/null +++ b/aten/src/ATen/native/zoom/SortStable.h @@ -0,0 +1,19 @@ +#pragma once +#include +#include + +namespace at { +namespace native { + +// Stable-sort self into values, and set indices to the +// inverse-permutation from values back to self. +// Output tensors must be pre-allocated and contiguous. 
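For orientation before the header declaration below: a hedged sketch of the routing heuristic that launch_stable_sort_kernel above applies to each batch. The thresholds are taken from the code; StableSortPath and pick_path are illustrative names.

#include <cstdint>
#include <cstdio>

enum class StableSortPath { PerSegmentRadix, FullSortTwoPass, SegmentedPairs };

StableSortPath pick_path(int64_t nsegments, int64_t nsort) {
  if (nsegments == 1 || nsort >= 1000000) {
    // A single segment, or slices long enough that one sort fills the GPU.
    return StableSortPath::PerSegmentRadix;
  }
  if (nsegments < 128) {
    // Few segments: sort everything once, then regroup by segment id.
    return StableSortPath::FullSortTwoPass;
  }
  // Many small segments: hand the work to the segmented pairs sort.
  return StableSortPath::SegmentedPairs;
}

int main() {
  std::printf("%d %d %d\n",
              static_cast<int>(pick_path(1, 4096)),       // PerSegmentRadix
              static_cast<int>(pick_path(64, 4096)),      // FullSortTwoPass
              static_cast<int>(pick_path(100000, 256)));  // SegmentedPairs
}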
+void launch_stable_sort_kernel( + const TensorBase& self, + int64_t dim, + bool descending, + const TensorBase& values, + const TensorBase& indices); + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/zoom/SortUtils.cuh b/aten/src/ATen/native/zoom/SortUtils.cuh new file mode 100644 index 00000000000000..95197f75cedba0 --- /dev/null +++ b/aten/src/ATen/native/zoom/SortUtils.cuh @@ -0,0 +1,333 @@ +// !!! This is a file automatically generated by hipify!!! +#include +#pragma once +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define HAS_WARP_MERGE_SORT() (TORCH_HIP_VERSION >= 110600) + + +namespace at { namespace native { + +template +__device__ inline void swapVars(T& t1, T& t2) { + T tmp = t1; + t1 = t2; + t2 = tmp; +} + +template +__device__ inline void bitonicSwap(K& kA, V& vA, bool& validA, + K& kB, V& vB, bool& validB, + bool dir, + const Comparator& comp) { + // Invalid entries always sort to the end + bool swap = (comp(kA, kB) && validA) || !validB; + if (swap == dir) { + swapVars(kA, kB); + swapVars(vA, vB); + swapVars(validA, validB); + } +}; + +template +__device__ inline void bitonicSort(K *keys, + V *values, + bool *valid, + const Comparator& comp) { + for (unsigned int size = 2; size < Power2SortSize; size *= 2) { + bool flag = ((threadIdx.x & (size / 2)) != 0); + + for (unsigned int stride = size / 2; stride > 0; stride /= 2) { + __syncthreads(); + unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + bitonicSwap( + keys[pos], values[pos], valid[pos], + keys[pos + stride], values[pos + stride], valid[pos + stride], + flag, comp); + } + } + + for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) { + __syncthreads(); + unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + bitonicSwap( + keys[pos], values[pos], valid[pos], + keys[pos + stride], values[pos + stride], valid[pos + stride], + false, comp); + } + + __syncthreads(); + +} + +// at::zoom::detail::TensorInfo version +// Sorts (key, value) pairs (in different tensors) in-place; i.e., +// modifies the input `keys` and `values` +template +C10_LAUNCH_BOUNDS_1(block_dim_x * max_block_dim_y) +__global__ void +bitonicSortKVInPlace(at::zoom::detail::TensorInfo keys, + IndexType keySlices, + IndexType keySliceSize, + IndexType keySliceStride, + at::zoom::detail::TensorInfo values, + IndexType valueSliceStride, + Comparator comp) { + // Find the slice of the tensor that we are sorting + // NOTE: blockDim.y may be less max_block_dim_y + const IndexType blockIndex = getLinearBlockId(); + const IndexType linearIndex = blockIndex * blockDim.y + threadIdx.y; + + // If the entire block is out of bounds exit early + if (blockIndex * blockDim.y >= keySlices) { + return; + } + // It's also possible for some rows of a block to be out of bounds + // but all thread need to run for __syncthreads to work. 
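Before the per-row validity handling continues below, a serial host-side rendering of the bitonic network that bitonicSort above implements; the device version additionally interleaves __syncthreads and carries validity flags so padded slots sort to the end.

#include <algorithm>
#include <cstdio>
#include <vector>

// Serial bitonic sort of a power-of-two sized array, ascending overall.
void bitonic_sort(std::vector<int>& a) {
  const size_t n = a.size();  // must be a power of two
  for (size_t size = 2; size <= n; size *= 2) {          // bitonic run length
    for (size_t stride = size / 2; stride > 0; stride /= 2) {
      for (size_t i = 0; i < n; ++i) {
        const size_t j = i ^ stride;                     // compare-exchange partner
        if (j > i) {
          const bool ascending = ((i & size) == 0);      // direction of this run
          if ((ascending && a[i] > a[j]) || (!ascending && a[i] < a[j])) {
            std::swap(a[i], a[j]);
          }
        }
      }
    }
  }
}

int main() {
  std::vector<int> a = {7, 3, 6, 0, 5, 2, 1, 4};
  bitonic_sort(a);
  for (int v : a) std::printf("%d ", v);  // 0 1 2 3 4 5 6 7
  std::printf("\n");
}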
+ const bool row_valid = linearIndex < keySlices; + + constexpr int items_per_thread = 2; + constexpr int Power2SortSize = block_dim_x * items_per_thread; + + // Storage for max_block_dim_y sorts performed in parallel + __shared__ K blockSharedKeys[max_block_dim_y][Power2SortSize]; + __shared__ V blockSharedValues[max_block_dim_y][Power2SortSize]; + __shared__ bool blockSharedValid[max_block_dim_y][Power2SortSize]; + + auto sharedKeys = blockSharedKeys[threadIdx.y]; + auto sharedValues = blockSharedValues[threadIdx.y]; + auto sharedValid = blockSharedValid[threadIdx.y]; + + const IndexType keyStartOffset = + at::zoom::detail::IndexToOffset::get(linearIndex, keys); + const IndexType valueStartOffset = + at::zoom::detail::IndexToOffset::get(linearIndex, values); + + // Load 2 values per thread into the shared workspace + #pragma unroll + for (int k = 0; k < items_per_thread; ++k) { + auto idx = threadIdx.x + k * blockDim.x; + bool valid = row_valid && idx < keySliceSize; + + sharedKeys[idx] = valid ? + keys.data[idx * keySliceStride + keyStartOffset] : K{}; + sharedValues[idx] = valid ? + values.data[idx * valueSliceStride + valueStartOffset] : V{}; + sharedValid[idx] = valid; + } + + // Sort! + bitonicSort( + sharedKeys, sharedValues, sharedValid, comp); + + if (!row_valid) { + return; + } + + // Store outputs + #pragma unroll + for (int k = 0; k < items_per_thread; ++k) { + auto idx = threadIdx.x + k * blockDim.x; + if (idx < keySliceSize) { + keys.data[idx * keySliceStride + keyStartOffset] = sharedKeys[idx]; + values.data[idx * valueSliceStride + valueStartOffset] = sharedValues[idx]; + } + } +} + +#if HAS_WARP_MERGE_SORT() + +template +C10_LAUNCH_BOUNDS_1(C10_WARP_SIZE * max_block_dim_y) +__global__ void +warpMergeSortKVInPlace( + at::zoom::detail::TensorInfo keys, + IndexType keySlices, + IndexType keySliceSize, + IndexType keySliceStride, + at::zoom::detail::TensorInfo values, + IndexType valueSliceStride, + Comparator comp, + K invalid_key) { + // Find the slice of the tensor that we are sorting + // NOTE: blockDim.y may be less max_block_dim_y + const IndexType blockIndex = getLinearBlockId(); + const IndexType linearIndex = blockIndex * blockDim.y + threadIdx.y; + + // If this row is out of bounds exit early + if (linearIndex >= keySlices) { + return; + } + + const IndexType keyStartOffset = + at::zoom::detail::IndexToOffset::get(linearIndex, keys); + const IndexType valueStartOffset = + at::zoom::detail::IndexToOffset::get(linearIndex, values); + + K *keys_slice = &keys.data[keyStartOffset]; + V *values_slice = &values.data[valueStartOffset]; + + StridedRandomAccessor keys_iter(keys_slice, keySliceStride); + StridedRandomAccessor values_iter(values_slice, valueSliceStride); + + namespace cub = ROCM_HIPCUB(at_zoom_detail::cub); + + ZOOM_KERNEL_ASSERT(blockDim.x == C10_WARP_SIZE); + ZOOM_KERNEL_ASSERT(blockDim.y <= max_block_dim_y); + constexpr int items_per_thread = sort_size / C10_WARP_SIZE; + static_assert( + items_per_thread * C10_WARP_SIZE == sort_size, + "sort_size must be a multiple of C10_WARP_SIZE"); + + + using LoadKeys = cub::WarpLoad; + using LoadValues = cub::WarpLoad; + using Sort = cub::WarpMergeSort; + using StoreKeys = cub::WarpStore; + using StoreValues = cub::WarpStore; + + __shared__ union { + typename LoadKeys::TempStorage load_keys; + typename LoadValues::TempStorage load_values; + typename Sort::TempStorage sort; + typename StoreKeys::TempStorage store_keys; + typename StoreValues::TempStorage store_values; + } tmp_storage[max_block_dim_y]; + + auto& 
warp_storage = tmp_storage[threadIdx.y]; + + // Load inputs + K local_keys[items_per_thread]; + V local_values[items_per_thread]; + + const auto invalid_value = V{}; + LoadKeys(warp_storage.load_keys).Load(keys_iter, local_keys, keySliceSize, invalid_key); + WARP_SYNC(); + LoadValues(warp_storage.load_values).Load(values_iter, local_values, keySliceSize, invalid_value); + WARP_SYNC(); + + // Sort! We use stable sort to ensure that invalid values are never + // sorted before valid values. In testing it performed the same as + // .Sort, so there is no down-side. + Sort(warp_storage.sort).StableSort( + local_keys, local_values, comp, keySliceSize, invalid_key); + WARP_SYNC(); + + // Store outputs + StoreKeys(warp_storage.store_keys).Store(keys_iter, local_keys, keySliceSize); + WARP_SYNC(); + StoreValues(warp_storage.store_values).Store(values_iter, local_values, keySliceSize); +} + +#endif // HAS_WARP_MERGE_SORT() + +template +C10_LAUNCH_BOUNDS_1(block_size) +__global__ void +radixSortKVInPlace(at::zoom::detail::TensorInfo keys, + IndexType keySlices, + IndexType keySliceSize, + IndexType keySliceStride, + at::zoom::detail::TensorInfo values, + IndexType valueSliceStride, + bool descending) { + static_assert(block_size > 0, ""); + + // Find the slice of the tensor that we are sorting + const IndexType linearIndex = getLinearBlockId(); + // Tiling the slices could have us be out of bounds, if there are a + // lot of slices to sort + if (linearIndex >= keySlices) { + return; + } + + const IndexType keyStartOffset = + at::zoom::detail::IndexToOffset::get(linearIndex, keys); + const IndexType valueStartOffset = + at::zoom::detail::IndexToOffset::get(linearIndex, values); + + K *keys_slice = &keys.data[keyStartOffset]; + V *values_slice = &values.data[valueStartOffset]; + + StridedRandomAccessor keys_iter(keys_slice, keySliceStride); + StridedRandomAccessor values_iter(values_slice, valueSliceStride); + + namespace cub = ROCM_HIPCUB(at_zoom_detail::cub); + + using key_t = typename at::zoom::hipcub::detail::hip_type::type; + using LoadKeys = hipcub::BlockLoad; + using LoadValues = hipcub::BlockLoad; + using Sort = cub::BlockRadixSort; + using StoreKeys = hipcub::BlockStore; + using StoreValues = hipcub::BlockStore; + + __shared__ union { + typename LoadKeys::TempStorage load_keys; + typename LoadValues::TempStorage load_values; + typename Sort::TempStorage sort; + typename StoreKeys::TempStorage store_keys; + typename StoreValues::TempStorage store_values; + } tmp_storage; + + // cub's Block operations operate on a fixed number of items, but the + // actual slice we are sorting might be smaller. So, we need to make + // up the difference with keys that will always sort higher. + const K invalid_key = [descending] { + using radix_t = typename cub::Traits::UnsignedBits; + union { + K key; + radix_t radix; + } tmp; + tmp.radix = descending ? + cub::Traits::LOWEST_KEY : + cub::Traits::MAX_KEY; + return tmp.key; + }(); + const V invalid_value = static_cast(0); + + // Load inputs + K local_keys[items_per_thread]; + V local_values[items_per_thread]; + + LoadKeys(tmp_storage.load_keys).Load(keys_iter, local_keys, keySliceSize, invalid_key); + __syncthreads(); + LoadValues(tmp_storage.load_values).Load(values_iter, local_values, keySliceSize, invalid_value); + __syncthreads(); + + // Sort! 
+ if (descending) { + Sort(tmp_storage.sort).SortDescending( + reinterpret_cast(local_keys), + local_values); + } else { + Sort(tmp_storage.sort).Sort( + reinterpret_cast(local_keys), + local_values); + } + __syncthreads(); + + // Store outputs + StoreKeys(tmp_storage.store_keys).Store(keys_iter, local_keys, keySliceSize); + __syncthreads(); + StoreValues(tmp_storage.store_values).Store(values_iter, local_values, keySliceSize); +} + +}} // at::native diff --git a/aten/src/ATen/native/zoom/Sorting.cpp b/aten/src/ATen/native/zoom/Sorting.cpp new file mode 100644 index 00000000000000..405184c65a32f3 --- /dev/null +++ b/aten/src/ATen/native/zoom/Sorting.cpp @@ -0,0 +1,208 @@ +// !!! This is a file automatically generated by hipify!!! +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + +namespace at::native { +namespace { + +std::tuple kthvalue_out_impl_zoom( + Tensor& values, + Tensor& indices, + const Tensor& self, + int64_t k, + int64_t dim_, + bool keepdim) { + int64_t dim = maybe_wrap_dim(dim_, self.dim()); + int64_t slicesize = self.dim() == 0 ? 1 : self.size(dim); + zero_numel_check_dims(self, dim, "kthvalue()"); + + TORCH_CHECK(k >= 1 && k <= slicesize, + "kthvalue(): selected number k out of range for dimension ", dim); + + at::assert_no_overlap(self, values); + + _reduction_with_indices_allocate_or_resize_output( + values, indices, self, dim, keepdim); + if (self.dim() == 0 && self.numel() == 1) { + values.copy_(self); + indices.zero_(); + return std::forward_as_tuple(values, indices); + } + + TORCH_CHECK( + self.dim() <= MAX_TENSORINFO_DIMS, + "cannot operate on more than ", + MAX_TENSORINFO_DIMS, + " dimensions"); + + // Based on required index size, run the algorithm with the + // appropriate index type + if (self.numel() != 0) { + launch_kthvalue_kernel(values, indices, self, dim, k); + } + + if (!keepdim) { + values.squeeze_(dim); + indices.squeeze_(dim); + } + return std::forward_as_tuple(values, indices); +} + +std::tuple median_with_indices_impl( + Tensor& values, + Tensor& indices, + const Tensor& self, + int64_t dim, + bool keepdim, + bool ignore_nan) { + // See note [Writing Nondeterministic Operations] + // If there are duplicate elements of a median value, the procedure for choosing which + // of the duplicates to use for the indices output is nondeterministic. + at::globalContext().alertNotDeterministic("median Zoom with indices output"); + NoNamesGuard guard; + + dim = at::maybe_wrap_dim(dim, self.dim()); + Tensor in = self.dim() > 0 ? 
self.contiguous() : self.unsqueeze(0); + + checkDeviceType("median", {values, indices}, self.device().type()); + checkScalarType("median", {indices, "indices", 1}, kLong); + checkSameType("median", {values, "values", 0}, {self, "self", 2}); + + TORCH_CHECK( + self.dim() <= MAX_TENSORINFO_DIMS, + "median() cannot operate on more than ", + MAX_TENSORINFO_DIMS, + " dimensions"); + + std::vector out_shape = self.sizes().vec(); + zero_numel_check_dims(self, dim, "median()"); + if (self.dim() > 0) { + assert(dim >= 0); + assert(dim < static_cast(out_shape.size())); + + if (keepdim) { + out_shape[dim] = 1; + } else { + out_shape.erase(out_shape.begin() + dim); + } + } + + values.resize_(out_shape); + indices.resize_(out_shape); + + // Only launch kernel for non-empty tensors + if (self.numel() > 0) { + // Ensure #dim is the same for all tensors required for reduction + Tensor vals = keepdim && self.dim() > 0 ? values : values.unsqueeze(dim); + Tensor inds = keepdim && self.dim() > 0 ? indices : indices.unsqueeze(dim); + + launch_median_kernel(vals, inds, in, dim, ignore_nan); + } + + guard.reset(); + namedinference::propagate_names_for_reduction(values, self, dim, keepdim); + namedinference::propagate_names_for_reduction(indices, self, dim, keepdim); + + return std::forward_as_tuple(values, indices); +} + +Tensor median_impl(const Tensor& self, bool ignore_nan) { + NoNamesGuard guard; + + int64_t size = self.numel(); + // Return nan for empty tensors + if (size <= 0) { + return at::full({}, std::numeric_limits::quiet_NaN()).to(self.options()); + } + + // Sort input tensor to efficiently query for median element + Tensor sorted = std::get<0>(self.flatten().sort()); + + if (!ignore_nan) { + // For torch.median return either the middle element or nan (sorted as + // largest) if there are any + int64_t k = (size - 1) / 2; + return at::where(sorted[-1].isnan(), sorted[-1], sorted[k]); + } else { + // For torch.nanmedian return the middle element among the non-nan values + int64_t k = ((size - 1) - sorted.isnan().sum().item()) / 2; + return sorted[k].clone(); // Clone so we aren't keeping `sorted` alive + } +} + +} // namespace (anonymous) + +std::tuple kthvalue_out_zoom( + const Tensor& self, + int64_t k, + int64_t dim, + bool keepdim, + Tensor& values, + Tensor& indices) { + // See note [Writing Nondeterministic Operations] + // If there are duplicate elements of the kth value, the procedure for choosing which + // of the duplicates to use for the indices output is nondeterministic. + at::globalContext().alertNotDeterministic("kthvalue Zoom"); + auto result = [&]() { + NoNamesGuard guard; + // `kthvalue_out_impl_zoom` expects contiguous in input `self`. 
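Stepping back before kthvalue_out_zoom's body continues below: a hedged host-side sketch of the selection rules implemented by median_impl above. torch.median propagates NaN (the sorted tail is NaN whenever any input is NaN), while torch.nanmedian picks the middle of the non-NaN values; median_of is an illustrative name.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

float median_of(std::vector<float> v, bool ignore_nan) {
  // Sort with NaNs ordered last, mirroring how the sorted tensor is used above.
  std::sort(v.begin(), v.end(), [](float a, float b) {
    if (std::isnan(b)) return !std::isnan(a);
    if (std::isnan(a)) return false;
    return a < b;
  });
  const long long size = static_cast<long long>(v.size());
  if (!ignore_nan) {
    // median semantics: any NaN makes the result NaN.
    if (std::isnan(v.back())) return v.back();
    return v[(size - 1) / 2];
  }
  // nanmedian semantics: middle element among the non-NaN prefix.
  long long nan_count = std::count_if(v.begin(), v.end(),
                                      [](float x) { return std::isnan(x); });
  return v[((size - 1) - nan_count) / 2];
}

int main() {
  std::vector<float> x = {3.f, NAN, 1.f, 2.f};
  std::printf("median=%f nanmedian=%f\n", median_of(x, false), median_of(x, true));
  // median is NaN because of the NaN input; nanmedian is 2, the middle of {1, 2, 3}.
}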
+ return kthvalue_out_impl_zoom(values, indices, self.contiguous(), k, dim, keepdim); + }(); + namedinference::propagate_names_for_reduction(values, self, dim, keepdim); + namedinference::propagate_names_for_reduction(indices, self, dim, keepdim); + return result; +} + +// Mark: median + +std::tuple median_out_zoom( + const Tensor& self, + int64_t dim, + bool keepdim, + Tensor& values, + Tensor& indices) { + return median_with_indices_impl( + values, indices, self, dim, keepdim, /*ignore_nan=*/false); +} + +Tensor median_zoom(const Tensor& self) { + return median_impl(self, /*ignore_nan=*/false); +} + +std::tuple nanmedian_out_zoom( + const Tensor& self, + int64_t dim, + bool keepdim, + Tensor& values, + Tensor& indices) { + return median_with_indices_impl( + values, indices, self, dim, keepdim, /*ignore_nan=*/true); +} + +Tensor nanmedian_zoom(const Tensor& self) { + return median_impl(self, /*ignore_nan=*/true); +} + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/Sorting.cu b/aten/src/ATen/native/zoom/Sorting.cu new file mode 100644 index 00000000000000..e3a0a647fc8181 --- /dev/null +++ b/aten/src/ATen/native/zoom/Sorting.cu @@ -0,0 +1,282 @@ +// !!! This is a file automatically generated by hipify!!! +#include +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace at::native { + +namespace { + +// Finds the rank k element, and its index, of the values along dimension dim +template +__global__ void gatherKthValue( + zoom::detail::TensorInfo input, + index_t inputSliceSize, + index_t k, + index_t numInputSlices, + index_t inputWithinSliceStride, + zoom::detail::TensorInfo kthValue, + zoom::detail::TensorInfo indices) { + // Indices are limited to integer fp precision, so counts can fit in + // int32, regardless of index_t + __shared__ int smem[C10_WARP_SIZE]; // one per each warp, up to warp limit + + index_t slice = getLinearBlockId(); + if (slice >= numInputSlices) { + return; + } + + // Find the start offset for our slice + index_t sliceStartIndex = + zoom::detail::IndexToOffset::get(slice, input); + index_t kthValueSliceStartIndex = + zoom::detail::IndexToOffset::get(slice, kthValue); + index_t indicesSliceStartIndex = + zoom::detail::IndexToOffset::get(slice, indices); + + const scalar_t* inputSliceStart = &input.data[sliceStartIndex]; + scalar_t* kthValueSliceStart = &kthValue.data[kthValueSliceStartIndex]; + int64_t* indicesSliceStart = &indices.data[indicesSliceStartIndex]; + + // Find the k-th highest element in our input + scalar_t kValue = static_cast(0); + radixSelect< + scalar_t, + typename TopKTypeConfig::RadixType, + index_t>( + inputSliceStart, + k, + false, + inputSliceSize, + inputWithinSliceStride, + smem, + &kValue); + + // Find the index of the k-th highest element + index_t kValueIndex = 0; + bool foundKValue = false; + + for (index_t i = threadIdx.x; i < inputSliceSize; i += blockDim.x) { + bool inRange = (i < inputSliceSize); + scalar_t v = inRange ? 
doLdg(&inputSliceStart[i * inputWithinSliceStride]) + : static_cast(0); + bool isKValue = inRange && + ((v == kValue) || (at::_isnan(v) && at::_isnan(kValue))); + if (isKValue) { + kValueIndex = i; + foundKValue = true; + break; + } + } + + if (foundKValue) { + kthValueSliceStart[0] = kValue; + indicesSliceStart[0] = kValueIndex; + } +} + +// CUDA kernel to find the median, and its index, of the values along dimension dim +template +__global__ void gatherMedian( + zoom::detail::TensorInfo values, + zoom::detail::TensorInfo indices, + zoom::detail::TensorInfo input, + index_t inputSliceSize, + index_t numInputSlices, + index_t inputWithinSliceStride, + bool ignore_nan) { + // Shared memory for the subroutine RadixSelect. Note that RadixSelect converts the + // floating point type to int with the same relative ordering. + __shared__ int smem[C10_WARP_SIZE]; // one per each warp, up to warp limit + + index_t slice = getLinearBlockId(); + if (slice >= numInputSlices) { + return; + } + + // Finds the start offset for our slice + index_t valuesSliceStartIndex = + zoom::detail::IndexToOffset::get(slice, values); + index_t indicesSliceStartIndex = + zoom::detail::IndexToOffset::get(slice, indices); + index_t inputSliceStartIndex = + zoom::detail::IndexToOffset::get(slice, input); + + scalar_t* valuesSliceStart = &values.data[valuesSliceStartIndex]; + int64_t* indicesSliceStart = &indices.data[indicesSliceStartIndex]; + const scalar_t* inputSliceStart = &input.data[inputSliceStartIndex]; + + index_t nan_count = 0; + for (index_t i = threadIdx.x; i < inputSliceSize; i += blockDim.x) { + scalar_t val = doLdg(&inputSliceStart[i * inputWithinSliceStride]); + nan_count += at::_isnan(val) ? 1 : 0; + } + + // Counts number of nan values + // This code performs a parallel sum reduction (not the most efficient code) + __shared__ int64_t num_nan; + if (threadIdx.x == 0) { + num_nan = 0; + } + __syncthreads(); + if (nan_count > 0) { + gpuAtomicAddNoReturn(&num_nan, nan_count); + } + __syncthreads(); + + // For torch.median, if we found nan set k to last index so the computed value + // is nan, otherwise set k to the middle element of the non-nan values + index_t k = (!ignore_nan && num_nan > 0) ? 
inputSliceSize - 1 + : (inputSliceSize - num_nan - 1) / 2; + + // Find the median + scalar_t median = static_cast(0); + radixSelect< + scalar_t, + typename TopKTypeConfig::RadixType, + index_t>( + inputSliceStart, + k + 1, + false, + inputSliceSize, + inputWithinSliceStride, + smem, + &median); + + valuesSliceStart[0] = median; + + // Find the index of the median value in the slice + for (index_t i = threadIdx.x; i < inputSliceSize; i += blockDim.x) { + scalar_t val = doLdg(&inputSliceStart[i * inputWithinSliceStride]); + if (val == median || (at::_isnan(val) && at::_isnan(median))) { + indicesSliceStart[0] = i; + break; + } + } +} + +struct KthValueLauncher { + int64_t k; + + KthValueLauncher(int64_t k) : k(k) {} + + template + inline void launch( + zoom::detail::TensorInfo values_info, + int collapse_values_dim, + zoom::detail::TensorInfo indices_info, + int collapse_indices_dim, + zoom::detail::TensorInfo self_info, + int collapse_self_dim, + int64_t num_slices, + int64_t slice_size) { + (void)collapse_indices_dim; // Suppress unused variable warning + dim3 grid; + if (!getGridFromTiles(num_slices, grid)) { + AT_ERROR("slices are too many"); + } + + dim3 block(::min( + round_up(slice_size, (int64_t)at::zoom::warp_size()), (int64_t)1024)); + auto stream = c10::zoom::getCurrentZoomStream(); + hipLaunchKernelGGL(( gatherKthValue), dim3(grid), dim3(block), 0, stream, + self_info, + slice_size, + k, + num_slices, + /* The actual dimension that the k-selection is running in */ + /* may have changed from collapseDims() */ + self_info.strides[collapse_self_dim], + values_info, + indices_info); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } +}; + +struct MedianLauncher { + bool ignore_nan; + + MedianLauncher(bool ignore_nan) : ignore_nan(ignore_nan) {} + + template + inline void launch( + zoom::detail::TensorInfo values_info, + int collapse_values_dim, + zoom::detail::TensorInfo indices_info, + int collapse_indices_dim, + zoom::detail::TensorInfo self_info, + int collapse_self_dim, + int64_t num_slices, + int64_t slice_size) { + (void)collapse_values_dim; // Suppress unused variable warning + (void)collapse_indices_dim; // Suppress unused variable warning + dim3 grid; + if (!getGridFromTiles(num_slices, grid)) { + AT_ERROR("slices are too many"); + } + + dim3 block(::min( + round_up(slice_size, (int64_t)at::zoom::warp_size()), (int64_t)1024)); + auto stream = c10::zoom::getCurrentZoomStream(); + hipLaunchKernelGGL(( gatherMedian), dim3(grid), dim3(block), 0, stream, + values_info, + indices_info, + self_info, + slice_size, + num_slices, + self_info.strides[collapse_self_dim], + ignore_nan); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } +}; + +} // namespace (anonymous) + +void launch_kthvalue_kernel( + const TensorBase &values, const TensorBase &indices, + const TensorBase &self, int64_t dim, int64_t k) { + AT_DISPATCH_ALL_TYPES_AND2( + at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "kthvalue_zoom", [&] { + AT_DISPATCH_INDEX_TYPES( + zoom::detail::canUse32BitIndexMath(self) && + zoom::detail::canUse32BitIndexMath(values) && + zoom::detail::canUse32BitIndexMath(indices) ? 
ScalarType::Int : ScalarType::Long, + "kth_value_launcher", [&] { + run_launcher( + values, indices, self, dim, KthValueLauncher(k)); + }); + }); +} + +void launch_median_kernel( + const TensorBase &vals, const TensorBase &inds, + const TensorBase &self, int64_t dim, bool ignore_nan) { + AT_DISPATCH_ALL_TYPES_AND2( + at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "median_out_impl", [&] { + if (zoom::detail::canUse32BitIndexMath(vals) && + zoom::detail::canUse32BitIndexMath(inds) && + zoom::detail::canUse32BitIndexMath(self)) { + run_launcher( + vals, inds, self, dim, MedianLauncher(ignore_nan)); + } else { + run_launcher( + vals, inds, self, dim, MedianLauncher(ignore_nan)); + } + }); +} + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/Sorting.h b/aten/src/ATen/native/zoom/Sorting.h new file mode 100644 index 00000000000000..bd10ffb1a02741 --- /dev/null +++ b/aten/src/ATen/native/zoom/Sorting.h @@ -0,0 +1,18 @@ +#pragma once +#include + +namespace at { +class TensorBase; +} + +namespace at { +namespace native { + +void launch_kthvalue_kernel( + const TensorBase &values, const TensorBase &indices, + const TensorBase &self, int64_t dim, int64_t k); +void launch_median_kernel( + const TensorBase &vals, const TensorBase &inds, + const TensorBase &in, int64_t dim, bool ignore_nan); + +}} // namespace at::native diff --git a/aten/src/ATen/native/zoom/SortingCommon.cuh b/aten/src/ATen/native/zoom/SortingCommon.cuh new file mode 100644 index 00000000000000..902145fd4fbfba --- /dev/null +++ b/aten/src/ATen/native/zoom/SortingCommon.cuh @@ -0,0 +1,188 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +// Is this questionable namespace pollution? +constexpr int MAX_BLOCK_SIZE = 256; + +// Maximum size per grid dimension that we assume (compute capability >= 2.0) +constexpr int64_t MAX_GRID_SIZE = 65535LL; + +static bool getGridFromTiles(int64_t gridTiles, dim3& grid) { + if (gridTiles > MAX_GRID_SIZE * MAX_GRID_SIZE * MAX_GRID_SIZE) { + return false; + } + + int64_t gridX = gridTiles > MAX_GRID_SIZE ? MAX_GRID_SIZE : gridTiles; + int64_t gridY = 1; + int64_t gridZ = 1; + + if (gridTiles > MAX_GRID_SIZE) { + gridTiles = ceil_div(gridTiles, MAX_GRID_SIZE); + gridY = gridTiles > MAX_GRID_SIZE ? MAX_GRID_SIZE : gridTiles; + + if (gridTiles > MAX_GRID_SIZE) { + gridTiles = ceil_div(gridTiles, MAX_GRID_SIZE); + gridZ = gridTiles > MAX_GRID_SIZE ? 
MAX_GRID_SIZE : gridTiles; + } + } + + grid = dim3(gridX, gridY, gridZ); + return true; +} + +template +struct GTOp { + __device__ bool operator()(const scalar_t& lhs, const scalar_t& rhs) const { + return (handleNaN && at::_isnan(lhs) && !at::_isnan(rhs)) || (lhs > rhs); + } +}; + +template +struct LTOp { + __device__ bool operator()(const scalar_t& lhs, const scalar_t& rhs) const { + return (handleNaN && at::_isnan(rhs) && !at::_isnan(lhs)) || (lhs < rhs); + } +}; + +template +__device__ __forceinline__ index_t getLinearBlockId() { + return blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + + blockIdx.x; +} + +// For slice sorting in Thrust; extracts a slice index from a linear +// index and uses that for comparison +struct SliceComp { + SliceComp(int64_t size) : sliceSize(size) {} + + __device__ bool operator()(const int64_t& a, const int64_t& b) const { + // Since the slices are guaranteed to be innermost, + // the segment is just via int64_t division + int64_t segA = a / sliceSize; + int64_t segB = b / sliceSize; + return segA < segB; + } + + const int64_t sliceSize; +}; + +// For sorting in Thurst; extracts a within-slice index from a linear index +struct GlobalIndexToPerSliceIndex { + GlobalIndexToPerSliceIndex(int64_t size) : sliceSize(size) {} + + __device__ inline void operator()(int64_t& v) const { + v = v % sliceSize; + } + + const int64_t sliceSize; +}; + +// Returns 2^(ceil(lg(n)) from Stanford bit twiddling hacks +static uint64_t nextHighestPowerOf2(uint64_t n) { + n--; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; +#ifndef _MSC_VER + n |= n >> 32; +#endif + n++; + + return n; +} + + +// WARNING: This function assumes input tensors are contiguous +template +void run_launcher( + const TensorBase &values, + const TensorBase &indices, + const TensorBase &self, + int64_t dim, + Launcher l) { + auto self_info = zoom::detail::getTensorInfo(self); + auto values_info = zoom::detail::getTensorInfo(values); + auto indices_info = zoom::detail::getTensorInfo(indices); + + int64_t slice_size = self.size(dim); + /* We use these structures solely to find the offset to */ + /* each slice we are operating on */ + self_info.reduceDim(dim); + values_info.reduceDim(dim); + indices_info.reduceDim(dim); + + /* Collapse all other dims */ + int collapse_self_dim = self_info.collapseDims(dim); + int collapse_values_dim = values_info.collapseDims(dim); + int collapse_indices_dim = indices_info.collapseDims(dim); + + int64_t num_slices = 1; + for (int i = 0; i < self_info.dims; ++i) { + num_slices *= self_info.sizes[i]; + } + + /* This is used as a template parameter to calculate indices. 
*/ + /* We only specialize it if all collapsed dim sizes are the */ + /* same; otherwise, we use -1 which is the specialization */ + /* parameter for arbitrary dimensions */ + int all_dims = self_info.dims; + if (values_info.dims != all_dims || indices_info.dims != all_dims) { + all_dims = -1; + } + + if (all_dims == 1) { + l.template launch( + values_info, + collapse_values_dim, + indices_info, + collapse_indices_dim, + self_info, + collapse_self_dim, + num_slices, + slice_size); + } else if (all_dims == 2) { + l.template launch( + values_info, + collapse_values_dim, + indices_info, + collapse_indices_dim, + self_info, + collapse_self_dim, + num_slices, + slice_size); + } else if (all_dims == 3) { + l.template launch( + values_info, + collapse_values_dim, + indices_info, + collapse_indices_dim, + self_info, + collapse_self_dim, + num_slices, + slice_size); + } else { + l.template launch( + values_info, + collapse_values_dim, + indices_info, + collapse_indices_dim, + self_info, + collapse_self_dim, + num_slices, + slice_size); + } +} + +} // namespace native +} // namespace at \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/SortingRadixSelect.cuh b/aten/src/ATen/native/zoom/SortingRadixSelect.cuh new file mode 100644 index 00000000000000..83f893b76f9d75 --- /dev/null +++ b/aten/src/ATen/native/zoom/SortingRadixSelect.cuh @@ -0,0 +1,410 @@ +// !!! This is a file automatically generated by hipify!!! +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +template +struct TopKTypeConfig {}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + // Converts a float to an integer representation with the same + // sorting; i.e., for floats f1, f2: + // if f1 < f2 then convert(f1) < convert(f2) + // We use this to enable radix selection of floating-point values. + // This also gives a relative order for NaNs, but that's ok, as they + // will all be adjacent + // neg inf: signbit=1 exp=ff fraction=0 --> radix = 0 00 ff.. + // pos inf: signbit=0 exp=ff fraction=0 --> radix = 1 ff 00.. 
+ // pos nan: signbit=0 exp=ff fraction>0 --> radix = 1 ff x>0 + // neg nan: signbit=1 exp=ff fraction>0 --> radix = 0 00 x +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(uint8_t v) { + return v; + } + + static inline __device__ uint8_t deconvert(RadixType v) { + return v; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(int8_t v) { + return 128u + v; + } + + static inline __device__ int8_t deconvert(RadixType v) { + return v - 128; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(int16_t v) { + static_assert(sizeof(short) == 2, ""); + return 32768u + v; + } + + static inline __device__ int16_t deconvert(RadixType v) { + return v - 32768; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(int32_t v) { + static_assert(sizeof(int) == 4, ""); + return 2147483648u + v; + } + + static inline __device__ int32_t deconvert(RadixType v) { + return v - 2147483648u; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint64_t RadixType; + + static inline __device__ RadixType convert(int64_t v) { + static_assert(sizeof(int64_t) == 8, ""); + return 9223372036854775808ull + v; + } + + static inline __device__ int64_t deconvert(RadixType v) { + return v - 9223372036854775808ull; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint64_t RadixType; + + static inline __device__ RadixType convert(double v) { + RadixType x = __double_as_longlong(v); + RadixType mask = -((x >> 63)) | 0x8000000000000000; + return (v == v) ? (x ^ mask) : 0xffffffffffffffff; + } + + static inline __device__ double deconvert(RadixType v) { + RadixType mask = ((v >> 63) - 1) | 0x8000000000000000; + return __longlong_as_double(v ^ mask); + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(at::Half v) { + RadixType x = __half_as_ushort(v); + RadixType mask = (x & 0x00008000) ? 0x0000ffff : 0x00008000; + return (v == v) ? (x ^ mask) : 0xffff; + } + + static inline __device__ at::Half deconvert(RadixType v) { + RadixType mask = (v & 0x00008000) ? 0x00008000 : 0x0000ffff; + return __ushort_as_half(v ^ mask); + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(at::BFloat16 v) { + RadixType x = v.x; + RadixType mask = (x & 0x00008000) ? 0x0000ffff : 0x00008000; + return (v == v) ? (x ^ mask) : 0xffff; + } + + static inline __device__ at::BFloat16 deconvert(RadixType v) { + RadixType mask = (v & 0x00008000) ? 0x00008000 : 0x0000ffff; + at::BFloat16 r; + r.x = (v ^ mask); + return r; + } +}; + +// This function counts the distribution of all input values in a +// slice we are selecting by radix digit at `radixDigitPos`, but only +// those that pass the filter `((v & desiredMask) == desired)`. +// This produces and broadcasts the seen counts for a single block only. +// `smem` must have at least `RadixSize` elements. 
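To make the ordering trick described above concrete (the comment at the top of TopKTypeConfig<float>, and the double/Half specializations alongside it), here is a minimal host-side sketch of the sign-flip mapping. This is an editorial illustration, not part of the patch: the helper name float_to_radix is invented here, std::memcpy stands in for the device bit-cast intrinsics (__float_as_int and friends), and the NaN handling mirrors the (v == v) checks above.

// Host-side sketch (illustrative only) of the order-preserving
// float -> uint32_t mapping used by the radix selection below.
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static uint32_t float_to_radix(float v) {
  uint32_t x;
  std::memcpy(&x, &v, sizeof(x));                  // reinterpret the float bits
  uint32_t mask = (x & 0x80000000u) ? 0xffffffffu  // negative: flip every bit
                                    : 0x80000000u; // non-negative: flip the sign bit
  return std::isnan(v) ? 0xffffffffu : (x ^ mask); // all NaNs collapse to the top
}

int main() {
  // The mapping is monotone: a < b as floats implies
  // float_to_radix(a) < float_to_radix(b) as unsigned integers,
  // so selecting by radix digit on the converted values selects by value.
  assert(float_to_radix(-2.0f) < float_to_radix(-1.0f));
  assert(float_to_radix(-1.0f) < float_to_radix(0.0f));
  assert(float_to_radix(0.0f)  < float_to_radix(1.5f));
  assert(float_to_radix(1.5f)  < float_to_radix(INFINITY));
  return 0;
}

The countRadixUsingMask device function that follows histograms these converted values one radix digit at a time, which is what makes the digit-by-digit narrowing in radixSelect possible.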
+template < + typename scalar_t, + typename bitwise_t, + typename index_t, + typename CountType, + int RadixSize, + int RadixBits> +__device__ void countRadixUsingMask( + CountType counts[RadixSize], + CountType* smem, + bitwise_t desired, + bitwise_t desiredMask, + int radixDigitPos, + index_t sliceSize, + index_t withinSliceStride, + const scalar_t* data) { + // Clear out per-thread counts from a previous round +#pragma unroll + for (int i = 0; i < RadixSize; ++i) { + counts[i] = 0; + } + + if (threadIdx.x < RadixSize) { + smem[threadIdx.x] = 0; + } + __syncthreads(); + + // Scan over all the data. Upon a read, the warp will accumulate + // counts per each digit in the radix using warp voting. + for (index_t i = threadIdx.x; i < sliceSize;) { + bitwise_t val = + TopKTypeConfig::convert(doLdg(&data[i * withinSliceStride])); + + bool hasVal = ((val & desiredMask) == desired); + bitwise_t digitInRadix = at::zoom::Bitfield::getBitfield( + val, radixDigitPos, RadixBits); + +#pragma unroll + for (uint32_t j = 0; j < RadixSize; ++j) { + bool vote = hasVal && (digitInRadix == j); + counts[j] += __popcll(WARP_BALLOT(vote)); + } + i += blockDim.x; + } + + // Now, for each warp, sum values + if (at::zoom::getLaneId() == 0) { +#pragma unroll + for (uint32_t i = 0; i < RadixSize; ++i) { + gpuAtomicAddNoReturn(&smem[i], counts[i]); + } + } + + __syncthreads(); + + // For each thread, read in the total counts +#pragma unroll + for (uint32_t i = 0; i < RadixSize; ++i) { + counts[i] = smem[i]; + } + + __syncthreads(); +} + +// Over what radix we are selecting values +constexpr int RADIX_BITS = 2; // digits are base-(2 ^ RADIX_BITS) +constexpr int RADIX_SIZE = 4; // 2 ^ RADIX_BITS +constexpr int RADIX_MASK = (RADIX_SIZE - 1); + +// This finds the unique value `v` that matches the pattern +// ((v & desired) == desiredMask) in our sorted int format +template +__device__ scalar_t findPattern( + scalar_t* smem, + const scalar_t* data, + index_t sliceSize, + index_t withinSliceStride, + bitwise_t desired, + bitwise_t desiredMask) { + if (threadIdx.x < 2) { + smem[threadIdx.x] = static_cast(0); + } + __syncthreads(); + + // All threads participate in the loop, in order to sync on the flag + index_t numIterations = + round_up(sliceSize, static_cast(blockDim.x)); + for (index_t i = threadIdx.x; i < numIterations; i += blockDim.x) { + bool inRange = (i < sliceSize); + scalar_t v = inRange ? 
doLdg(&data[i * withinSliceStride]) + : static_cast(0); + + if (inRange && + ((TopKTypeConfig::convert(v) & desiredMask) == desired)) { + // There should not be conflicts if we are using findPattern, + // since the result is unique + smem[0] = static_cast(1); + smem[1] = v; // can't use val as the flag, since it could be 0 + } + + __syncthreads(); + + scalar_t found = smem[0]; + scalar_t val = smem[1]; + + __syncthreads(); + + // Check to see if a thread found the value + if (found != static_cast(0)) { + // all threads return this value + return val; + } + } + + // should not get here + ZOOM_KERNEL_ASSERT(false); + return static_cast(0); +} + +// Returns the top-Kth element found in the data using radix selection +template +__device__ void radixSelect( + const scalar_t* data, + index_t k, + bool largest, + index_t sliceSize, + index_t withinSliceStride, + int* smem, + scalar_t* topK) { + // Per-thread buckets into which we accumulate digit counts in our + // radix + int counts[RADIX_SIZE]; + + // We only consider elements x such that (x & desiredMask) == desired + // Initially, we consider all elements of the array, so the above + // statement is true regardless of input. + bitwise_t desired = 0; + bitwise_t desiredMask = 0; + + // We are looking for the top kToFind-th element when iterating over + // digits; this count gets reduced by elimination when counting + // successive digits + int kToFind = k; + + // We start at the most significant digit in our radix, scanning + // through to the least significant digit + for (int digitPos = sizeof(scalar_t) * 8 - RADIX_BITS; digitPos >= 0; + digitPos -= RADIX_BITS) { + // Count radix distribution for the current position and reduce + // across all threads + countRadixUsingMask< + scalar_t, + bitwise_t, + index_t, + int, + RADIX_SIZE, + RADIX_BITS>( + counts, + smem, + desired, + desiredMask, + digitPos, + sliceSize, + withinSliceStride, + data); + + auto found_unique = [&](int i, int count) -> bool { + /* All threads have the same value in counts here, so all */ + /* threads will return from the function. */ + if (count == 1 && kToFind == 1) { + /* There is a unique answer. */ + desired = at::zoom::Bitfield::setBitfield( + desired, i, digitPos, RADIX_BITS); + desiredMask = at::zoom::Bitfield::setBitfield( + desiredMask, RADIX_MASK, digitPos, RADIX_BITS); + + /* The answer is now the unique element v such that: */ + /* (v & desiredMask) == desired */ + /* However, we do not yet know what the actual element is. We */ + /* need to perform a search through the data to find the */ + /* element that matches this pattern. 
*/ + *topK = findPattern( + (scalar_t*)smem, + data, + sliceSize, + withinSliceStride, + desired, + desiredMask); + return true; + } + return false; + }; + auto found_non_unique = [&](int i, int count) -> bool { + if (count >= kToFind) { + desired = + at::zoom::Bitfield::setBitfield( + desired, i, digitPos, RADIX_BITS); + desiredMask = at::zoom::Bitfield::setBitfield( + desiredMask, RADIX_MASK, digitPos, RADIX_BITS); + + /* The top-Kth element v must now be one such that: */ + /* (v & desiredMask == desired) */ + /* but we haven't narrowed it down; we must check the next */ + /* least-significant digit */ + return true; + } + kToFind -= count; + return false; // continue the loop + }; + + // All threads participate in the comparisons below to know the + // final result + if (largest) { + // Process in descending order +#pragma unroll + for (int i = RADIX_SIZE - 1; i >= 0; --i) { + int count = counts[i]; + if (found_unique(i, count)) { + return; + } + if (found_non_unique(i, count)) { + break; + } + } + } else { + // Process in ascending order +#pragma unroll + for (int i = 0; i < RADIX_SIZE; ++i) { + int count = counts[i]; + if (found_unique(i, count)) { + return; + } + if (found_non_unique(i, count)) { + break; + } + } + } + } // end digitPos for + + // There is no unique result, but there is a non-unique result + // matching `desired` exactly + *topK = TopKTypeConfig::deconvert(desired); +} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/zoom/TensorTopK.cpp b/aten/src/ATen/native/zoom/TensorTopK.cpp new file mode 100644 index 00000000000000..bf0539c0b2db97 --- /dev/null +++ b/aten/src/ATen/native/zoom/TensorTopK.cpp @@ -0,0 +1,96 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#include +#else +#include +#include +#include +#endif + +namespace at::native { + +// TODO: remove this when CUDA <11.6 is no longer supported +void topk_out_with_sort( + const Tensor& self, + int64_t k, int64_t dim, bool largest, + const Tensor& values, + const Tensor& indices +) { + auto [sorted_values, sorted_indices] = at::privateuse1::sort(self, /* stable= */false, dim, largest); + values.copy_(sorted_values.narrow(dim, 0, k)); + indices.copy_(sorted_indices.narrow(dim, 0, k)); +} + +// TODO: remove this when CUDA <11.6 is no longer supported +bool disable_sort_for_topk(); +bool should_use_sort(const Tensor& self, int64_t dim) { + if (disable_sort_for_topk()) return false; + // This heuristics is based on the experiment in https://github.com/pytorch/pytorch/pull/68632 + if (self.dim() == 0) return false; + if (self.dtype() == kBool) return false; // Bool is not support by topk + int64_t slice_size = self.size(dim); + if (slice_size == 0) return false; + int64_t num_slices = self.numel() / slice_size; + return num_slices <= 10 && slice_size >= 100000; +} + +TORCH_IMPL_FUNC(topk_out_zoom) + (const Tensor& self, + int64_t k, int64_t dim, bool largest, bool sorted, + const Tensor& values, + const Tensor& indices) { + TensorArg topK_arg{values, "topK", 1}, indices_arg{indices, "indices", 2}, input_arg{self, "self", 3}; + checkAllSameGPU(__func__, {topK_arg, indices_arg, input_arg}); + + dim = at::maybe_wrap_dim(dim, self); + + if (should_use_sort(self, dim)) { + topk_out_with_sort(self, k, dim, largest, values, indices); + return; + } + + // If k is 0 the result is an empty tensor, so we don't need to launch a kernel. 
+ if (k == 0) { + return; + } + + launch_gather_topk_kernel(self, k, dim, largest, values, indices); + + // Sort the results if the user wants them sorted, since our + // selection routine does not ensure sorting + if (sorted && values.numel() > 1) { + if (should_use_small_sort(values, dim)) { + // This avoids any memory allocations and performs all sorting + // work inplace along the slice + + sortKeyValueInplace(values, indices, dim, largest); + } else { + // Depend upon the backup sort that returns indices, which we + // can use in conjunction with gather to produce the original + // indices. + // This is not the most efficient implementation, especially since + // there are memory allocations performed here. If the user desires + // greater performance, they should torch.gather() the results + // themselves using the reported indices, providing previously + // allocated tensors to receive the results. + + Tensor sortedIndices = at::empty_like(indices); + Tensor sortedValues = at::empty_like(values); + at::privateuse1::sort_outf(values, /* stable= */ false, dim, largest, sortedValues, sortedIndices); + indices.copy_(indices.gather(dim, sortedIndices)); + values.copy_(sortedValues); + } + } +} + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/TensorTopK.cu b/aten/src/ATen/native/zoom/TensorTopK.cu new file mode 100644 index 00000000000000..c4a431b2a1dc98 --- /dev/null +++ b/aten/src/ATen/native/zoom/TensorTopK.cu @@ -0,0 +1,895 @@ +#include +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using namespace at::native; + +namespace at::native { + +// TODO: remove this when CUDA <11.6 is no longer supported +bool disable_sort_for_topk() { + return CUB_SUPPORTS_SCAN_BY_KEY(); +} + +namespace sbtopk { // single_block_topk + +template +struct AddOp { + __device__ __forceinline__ T operator()(T const &lhs, T const &rhs) { + return (lhs + rhs); + } +}; + +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void gatherTopK(at::zoom::detail::TensorInfo input, + IndexType inputSliceSize, + IndexType outputSliceSize, // aka `k` + bool largest, + + IndexType numInputSlices, + IndexType inputWithinSliceStride, + + at::zoom::detail::TensorInfo topK, + IndexType topKWithinSliceStride, + + at::zoom::detail::TensorInfo indices, + IndexType indicesWithinSliceStride, + T* kthValues) { + // Indices are limited to integer fp precision, so counts can fit in + // int32, regardless of IndexType + __shared__ int smem[64]; + IndexType slice = getLinearBlockId(); + if (slice >= numInputSlices) { + return; + } + + // Find the start offset for our slice + IndexType sliceStartIndex = + at::zoom::detail::IndexToOffset::get(slice, input); + IndexType topKSliceStartIndex = + at::zoom::detail::IndexToOffset::get(slice, topK); + IndexType indicesSliceStartIndex = + at::zoom::detail::IndexToOffset::get(slice, indices); + + const T* inputSliceStart = &input.data[sliceStartIndex]; + T* topKSliceStart = &topK.data[topKSliceStartIndex]; + int64_t* indicesSliceStart = &indices.data[indicesSliceStartIndex]; + + // Find the k-th highest element in our input + T topKValue; + if (WithKthValues){ + topKValue = kthValues[slice]; + } else { + topKValue = static_cast(0); + radixSelect::RadixType, IndexType>( + inputSliceStart, outputSliceSize, largest, + inputSliceSize, inputWithinSliceStride, + smem, &topKValue); + } + const auto topKConverted = 
at::native::TopKTypeConfig::convert(topKValue); + + // Every value that is strictly less/greater than `pattern` + // (depending on sort dir) in sorted int format is in the top-K. + // The top-K value itself might not be unique. + // + // Since there are a variable number of elements that we see that + // are within the top-k, we don't know at what index to write out + // the resulting values. + // In order to get this, we perform an exclusive prefix sum of + // `hasTopK`. This will return the resulting index into which we + // need to write the result, if a thread has a result. + + // All threads need to participate in the loop and the prefix sum, + // but not necessarily in the load; hence loop bounds being rounded + // up to a multiple of the block dim. + IndexType numIterations = round_up(inputSliceSize, (IndexType) blockDim.x); + IndexType writeIndexStart = 0; + + for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) { + bool inRange = (i < inputSliceSize); + T v = + inRange ? doLdg(&inputSliceStart[i * inputWithinSliceStride]) : static_cast(0); + const auto convertedV = at::native::TopKTypeConfig::convert(v); + bool hasTopK; + if (largest) { + hasTopK = inRange && (convertedV > topKConverted); + } else { + hasTopK = inRange && (convertedV < topKConverted); + } + + int index; + int carry; + at::zoom::exclusiveBinaryPrefixScan( + smem, hasTopK, &index, &carry, AddOp()); + + if (hasTopK) { + int writeIndex = writeIndexStart + index; + ZOOM_KERNEL_ASSERT(writeIndex < outputSliceSize); + + IndexType topKOffset = writeIndex * topKWithinSliceStride; + IndexType indexOffset = writeIndex * indicesWithinSliceStride; + + topKSliceStart[topKOffset] = v; + indicesSliceStart[indexOffset] = i; + } + + writeIndexStart += carry; + } + + // We need to fill in the rest with actual == top-K values. + // The number that we need is outputSliceSize - + // writeIndexStart. There might be more than that number available, + // in which case we have to choose the first seen set. We do this + // via a prefix sum to calculate indices for writing results. + ZOOM_KERNEL_ASSERT(outputSliceSize >= writeIndexStart); + IndexType topKRemaining = (outputSliceSize - writeIndexStart); + + for (IndexType i = threadIdx.x; i < numIterations; i += blockDim.x) { + bool inRange = (i < inputSliceSize); + T v = + inRange ? 
doLdg(&inputSliceStart[i * inputWithinSliceStride]) : static_cast(0); + const auto convertedV = at::native::TopKTypeConfig::convert(v); + bool hasTopK = inRange && (convertedV == topKConverted); + + int index; + int carry; + at::zoom::exclusiveBinaryPrefixScan( + smem, hasTopK, &index, &carry, AddOp()); + + if (hasTopK && index < topKRemaining) { + int writeIndex = writeIndexStart + index; + ZOOM_KERNEL_ASSERT(writeIndex < outputSliceSize); + + IndexType topKOffset = writeIndex * topKWithinSliceStride; + IndexType indexOffset = writeIndex * indicesWithinSliceStride; + + topKSliceStart[topKOffset] = v; + indicesSliceStart[indexOffset] = i; + } + + if (carry >= topKRemaining) { + break; + } + + topKRemaining -= carry; + writeIndexStart += carry; + } + +}; + +template +void launch( + at::zoom::detail::TensorInfo input, + IndexType inputSliceSize, + IndexType outputSliceSize, // aka `k` + bool largest, + + IndexType numInputSlices, + IndexType inputWithinSliceStride, + + at::zoom::detail::TensorInfo topK, + IndexType topKWithinSliceStride, + + at::zoom::detail::TensorInfo indices, + IndexType indicesWithinSliceStride) { + + dim3 grid; + TORCH_INTERNAL_ASSERT(getGridFromTiles(numInputSlices, grid), "Too many slices for topk"); + int warp_size = at::zoom::warp_size(); + dim3 block(::min(at::ceil_div((int64_t)inputSliceSize, (int64_t)warp_size) * (int64_t)warp_size, (int64_t)1024)); + hipLaunchKernelGGL(( gatherTopK), dim3(grid), dim3(block), 0, c10::zoom::getCurrentZoomStream(), + input, + inputSliceSize, + outputSliceSize, + largest, + numInputSlices, + inputWithinSliceStride, + topK, + topKWithinSliceStride, + indices, + indicesWithinSliceStride, + nullptr); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); +} +} // namespace sbtopk + +namespace mbtopk { // multi_block_topk + +// Assumptions: +// The number of elements can be larger than UINT32_MAX, but +// the number of total blocks can not be larger than UINT32_MAX. +// So we can not have more than UINT32_MAX slices. The actual limit +// for number of slices could be a few fold smaller than UINT32_MAX, +// because we could be using multiple blocks per slice. 
+// Further more, the size of each input slice is also assumped to be +// smaller than UINT32_MAX + +constexpr int BLOCK_THREADS = 256; + +// Over what radix we are selecting values +constexpr int RADIX_BITS = 8; +constexpr int RADIX_DIGITS = 1 << RADIX_BITS; // 2 ^ RADIX_BITS +constexpr int RADIX_MASK = (RADIX_DIGITS - 1); +static_assert(RADIX_DIGITS <= BLOCK_THREADS, "radixFindKthValues kernel requires RADIX_DIGITS <= BLOCK_THREADS"); +constexpr int MIN_ITEMS_PER_THREAD = 4; +constexpr int MAX_ITEMS_PER_THREAD = 64; + +template +__global__ void fill(T* x, T value, IndexType size) { + IndexType idx = blockIdx.x * blockDim.x + threadIdx.x; + for (IndexType i = idx; i < size; i += gridDim.x * blockDim.x) { + x[i] = value; + } +} + +// find the kth smallest value, +// for largest topk, k_to_find = slice_size - k + 1 +template +C10_LAUNCH_BOUNDS_1(BLOCK_THREADS) +__global__ void radixFindKthValues( + at::zoom::detail::TensorInfo input, + uint32_t slice_size, + uint32_t* ks_to_find, // size: num_slices + + uint32_t num_slices, + IndexType withinSliceStride, + + int current_bit, + int items_per_thread, + uint32_t blocks_per_slice, + Bitwise desiredMask, + + // outputs + uint32_t* semaphores, // size: num_slices + Bitwise* desires, // size: num_slices + short* counts, // size: num_slices * blocks_per_slice * radix_digits + T* kthValues // size: num_slices, only write when current_bit reaches 0 + ) { + + int items_per_block = items_per_thread * BLOCK_THREADS; + int tidx = threadIdx.x; + uint32_t block_idx = getLinearBlockId(); + uint32_t slice_idx = block_idx / blocks_per_slice; + uint32_t blk_idx_in_slice = block_idx % blocks_per_slice; + if (slice_idx >= num_slices) { + return; + } + + Bitwise desired = desires[slice_idx]; + uint32_t k_to_find = ks_to_find[slice_idx]; + IndexType slice_start_index = at::zoom::detail::IndexToOffset::get(slice_idx, input); + const T* data = &input.data[slice_start_index]; + + typedef hipcub::BlockScan BlockScan; + static_assert(MAX_ITEMS_PER_THREAD * BLOCK_THREADS < std::numeric_limits::max(), + "blockwise counter too large"); + union __align__(16) TempStorage { + uint32_t digit_counters[RADIX_DIGITS]; + uint32_t digit_count_cumsum[RADIX_DIGITS]; // only used if this it the last block for this slice + typename BlockScan::TempStorage scan_storage; + }; + __shared__ TempStorage temp_storage; + + // fill digit_counters with zeros + if (tidx < RADIX_DIGITS) { + temp_storage.digit_counters[tidx] = 0; + } + __syncthreads(); + + items_per_thread = (blk_idx_in_slice + 1 < blocks_per_slice) + ? 
items_per_thread + : at::ceil_div((int64_t)(slice_size - blk_idx_in_slice * items_per_block), (int64_t)BLOCK_THREADS); + + // collect digit counts and store in shared memory + for (int i = 0; i < items_per_thread; ++i) { + // Find the start offset for this slice + IndexType idx = blk_idx_in_slice * items_per_block + i * BLOCK_THREADS + tidx; + if (idx < slice_size) { + idx *= withinSliceStride; + Bitwise val = TopKTypeConfig::convert(doLdg(&data[idx])); + bool has_val = ((val & desiredMask) == (desired & desiredMask)); + Bitwise digit = at::zoom::Bitfield::getBitfield(val, current_bit, RADIX_BITS); + if (has_val) { + atomicAdd(&temp_storage.digit_counters[digit], 1); + } + } + } + + __syncthreads(); + + // load digit counter to register, one digit per thread + static_assert(RADIX_DIGITS <= BLOCK_THREADS, "this kernel requires RADIX_DIGITS <= BLOCK_THREADS"); + uint32_t digit_count = 0; + if (tidx < RADIX_DIGITS) { + digit_count = temp_storage.digit_counters[tidx]; + } + + // We always write out counts regardless if blocks_per_slice == 1 because + // it will be used to compute offsets for `gatherTopK`. + if (tidx < RADIX_DIGITS) { + counts[block_idx * RADIX_DIGITS + tidx] = digit_count; + } + // if blocks_per_slice == 1, there is no need to do cross-block reduction + // in this case we use counts saved at registers directly + if (blocks_per_slice > 1) { + __threadfence(); // make sure writes are globally visible + __syncthreads(); // make sure all writes are finished before update semaphores + } + + // the last block of each slice accumulates counters from multiple blocks and updates desired and ks_to_find + __shared__ bool s_is_last_block_done; + + if (tidx == 0) { + if (blocks_per_slice == 1) { + s_is_last_block_done = true; + } else { + uint32_t blocks_finished_old = atomicAdd(&semaphores[slice_idx], 1); + s_is_last_block_done = (blocks_finished_old == blocks_per_slice - 1); + } + } + + __syncthreads(); + + if (!s_is_last_block_done) + return; + + // accumulates counters from multiple blocks + if (tidx < RADIX_DIGITS && blocks_per_slice > 1) { + digit_count = 0; + for (int blk = 0; blk < blocks_per_slice; ++blk) { + digit_count += counts[(slice_idx * blocks_per_slice + blk) * RADIX_DIGITS + tidx]; + } + } + + // compute the block-wide inclusive prefix sum + uint32_t digit_count_cumsum; + BlockScan(temp_storage.scan_storage).InclusiveSum(digit_count, digit_count_cumsum); + __syncthreads(); + // every thread also need the perfix_sum of it's left value for comparison, so save a copy in shared mem + if (tidx < RADIX_DIGITS) { + temp_storage.digit_count_cumsum[tidx] = digit_count_cumsum; + } + __syncthreads(); + + if (tidx < RADIX_DIGITS) { + uint32_t digit_count_cumsum_left = (tidx == 0) ? 
0 : temp_storage.digit_count_cumsum[tidx - 1]; + + // if not the last pass: update desired and ks_to_find + // if last pass: write out the kth value + if (digit_count_cumsum_left < k_to_find && k_to_find <= digit_count_cumsum) { + desired = at::zoom::Bitfield::setBitfield(desired, tidx, current_bit, RADIX_BITS); + desires[slice_idx] = desired; + if (current_bit > 0) { + ks_to_find[slice_idx] = k_to_find - digit_count_cumsum_left; + } else { + kthValues[slice_idx] = TopKTypeConfig::deconvert(desired); + } + } + } + + // reset semaphores for the next pass + if (tidx == 0) { + semaphores[slice_idx] = 0; + } +} + +#if CUB_SUPPORTS_SCAN_BY_KEY() +// Assumption: k can not be larger than UINT32_MAX +template +C10_LAUNCH_BOUNDS_1(RADIX_DIGITS) // one thread per digit +__global__ void computeBlockwiseWithinKCounts( + Bitwise* desires, // size: num_slices + short* counts, // size: num_slices * blocks_per_slice * radix_digits + uint32_t blocks_per_slice, + int current_bit, + bool largest, + // outputs: + uint32_t* withinKCounts, // size: num_slices * blocks_per_slice == num_blocks + uint32_t num_blocks +) { + // This kernel should be launched with the same number of blocks as the `radixFindKthValues` kernel. + int tidx = threadIdx.x; + uint32_t block_idx = getLinearBlockId(); + uint32_t slice_idx = block_idx / blocks_per_slice; + + // The grid is computed from `getGridFromTiles`, when there are lots of + // elements, we will use both blockIdx.x and blockIdx.y, and maybe blockIdx.z + // when this is the case, the number of blocks that we are launching can be + // more than the number of blocks we need. So we need to check the range of + // `block_idx`. + if (block_idx >= num_blocks) { + return; + } + + Bitwise desired = doLdg(desires + slice_idx); + Bitwise desired_digit = at::zoom::Bitfield::getBitfield(desired, current_bit, RADIX_BITS); + + // if largest, then only threads that has tidx > desired_digit are active + // if !largest, then only threads that has tidx < desired_digit are active + // each active thread will read the count for its corresponding, and + // do warp reduction followed by shared memory reduction to get the total count + // non-active thread should not load, and non-active warp should not do reduction. 
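As a concrete picture of the shuffle-based reduction the preceding comment refers to (the loop count += WARP_SHFL_DOWN(count, offset) in the code below), here is a single-threaded host simulation of the data flow. It is an editorial sketch, not part of the patch: out-of-range source lanes are modeled as contributing zero, which leaves the total accumulated by lane 0 unchanged.

// Host simulation of a warp-wide sum via repeated shuffle-down steps.
#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  constexpr int WARP_SIZE = 64;                    // e.g. the wavefront size on AMD GPUs
  std::array<uint32_t, WARP_SIZE> lane_val{};
  for (int lane = 0; lane < WARP_SIZE; ++lane)
    lane_val[lane] = static_cast<uint32_t>(lane % 5);   // arbitrary per-lane counts

  uint32_t expected = 0;
  for (uint32_t v : lane_val) expected += v;

  // Each pass halves the exchange distance: lane i adds the value held by lane i+offset.
  // After the offset == 1 pass, lane 0 holds the sum over the whole warp.
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
    for (int lane = 0; lane < WARP_SIZE; ++lane) {
      uint32_t from_right = (lane + offset < WARP_SIZE) ? lane_val[lane + offset] : 0u;
      lane_val[lane] += from_right;                // models count += WARP_SHFL_DOWN(count, offset)
    }
  }
  std::printf("lane 0 total = %u (expected %u)\n", lane_val[0], expected);
  return 0;
}

The kernel body below applies this pattern once per warp and then combines the per-warp partial sums through the shared warp_counts array before the final accumulation into withinKCounts.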
+ bool warp_is_active, thread_is_active; + int warp = tidx / C10_WARP_SIZE; + if (largest) { + int end_of_warp = warp * C10_WARP_SIZE + C10_WARP_SIZE - 1; + warp_is_active = end_of_warp > desired_digit; + thread_is_active = tidx > desired_digit; + } else { + int start_of_warp = warp * C10_WARP_SIZE; + warp_is_active = start_of_warp < desired_digit; + thread_is_active = tidx < desired_digit; + } + uint32_t count = 0; + if (warp_is_active) { + if (thread_is_active) { + count = doLdg(counts + block_idx * RADIX_DIGITS + tidx); + } + for (int offset = C10_WARP_SIZE / 2; offset > 0; offset /= 2) { + count += WARP_SHFL_DOWN(count, offset); + } + } + + constexpr int num_warps = RADIX_DIGITS / C10_WARP_SIZE; + __shared__ uint32_t warp_counts[num_warps]; + if (tidx % C10_WARP_SIZE == 0) { + warp_counts[warp] = count; + } + __syncthreads(); + static_assert(RADIX_DIGITS < C10_WARP_SIZE * C10_WARP_SIZE, + "Assuming only 1 warp is needed for final reduction"); + if (warp != 0) { + return; + } + count = 0; + if (tidx < num_warps) { + count = warp_counts[tidx]; + } + for (int offset = num_warps / 2; offset > 0; offset /= 2) { + count += WARP_SHFL_DOWN(count, offset); + } + if (tidx == 0) { + withinKCounts[block_idx] += count; + } +} + +// Assumption: slice_size can not be larger than UINT32_MAX +template +__global__ void computeBlockwiseKthCounts( + Bitwise* desires, // size: num_slices + short* counts, // size: num_slices * blocks_per_slice * radix_digits + uint32_t num_blocks, // the number of blocks used by `radixFindKthValues` kernel + uint32_t blocks_per_slice, + // outputs: + uint32_t* kthCounts // size: num_slices * blocks_per_slice == num_blocks +) { + HIP_KERNEL_LOOP_TYPE(idx, num_blocks, uint32_t) { + uint32_t slice_idx = idx / blocks_per_slice; + Bitwise desired = doLdg(desires + slice_idx); + Bitwise desired_digit = at::zoom::Bitfield::getBitfield(desired, 0, RADIX_BITS); + kthCounts[idx] = doLdg(counts + idx * RADIX_DIGITS + desired_digit); + } +} + +template +C10_LAUNCH_BOUNDS_1(BLOCK_THREADS) +__global__ void gatherTopK(at::zoom::detail::TensorInfo input, + IndexType inputSliceSize, + IndexType outputSliceSize, // aka `k` + bool largest, + + uint32_t numInputSlices, + IndexType inputWithinSliceStride, + + at::zoom::detail::TensorInfo topK, + IndexType topKWithinSliceStride, + + at::zoom::detail::TensorInfo indices, + IndexType indicesWithinSliceStride, + + uint32_t items_per_thread, + uint32_t blocks_per_slice, + + T *kthValues, + uint32_t* withinKCounts, + uint32_t* kthCounts, + uint32_t num_blocks) { + + uint32_t items_per_block = items_per_thread * BLOCK_THREADS; + uint32_t tidx = threadIdx.x; + uint32_t block_idx = getLinearBlockId(); + + // The grid is computed from `getGridFromTiles`, when there are lots of + // elements, we will use both blockIdx.x and blockIdx.y, and maybe blockIdx.z + // when this is the case, the number of blocks that we are launching can be + // more than the number of blocks we need. So we need to check the range of + // `block_idx`. + if (block_idx >= num_blocks) { + return; + } + + uint32_t slice_idx = block_idx / blocks_per_slice; + uint32_t blk_idx_in_slice = block_idx % blocks_per_slice; + + items_per_thread = (blk_idx_in_slice + 1 < blocks_per_slice) + ? 
items_per_thread + : at::ceil_div((int64_t)(inputSliceSize - blk_idx_in_slice * items_per_block), (int64_t)BLOCK_THREADS); + + // Find the start offset for our slice + IndexType sliceStartIndex = + at::zoom::detail::IndexToOffset::get(slice_idx, input); + IndexType topKSliceStartIndex = + at::zoom::detail::IndexToOffset::get(slice_idx, topK); + IndexType indicesSliceStartIndex = + at::zoom::detail::IndexToOffset::get(slice_idx, indices); + + const T* inputSliceStart = &input.data[sliceStartIndex]; + T* topKSliceStart = &topK.data[topKSliceStartIndex]; + int64_t* indicesSliceStart = &indices.data[indicesSliceStartIndex]; + + // Find the k-th highest element in our input + T kthValue = kthValues[slice_idx]; + const auto kthValueConverted = at::native::TopKTypeConfig::convert(kthValue); + + // Find the start index in output tensor of this block + uint32_t startWithinK = 0; + if (blk_idx_in_slice > 0) { + startWithinK = withinKCounts[block_idx - 1]; + } + uint32_t startKth = withinKCounts[slice_idx * blocks_per_slice + blocks_per_slice - 1]; + if (blk_idx_in_slice > 0) { + startKth += kthCounts[block_idx - 1]; + } + + // Read input, select topk out and write + typedef hipcub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + for (int i = 0; i < items_per_thread; ++i) { + // Find the start offset for this slice + IndexType idx = blk_idx_in_slice * items_per_block + i * BLOCK_THREADS + tidx; + T val; + int withinK = 0; + int kth = 0; + if (idx < inputSliceSize) { + val = doLdg(inputSliceStart + idx * inputWithinSliceStride); + const auto valConverted = at::native::TopKTypeConfig::convert(val); + withinK = (largest ? valConverted > kthValueConverted : valConverted < kthValueConverted); + kth = (valConverted == kthValueConverted); + } + + uint32_t withinKIndex; + uint32_t numWithinK; + BlockScan(temp_storage).ExclusiveSum(withinK, withinKIndex, numWithinK); + __syncthreads(); + if (withinK) { + uint32_t offset = withinKIndex + startWithinK; + topKSliceStart[offset * topKWithinSliceStride] = val; + indicesSliceStart[offset * indicesWithinSliceStride] = idx; + } + startWithinK += numWithinK; + + if (startKth < outputSliceSize) { + uint32_t kthIndex; + uint32_t numKth; + BlockScan(temp_storage).ExclusiveSum(kth, kthIndex, numKth); + __syncthreads(); + if (kth) { + uint32_t offset = kthIndex + startKth; + if (offset < outputSliceSize) { + topKSliceStart[offset * topKWithinSliceStride] = val; + indicesSliceStart[offset * indicesWithinSliceStride] = idx; + } + } + startKth += numKth; + } + } +} +#endif + +int get_items_per_thread(uint64_t num_slices, uint64_t slice_size) { + // occupancy of this kernel is limited by registers per threads + constexpr int REGS_PER_THREAD = 40; // from nsight launch statistics + constexpr int REGS_PER_BLOCK = REGS_PER_THREAD * BLOCK_THREADS; + hipDeviceProp_t* prop = at::zoom::getCurrentDeviceProperties(); + int mpc = prop->multiProcessorCount; + int regs_per_mp = prop->regsPerBlock; + int max_blocks_per_mp = 32; + int blocks_per_mp = ::min(regs_per_mp / REGS_PER_BLOCK, max_blocks_per_mp); + int64_t items_per_thread = at::ceil_div((int64_t)(slice_size * num_slices), (int64_t)(mpc * blocks_per_mp * BLOCK_THREADS)); + items_per_thread = ::max(MIN_ITEMS_PER_THREAD, ::min((int)items_per_thread, MAX_ITEMS_PER_THREAD)); // clamp to (4, 64) + return items_per_thread; +} + +class BlockIdxToKey { + uint32_t blocks_per_slice; +public: + BlockIdxToKey(uint32_t blocks_per_slice): blocks_per_slice(blocks_per_slice) {} + __device__ __forceinline__ 
uint32_t operator()(uint32_t blk) const { + return blk / blocks_per_slice; + } +}; + +template +void launch( + at::zoom::detail::TensorInfo input, + IndexType inputSliceSize, + IndexType outputSliceSize, // aka `k` + bool largest, + + uint32_t numInputSlices, + IndexType inputWithinSliceStride, + + at::zoom::detail::TensorInfo topK, + IndexType topKWithinSliceStride, + + at::zoom::detail::TensorInfo indices, + IndexType indicesWithinSliceStride) { + auto stream = c10::zoom::getCurrentZoomStream(); + + // configure items_per_thread based on device architecture and input size + int items_per_thread = get_items_per_thread(numInputSlices, inputSliceSize); + int items_per_block = items_per_thread * BLOCK_THREADS; + + using Bitwise = typename TopKTypeConfig::RadixType; + uint32_t blocks_per_slice = at::ceil_div((int64_t)inputSliceSize, (int64_t)items_per_block); + uint32_t num_blocks = numInputSlices * blocks_per_slice; + + // temporary storage + auto& allocator = *c10::zoom::ZoomCachingAllocator::get(); + + auto kthValues_buffer = allocator.allocate(numInputSlices * sizeof(T)); + T* kthValues = reinterpret_cast(kthValues_buffer.get()); + + TORCH_CHECK(blocks_per_slice <= std::numeric_limits::max(), "blocks_per_slice larger than uint32 maximum is not supported"); + auto semaphores_buffer = allocator.allocate(numInputSlices * sizeof(uint32_t)); + uint32_t* semaphores = reinterpret_cast(semaphores_buffer.get()); + C10_ZOOM_CHECK(hipMemsetAsync(semaphores, 0, numInputSlices * sizeof(uint32_t), stream)); + + auto ks_to_find_buffer = allocator.allocate(numInputSlices * sizeof(uint32_t)); + uint32_t* ks_to_find = reinterpret_cast(ks_to_find_buffer.get()); + uint32_t k_to_find = largest ? inputSliceSize - outputSliceSize + 1: outputSliceSize; + hipLaunchKernelGGL(( fill), dim3(::min(((int64_t)numInputSlices + 511) / 512, (int64_t)1073741824)), dim3(512), 0, stream, + ks_to_find, k_to_find, numInputSlices); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + + auto desired_buffer = allocator.allocate(numInputSlices * sizeof(Bitwise)); + Bitwise* desired = reinterpret_cast(desired_buffer.get()); + + auto counts_buffer = allocator.allocate(num_blocks * RADIX_DIGITS * sizeof(short)); + short* counts = reinterpret_cast(counts_buffer.get()); + static_assert(MAX_ITEMS_PER_THREAD * BLOCK_THREADS < std::numeric_limits::max(), + "blockwise counter too large"); + +#if CUB_SUPPORTS_SCAN_BY_KEY() + auto withinKCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t)); + uint32_t* withinKCounts = reinterpret_cast(withinKCounts_buffer.get()); + C10_ZOOM_CHECK(hipMemsetAsync(withinKCounts, 0, num_blocks * sizeof(uint32_t), stream)); + + auto kthCounts_buffer = allocator.allocate(num_blocks * sizeof(uint32_t)); + uint32_t* kthCounts = reinterpret_cast(kthCounts_buffer.get()); +#endif + + Bitwise desiredMask = 0; + dim3 grid; + TORCH_INTERNAL_ASSERT(getGridFromTiles(num_blocks, grid), "Too many slices for topk"); + dim3 block(BLOCK_THREADS); + + // iterate radix bits for multiple passes + for (int current_bit = sizeof(T) * 8 - RADIX_BITS; current_bit >= 0; current_bit -= RADIX_BITS) { + hipLaunchKernelGGL(( radixFindKthValues), dim3(grid), dim3(block), 0, stream, + input, + inputSliceSize, + ks_to_find, + numInputSlices, + inputWithinSliceStride, + current_bit, + items_per_thread, + blocks_per_slice, + desiredMask, + semaphores, + desired, + counts, + kthValues); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); +#if CUB_SUPPORTS_SCAN_BY_KEY() + hipLaunchKernelGGL(( computeBlockwiseWithinKCounts), dim3(grid), dim3(RADIX_DIGITS), 0, stream, + 
desired, counts, blocks_per_slice, current_bit, largest, withinKCounts, num_blocks); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); +#endif + desiredMask = at::zoom::Bitfield::setBitfield(desiredMask, RADIX_MASK, current_bit, RADIX_BITS); + } + +#if CUB_SUPPORTS_SCAN_BY_KEY() + hipLaunchKernelGGL(( computeBlockwiseKthCounts), dim3(::min(((int64_t)numInputSlices + 255) / 256, (int64_t)1073741824)), dim3(256), 0, stream, + desired, counts, num_blocks, blocks_per_slice, kthCounts); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + // Do a prefix scan of withinKCounts and kthCounts using slice_idx as keys to get the starting index of each block + using counting_iter_t = hipcub::CountingInputIterator; + using slice_idx_iter_t = hipcub::TransformInputIterator; + slice_idx_iter_t slice_idx_iter(counting_iter_t(0), BlockIdxToKey(blocks_per_slice)); + at::zoom::hipcub::inclusive_sum_by_key(slice_idx_iter, withinKCounts, withinKCounts, num_blocks); + at::zoom::hipcub::inclusive_sum_by_key(slice_idx_iter, kthCounts, kthCounts, num_blocks); + // copy topk values to output tensor + hipLaunchKernelGGL(( gatherTopK), dim3(grid), dim3(block), 0, stream, + input, inputSliceSize, outputSliceSize, largest, numInputSlices, inputWithinSliceStride, + topK, topKWithinSliceStride, indices, indicesWithinSliceStride, items_per_thread, + blocks_per_slice, kthValues, withinKCounts, kthCounts, num_blocks); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); +#else + // Find topk values based on kth values + { + dim3 grid; + TORCH_INTERNAL_ASSERT(getGridFromTiles(numInputSlices, grid), "Too many slices for topk"); + int warp_size = at::zoom::warp_size(); + dim3 block(::min(at::ceil_div((int64_t)inputSliceSize, (int64_t)warp_size) * (int64_t)warp_size, (int64_t)1024)); + hipLaunchKernelGGL(( sbtopk::gatherTopK), dim3(grid), dim3(block), 0, stream, + input, + inputSliceSize, + outputSliceSize, + largest, + numInputSlices, + inputWithinSliceStride, + topK, + topKWithinSliceStride, + indices, + indicesWithinSliceStride, + kthValues); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } +#endif +} + +} // namespace mbtopk + +bool should_use_multiblock(int64_t num_slices, int64_t slice_size) { + if (num_slices > std::numeric_limits::max() || + slice_size > std::numeric_limits::max()) return false; +#if CUB_SUPPORTS_SCAN_BY_KEY() + // This heuristics is based on the experiment in https://github.com/pytorch/pytorch/pull/74267 + return (num_slices <= 20 && slice_size >= 20000) || + (num_slices > 20 && num_slices <= 40 && slice_size >= 10000) || + (num_slices > 40 && num_slices <= 80 && slice_size >= 8000) || + (num_slices > 80 && num_slices < 200 && slice_size >= 5000) || + (num_slices >= 200 && num_slices < 800 && slice_size >= 3000) || + (num_slices >= 800 && num_slices <= 4000 && slice_size >= 800) || + (num_slices > 4000 && slice_size >= 400); +#else + // This heuristics is based on the experiment in https://github.com/pytorch/pytorch/pull/71081 + return (num_slices <= 400 && slice_size >= 5000) || + (num_slices > 400 && num_slices < 4000 && slice_size >= 1000) || + (num_slices >= 4000 && slice_size >= 300); +#endif +} + +void launch_gather_topk_kernel( + const TensorBase& self, int64_t k, int64_t dim, bool largest, + const TensorBase& values, const TensorBase& indices) { + int numDims = self.dim(); + numDims = numDims == 0 ? 1 : numDims; + TORCH_CHECK(numDims <= MAX_DIMS, "input tensor has too many dimensions"); + int64_t sliceSize = self.dim() == 0 ? 
1 : self.size(dim); + + auto input = self.contiguous(); + // static_cast is required to ensure that the correct type (INDEX_T) + // is provided to the kernel for the arguments. +#define RUN_K(INDEX_T, DIM, LAUNCH_FUNCTION_NAME) \ + LAUNCH_FUNCTION_NAME( \ + inputInfo, \ + static_cast(sliceSize), \ + static_cast(k), \ + largest, \ + static_cast(numInputSlices), \ + /* The actual dimension that the k-selection is running in */ \ + /* may have changed from collapseDims() */ \ + static_cast(inputInfo.strides[collapseInputDim]), \ + topKInfo, \ + static_cast(topKInfo.strides[collapseTopKDim]), \ + indicesInfo, \ + static_cast(indicesInfo.strides[collapseIndicesDim])); + +#define RUN_MB(INDEX_T, DIM) \ + if (should_use_multiblock(numInputSlices, sliceSize)) { \ + RUN_K(INDEX_T, DIM, mbtopk::launch); \ + } else { \ + RUN_K(INDEX_T, DIM, sbtopk::launch); \ + } + +#define RUN_DIM(INDEX_T) \ + if (allDims == 1) { \ + RUN_MB(INDEX_T, 1); \ + } else if (allDims == 2) { \ + RUN_MB(INDEX_T, 2); \ + } else if (allDims == 3) { \ + RUN_MB(INDEX_T, 3); \ + } else { \ + RUN_MB(INDEX_T, -1); \ + } + +#define RUN_T(INDEX_T) \ + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, input.scalar_type(), "topk_out_zoom", [&] { \ + at::zoom::detail::TensorInfo inputInfo = \ + at::zoom::detail::getTensorInfo(input); \ + at::zoom::detail::TensorInfo topKInfo = \ + at::zoom::detail::getTensorInfo(values); \ + at::zoom::detail::TensorInfo indicesInfo = \ + at::zoom::detail::getTensorInfo(indices); \ + /* tensorInfoLegacyIfScalar*/ \ + if (!input.dim()) { \ + inputInfo.dims = 1; \ + inputInfo.sizes[0] = 1; \ + inputInfo.strides[0] = 1; \ + topKInfo.dims = 1; \ + topKInfo.sizes[0] = 1; \ + topKInfo.strides[0] = 1; \ + indicesInfo.dims = 1; \ + indicesInfo.sizes[0] = 1; \ + indicesInfo.strides[0] = 1; \ + } \ + /* We use these structures solely to find the offset to */ \ + /* each slice we are operating on */ \ + inputInfo.sizes[dim] = 1; \ + topKInfo.sizes[dim] = 1; \ + indicesInfo.sizes[dim] = 1; \ + /* stash the stride of dim because it can be accidentally collapsed */ \ + auto strideTopK = topKInfo.strides[dim]; \ + auto strideIndices = indicesInfo.strides[dim]; \ + /* Collapse all other dims */ \ + int collapseInputDim = inputInfo.collapseDims(dim); \ + int collapseTopKDim = topKInfo.collapseDims(dim); \ + int collapseIndicesDim = indicesInfo.collapseDims(dim); \ + /* restore stride in case it was collapsed */ \ + topKInfo.strides[collapseTopKDim] = strideTopK; \ + indicesInfo.strides[collapseIndicesDim] = strideIndices; \ + int64_t numInputSlices = 1; \ + for (int i = 0; i < inputInfo.dims; ++i) { \ + numInputSlices *= inputInfo.sizes[i]; \ + } \ + \ + /* This is used as a template parameter to calculate indices. */ \ + /* We only specialize it if all collapsed dim sizes are the */ \ + /* same; otherwise, we use -1 which is the specialization */ \ + /* parameter for arbitrary dimensions */ \ + int allDims = inputInfo.dims; \ + if (topKInfo.dims != allDims || indicesInfo.dims != allDims) { \ + allDims = -1; \ + } \ + \ + RUN_DIM(INDEX_T); \ + }); + + // the below is safe with 0-dimensional tensors because it is based on + // TensorInfo which implicitly expands to 1-dimensional. 
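The dispatch that follows chooses between 32-bit and 64-bit index arithmetic: whenever the input, values, and indices tensors can all be addressed with 32-bit offsets, the kernels are instantiated with the narrower index type, which reduces register pressure and address-math cost. A minimal stand-alone sketch of the pattern is below; the helper names are hypothetical and the size check is deliberately simplified (the real canUse32BitIndexMath also accounts for strides and storage offsets).

// Illustrative index-type dispatch (hypothetical helper names, not the patch's API).
#include <cstdint>
#include <cstdio>
#include <limits>

template <typename IndexType>
void run_topk_impl(int64_t numel) {
  std::printf("topk with %zu-byte indices over %lld elements\n",
              sizeof(IndexType), static_cast<long long>(numel));
}

static bool fits_in_32bit(int64_t numel) {
  return numel <= std::numeric_limits<int32_t>::max();
}

int main() {
  const int64_t sizes[] = {int64_t{1} << 20, int64_t{1} << 33};
  for (int64_t numel : sizes) {
    if (fits_in_32bit(numel)) {
      run_topk_impl<uint32_t>(numel);   // cheaper indexing for small tensors
    } else {
      run_topk_impl<uint64_t>(numel);   // fall back to 64-bit offsets
    }
  }
  return 0;
}

The RUN_T macro above expands to exactly this choice, with RUN_DIM and RUN_MB further specializing on the collapsed dimensionality and on the single- versus multi-block kernels.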
+ if (input.numel() > 0) { + // Based on required index size, run the algorithm with the + // appropriate index type + if (at::zoom::detail::canUse32BitIndexMath(input) && + at::zoom::detail::canUse32BitIndexMath(values) && + at::zoom::detail::canUse32BitIndexMath(indices)) { + RUN_T(uint32_t); + } else { + RUN_T(uint64_t); + } + } +#undef RUN_T +#undef RUN_DIM +#undef RUN_K +} + +} // at::native diff --git a/aten/src/ATen/native/zoom/TensorTopK.h b/aten/src/ATen/native/zoom/TensorTopK.h new file mode 100644 index 00000000000000..9eebf2cd6040c4 --- /dev/null +++ b/aten/src/ATen/native/zoom/TensorTopK.h @@ -0,0 +1,14 @@ +#pragma once +#include + +namespace at { +class TensorBase; +} + +namespace at { +namespace native { +void launch_gather_topk_kernel( + const TensorBase& self, + int64_t k, int64_t dim, bool largest, + const TensorBase& values, const TensorBase& indices); +}} diff --git a/aten/src/ATen/native/zoom/TriangularOps.cu b/aten/src/ATen/native/zoom/TriangularOps.cu new file mode 100644 index 00000000000000..b1bd67b8f501c5 --- /dev/null +++ b/aten/src/ATen/native/zoom/TriangularOps.cu @@ -0,0 +1,165 @@ +#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + +#include + +#define BOOL_SWITCH(COND, CONST_NAME, ...) \ + [&] { \ + if (COND) { \ + constexpr static bool CONST_NAME = true; \ + return __VA_ARGS__(); \ + } else { \ + constexpr static bool CONST_NAME = false; \ + return __VA_ARGS__(); \ + } \ + }() + +namespace at::native { + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triu/tril ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +constexpr static int block_size = 128; + +template +C10_LAUNCH_BOUNDS_1(block_size) +__global__ void triu_tril_kernel( + zoom::detail::TensorInfo result_info, + const zoom::detail::TensorInfo self_info, + const int64_t k, + const int64_t N_padded, + const IndexType last_dim_padded) { + int64_t linear_idx = (blockIdx.x * blockDim.x + threadIdx.x) * elements_per_thread; + if (linear_idx >= N_padded) { + return; + } + + auto dims = self_info.dims; + + // Compute column index amd row index + IndexType col = linear_idx % last_dim_padded; + linear_idx /= last_dim_padded; + IndexType row = linear_idx % self_info.sizes[dims - 2]; + + if constexpr (inplace) { + bool mask_all_true = upper ? (col - row >= k) : (col + elements_per_thread - row <= k); + if (mask_all_true) + return; + } + + // Compute offset + IndexType self_offset = 0, result_offset = 0; + self_offset += self_info.strides[dims - 1] * col; + result_offset += result_info.strides[dims - 1] * col; + linear_idx /= self_info.sizes[dims - 2]; + self_offset += self_info.strides[dims - 2] * row; + result_offset += result_info.strides[dims - 2] * row; + + // Compute remaining offsets + IndexType running_index; + #pragma unroll + for (IndexType i = dims - 3; i >= 0; --i) { + running_index = linear_idx % self_info.sizes[i]; + linear_idx /= self_info.sizes[i]; + self_offset += running_index * self_info.strides[i]; + result_offset += running_index * result_info.strides[i]; + } + + if constexpr (inplace) { + #pragma unroll + for (int i = 0; i < elements_per_thread && col + i < self_info.sizes[dims - 1]; i++) { + bool mask = upper ? 
(col + i - row >= k) : (col + i - row <= k); + if (!mask) + result_info.data[result_offset + i * result_info.strides[dims - 1]] = scalar_t(0); + } + } else { + scalar_t frag[elements_per_thread] = {}; + bool has_mask = (upper && col + elements_per_thread - row >= k) || (!upper && col - row <= k); + if (has_mask) { + #pragma unroll + for (int i = 0; i < elements_per_thread && col + i < self_info.sizes[dims - 1]; i++) + frag[i] = self_info.data[self_offset + i * self_info.strides[dims - 1]]; + + #pragma unroll + for (int i = 0; i < elements_per_thread; i++) { + bool mask = upper ? (col + i - row >= k) : (col + i - row <= k); + frag[i] = mask ? frag[i] : scalar_t(0); + } + } + + #pragma unroll + for (int i = 0; i < elements_per_thread && col + i < self_info.sizes[dims - 1]; i++) + result_info.data[result_offset + i * result_info.strides[dims - 1]] = frag[i]; + } +} + +template +void triu_tril_zoom_template(const Tensor& result, const Tensor& self, int64_t k, const char* name) { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + at::ScalarType::ComplexHalf, + at::ScalarType::Half, + at::ScalarType::BFloat16, + at::ScalarType::Bool, + self.scalar_type(), "triu_tril_zoom_template", [&] { + constexpr int elements_per_thread = sizeof(scalar_t) < 8 ? 8 / sizeof(scalar_t) : 1; + auto sizes = self.sizes(); + int64_t last_dim_padded = round_up(sizes.back(), elements_per_thread); + int64_t N_padded = c10::multiply_integers(sizes.begin(), sizes.end() - 1) * last_dim_padded; + dim3 dim_block = block_size; + dim3 dim_grid((N_padded / elements_per_thread + dim_block.x - 1) / dim_block.x); + if (zoom::detail::canUse32BitIndexMath(result) && zoom::detail::canUse32BitIndexMath(self)) { + auto result_info = zoom::detail::getTensorInfo(result); + auto self_info = zoom::detail::getTensorInfo(self); + BOOL_SWITCH(self.is_same(result), inplace, [&] { + hipLaunchKernelGGL(( triu_tril_kernel) + , dim3(dim_grid), dim3(dim_block), 0, c10::zoom::getCurrentZoomStream(), + result_info, self_info, k, N_padded, last_dim_padded); + }); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } else { + auto result_info = zoom::detail::getTensorInfo(result); + auto self_info = zoom::detail::getTensorInfo(self); + BOOL_SWITCH(self.is_same(result), inplace, [&] { + hipLaunchKernelGGL(( triu_tril_kernel) + , dim3(dim_grid), dim3(dim_block), 0, c10::zoom::getCurrentZoomStream(), + result_info, self_info, k, N_padded, last_dim_padded); + }); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } + }); +} + +TORCH_IMPL_FUNC(tril_zoom)(const Tensor& self, int64_t k, const Tensor &result) { + if (self.numel() != 0) { + triu_tril_zoom_template(result, self, k, "tril"); + } +} + +TORCH_IMPL_FUNC(triu_zoom)(const Tensor& self, int64_t k, const Tensor &result) { + if (self.numel() != 0) { + triu_tril_zoom_template(result, self, k, "triu"); + } +} + +Tensor trace_zoom(const Tensor& self) { + TORCH_CHECK(self.dim() == 2, "expected a matrix"); + return self.diagonal().sum(); +} + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/UnaryGeometricCosKernel.cu b/aten/src/ATen/native/zoom/UnaryGeometricCosKernel.cu new file mode 100644 index 00000000000000..d76b21a6bcd095 --- /dev/null +++ b/aten/src/ATen/native/zoom/UnaryGeometricCosKernel.cu @@ -0,0 +1,58 @@ +// !!! This is a file automatically generated by hipify!!! 
+#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { + +#if AT_USE_JITERATOR() +CONSTEXPR_EXCEPT_WIN_CUDA char cos_name[] = "cos_impl"; +#endif // AT_USE_JITERATOR() + +void cos_kernel_zoom(TensorIteratorBase& iter) { + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { +#if AT_USE_JITERATOR() + static const auto cos_string = jiterator_stringify( + template T cos_impl(T a) { return std::cos(a); }); + AT_DISPATCH_COMPLEX_TYPES_AND( + kComplexHalf, common_dtype, "cos_name", [&]() { + jitted_gpu_kernel< + /*name=*/cos_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, cos_string); + }); +#else + AT_DISPATCH_COMPLEX_TYPES_AND( + kComplexHalf, common_dtype, "cos_name", [&]() { + gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { + using opmath_t = at::opmath_type; + return ::cos(static_cast(a)); + }); + }); +#endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + common_dtype, + "cos_zoom", + [&]() { + gpu_kernel( + iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { return ::cos(a); }); + }); + } +} + +REGISTER_PRIVATEUSE1_DISPATCH(cos_stub, &cos_kernel_zoom); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/UnaryGeometricSinKernel.cu b/aten/src/ATen/native/zoom/UnaryGeometricSinKernel.cu new file mode 100644 index 00000000000000..d7417fb6477a87 --- /dev/null +++ b/aten/src/ATen/native/zoom/UnaryGeometricSinKernel.cu @@ -0,0 +1,58 @@ +// !!! This is a file automatically generated by hipify!!! +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { + +#if AT_USE_JITERATOR() +CONSTEXPR_EXCEPT_WIN_CUDA char sin_name[] = "sin_impl"; +#endif + +void sin_kernel_zoom(TensorIteratorBase& iter) { + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { +#if AT_USE_JITERATOR() + static const auto sin_string = jiterator_stringify( + template T sin_impl(T a) { return std::sin(a); }); + AT_DISPATCH_COMPLEX_TYPES_AND( + kComplexHalf, common_dtype, "sin_name", [&]() { + jitted_gpu_kernel< + /*name=*/sin_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, sin_string); + }); +#else + AT_DISPATCH_COMPLEX_TYPES_AND( + kComplexHalf, common_dtype, "sin_name", [&]() { + gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { + using opmath_t = at::opmath_type; + return ::sin(static_cast(a)); + }); + }); +#endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + common_dtype, + "sin_zoom", + [&]() { + gpu_kernel( + iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { return ::sin(a); }); + }); + } +} + +REGISTER_PRIVATEUSE1_DISPATCH(sin_stub, &sin_kernel_zoom); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/UnarySignKernels.cu b/aten/src/ATen/native/zoom/UnarySignKernels.cu new file mode 100644 index 00000000000000..57362dfcd6007d --- /dev/null +++ b/aten/src/ATen/native/zoom/UnarySignKernels.cu @@ -0,0 +1,121 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at::native { + +void logical_not_kernel_zoom(TensorIteratorBase& iter) { + // error check -- this is just ensuring we don't dispatch on types that aren't in ALL_TYPES_AND_COMPLEX_AND3(...) 
+ // so we don't have to maintain a separate list or to do double dispatch. + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(0), "logical_not_zoom", [&]() {}); + + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(1), "logical_not_zoom", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> bool { return !a; }); + }); +} + +// NB: Ignores the negative bit on tensors +CONSTEXPR_EXCEPT_WIN_CUDA char neg_name[] = "neg_kernel"; +void neg_kernel_zoom(TensorIteratorBase& iter) { + auto dtype = iter.dtype(); + if (at::isComplexType(dtype)) { + static const auto neg_string = jiterator_stringify( + template + T neg_kernel(T a) { + return -a; + } + ); // neg_string + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "neg_zoom", [&]() { + jitted_gpu_kernel< + /*name=*/ neg_name, + /*return_dtype=*/ scalar_t, + /*common_dtype=*/ scalar_t, + /*arity=*/ 1>(iter, neg_string); + }); + + } else { + AT_DISPATCH_ALL_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, dtype, "neg_zoom", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return -a; + }); + }); + } +} + +void sign_kernel_zoom(TensorIteratorBase& iter){ + if (iter.dtype() == ScalarType::Bool) { + gpu_kernel(iter, []GPU_LAMBDA(bool a){ + return a; + }); + } else { + AT_DISPATCH_ALL_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "sign_zoom", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return c10::signum(a); + }); + }); + } +} + +void signbit_kernel_zoom(TensorIteratorBase& iter){ + // NOTE: signbit does not always support integral arguments. + if (at::isIntegralType(iter.input_dtype(), /*includeBool=*/false)) { + AT_DISPATCH_INTEGRAL_TYPES(iter.input_dtype(), "signbit_zoom", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> bool { return is_negative(a); }); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, ScalarType::Half, iter.input_dtype(), "signbit_zoom", [&]() { + using opmath_t = at::opmath_type; + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> bool { return signbit(opmath_t{a}); }); + }); + } +} + +template +C10_HOST_DEVICE static inline c10::complex sgn_wrapper(c10::complex z) { + if (z == c10::complex(0, 0)) { + return c10::complex(0, 0); + } else { + return z / std::abs(z); + } +} + +CONSTEXPR_EXCEPT_WIN_CUDA char sgn_name[] = "sgn_kernel"; +void sgn_kernel_zoom(TensorIteratorBase& iter){ + auto dtype = iter.dtype(); + static const auto sgn_string = jiterator_stringify( + template + T sgn_kernel(T z) { + const T zero = T(0); + if (z == zero) { + return zero; + } else { + return z / std::abs(z); + } + } + ); // sgn_string + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "sgn_zoom", [&]() { + jitted_gpu_kernel< + /*name=*/ sgn_name, + /*return_dtype=*/ scalar_t, + /*common_dtype=*/ scalar_t, + /*arity=*/ 1>(iter, sgn_string); + }); +} + +REGISTER_PRIVATEUSE1_DISPATCH(logical_not_stub, &logical_not_kernel_zoom); +REGISTER_PRIVATEUSE1_DISPATCH(neg_stub, &neg_kernel_zoom); +REGISTER_PRIVATEUSE1_DISPATCH(sign_stub, &sign_kernel_zoom); +REGISTER_PRIVATEUSE1_DISPATCH(signbit_stub, &signbit_kernel_zoom); +REGISTER_PRIVATEUSE1_DISPATCH(sgn_stub, &sgn_kernel_zoom); + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/block_reduce.cuh b/aten/src/ATen/native/zoom/block_reduce.cuh new file mode 100644 index 00000000000000..16f9b2ba6b492a --- /dev/null +++ b/aten/src/ATen/native/zoom/block_reduce.cuh @@ -0,0 +1,143 @@ +#pragma once + +#include + +#include +#include + +namespace at 
{ +namespace native { +namespace zoom_utils { + +constexpr int kHIPBlockReduceNumThreads = 512; +// Algorithmic limitation: BlockReduce does two WarpReduce calls, each +// of which reduces C10_WARP_SIZE elements. So, at most +// C10_WARP_SIZE**2 elements can be reduced at a time. +// NOTE: This is >= the max block size on current hardware anyway (1024). +constexpr int kHIPBlockReduceMaxThreads = C10_WARP_SIZE * C10_WARP_SIZE; + +// Sums `val` across all threads in a warp. +// +// Assumptions: +// - The size of each block should be a multiple of `C10_WARP_SIZE` +template +__inline__ __device__ T WarpReduceSum(T val) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val += WARP_SHFL_DOWN(val, offset); + } + return val; +} + +// Picks the maximum `val` across all threads in a warp. +// +// Assumptions: +// - The size of each block should be a multiple of `C10_WARP_SIZE` +template +__inline__ __device__ T WarpReduceMax(T val) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val = max_propagate_nan(val, WARP_SHFL_DOWN(val, offset)); + } + return val; +} + +struct Block1D { + static __forceinline__ __device__ int Tid() { return threadIdx.x; } + + static __forceinline__ __device__ int Warps() { + return blockDim.x / C10_WARP_SIZE; + } +}; + +struct Block2D { + static __forceinline__ __device__ int Tid() { + return threadIdx.x + threadIdx.y * blockDim.x; + } + + static __forceinline__ __device__ int Warps() { + return blockDim.x * blockDim.y / C10_WARP_SIZE; + } +}; + +// Sums `val` across all threads in a block. +// +// Warning: the return value is only valid for thread 0. +// Assumptions: +// - The size of each block should be a multiple of `C10_WARP_SIZE` +// - `shared` should be a pointer to shared memory with size of, at least, +// `sizeof(T) * number_of_warps` +template +__inline__ __device__ T BlockReduceSum(T val, T* shared) { + const int tid = B::Tid(); + const int lid = tid % C10_WARP_SIZE; + const int wid = tid / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); // prevent races when BlockReduces are called in a row. + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (tid < B::Warps()) ? shared[lid] : T(0); + if (wid == 0) { + val = WarpReduceSum(val); + } + return val; +} + +// Picks out the maximum `val` across all threads in a block. +// +// Warning: the return value is only valid for thread 0. +// Assumptions: +// - The size of each block should be a multiple of `C10_WARP_SIZE` +// - `shared` should be a pointer to shared memory with size of, at least, +// `sizeof(T) * number_of_warps` +template +__inline__ __device__ T BlockReduceMax(T val, T* shared) { + const int tid = B::Tid(); + const int lid = tid % C10_WARP_SIZE; + const int wid = tid / C10_WARP_SIZE; + val = WarpReduceMax(val); + __syncthreads(); // prevent races when BlockReduces are called in a row. + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (tid < B::Warps()) ? 
shared[lid] : T(0); + if (wid == 0) { + val = WarpReduceMax(val); + } + return val; +} + +template +__inline__ __device__ T WarpReduce(T val, const ReduceOp& op) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val = op.combine(val, op.warp_shfl_down(val, offset)); + } + return val; +} + +template +__inline__ __device__ T +BlockReduce(T val, const ReduceOp& op, const T& identity_element, T* shared) { + const int tid = B::Tid(); + const int lid = tid % C10_WARP_SIZE; + const int wid = tid / C10_WARP_SIZE; + val = WarpReduce(val, op); + __syncthreads(); // prevent races when BlockReduces are called in a row. + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (tid < B::Warps()) ? shared[lid] : identity_element; + if (wid == 0) { + val = WarpReduce(val, op); + } + return val; +} + +} // namespace zoom_utils +} // namespace native +} // namespace at \ No newline at end of file diff --git a/torchgen/dest/ufunc.py b/torchgen/dest/ufunc.py index 999f7489a8ff66..51dc66bfc6af88 100644 --- a/torchgen/dest/ufunc.py +++ b/torchgen/dest/ufunc.py @@ -349,7 +349,7 @@ def compute_ufunc_zoom(g: NativeFunctionsGroup) -> str: {dtype_cases_str} ); }} -REGISTER_DISPATCH({stub_sig.name}, &{stub_sig.kernel_name}); +REGISTER_PRIVATEUSE1_DISPATCH({stub_sig.name}, &{stub_sig.kernel_name}); {sig.defn()} {{ {stub_sig.direct_call(sig.arguments())}; }} From b33031d27b8ccd1e5d271f68c5abaf2735469755 Mon Sep 17 00:00:00 2001 From: 123epsilon Date: Mon, 13 Jan 2025 01:04:45 +0000 Subject: [PATCH 05/23] fix matmul kernel --- aten/src/ATen/native/zoom/Bmm.cpp | 7 +++--- aten/src/ATen/native/zoom/HIPbmm.cu | 38 +++++++++++++++++------------ 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/aten/src/ATen/native/zoom/Bmm.cpp b/aten/src/ATen/native/zoom/Bmm.cpp index f95e530655919f..53e87a7eb3913e 100644 --- a/aten/src/ATen/native/zoom/Bmm.cpp +++ b/aten/src/ATen/native/zoom/Bmm.cpp @@ -28,14 +28,15 @@ namespace at::native { } else if (batch1.size(2) == 0) { return result.zero_(); } + TORCH_CHECK(batch1.sizes()[2] == batch2.sizes()[1], "batch1 dim 2 must match batch2 dim 1"); c10::MaybeOwned result_ = c10::MaybeOwned::borrowed(result); IntArrayRef result_strides = result.strides(); IntArrayRef result_sizes = result.sizes(); - int m = result_sizes[1]; - int n = result_sizes[2]; - int k = batch1.sizes()[2]; + int m = batch1.sizes()[1]; + int n = batch1.sizes()[2]; + int k = batch2.sizes()[2]; int num_batches = result_->sizes()[0]; AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "bmm_hip", [&] { diff --git a/aten/src/ATen/native/zoom/HIPbmm.cu b/aten/src/ATen/native/zoom/HIPbmm.cu index 84f5eb2aaf6201..a77a31efaf1af6 100644 --- a/aten/src/ATen/native/zoom/HIPbmm.cu +++ b/aten/src/ATen/native/zoom/HIPbmm.cu @@ -2,9 +2,14 @@ #include #include #include +#include namespace at::native { + int num_threads() { + return 32; + } + // Helper function to convert hip_bfloat16 to float __device__ float bfloat16_to_float(hip_bfloat16 a) { union { @@ -63,64 +68,65 @@ namespace at::native { int col = blockIdx.x * blockDim.x + threadIdx.x; int batch = blockIdx.z; - if (row < M && col < N) { + if (row < M && col < K && batch < batch_size) { float sum = 0.0f; - for (int k = 0; k < K; ++k) { - sum += convert_to_float(A[batch * M * K + row * K + k]) * - convert_to_float(B[batch * K * N + k * N + col]); + for (int n = 0; n < N; ++n) { + sum += convert_to_float(A[batch * M * N + row * N + n]) * + convert_to_float(B[batch * N * K + 
n * K + col]); } - C[batch * M * N + row * N + col] = convert_from_float(sum); + C[batch * M * K + row * K + col] = convert_from_float(sum); } } template void batched_matmul(const T* A, const T* B, T* C, int M, int N, int K, int batch_size) { - dim3 threadsPerBlock(16, 16); - dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x, + dim3 threadsPerBlock(num_threads(), num_threads()); + dim3 numBlocks((K + threadsPerBlock.x - 1) / threadsPerBlock.x, (M + threadsPerBlock.y - 1) / threadsPerBlock.y, batch_size); - hipLaunchKernelGGL(batched_matmul_kernel, numBlocks, threadsPerBlock, 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(batched_matmul_kernel), numBlocks, threadsPerBlock, 0, 0, A, B, C, M, N, K, batch_size); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); } // Specialization for at::Half template <> void batched_matmul(const at::Half* A, const at::Half* B, at::Half* C, int M, int N, int K, int batch_size) { - dim3 threadsPerBlock(16, 16); - dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x, + dim3 threadsPerBlock(num_threads(), num_threads()); + dim3 numBlocks((K + threadsPerBlock.x - 1) / threadsPerBlock.x, (M + threadsPerBlock.y - 1) / threadsPerBlock.y, batch_size); - hipLaunchKernelGGL(batched_matmul_kernel<__half>, numBlocks, threadsPerBlock, 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(batched_matmul_kernel<__half>), numBlocks, threadsPerBlock, 0, 0, reinterpret_cast(A), reinterpret_cast(B), reinterpret_cast<__half*>(C), M, N, K, batch_size); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); } // Specialization for at::BFloat16 template <> void batched_matmul(const at::BFloat16* A, const at::BFloat16* B, at::BFloat16* C, int M, int N, int K, int batch_size) { - dim3 threadsPerBlock(16, 16); - dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x, + dim3 threadsPerBlock(num_threads(), num_threads()); + dim3 numBlocks((K + threadsPerBlock.x - 1) / threadsPerBlock.x, (M + threadsPerBlock.y - 1) / threadsPerBlock.y, batch_size); - hipLaunchKernelGGL(batched_matmul_kernel, numBlocks, threadsPerBlock, 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(batched_matmul_kernel), numBlocks, threadsPerBlock, 0, 0, reinterpret_cast(A), reinterpret_cast(B), reinterpret_cast(C), M, N, K, batch_size); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); } // Explicit instantiations for supported types template void batched_matmul(const float*, const float*, float*, int, int, int, int); template void batched_matmul(const double*, const double*, double*, int, int, int, int); - template void batched_matmul(const half*, const half*, half*, int, int, int, int); - template void batched_matmul(const hip_bfloat16*, const hip_bfloat16*, hip_bfloat16*, int, int, int, int); } // at::native \ No newline at end of file From 1aa7e9241d8c4b7190da5e0ce48bfe6d4d1e732d Mon Sep 17 00:00:00 2001 From: 123epsilon Date: Thu, 16 Jan 2025 22:53:31 +0000 Subject: [PATCH 06/23] some ops + llama script running --- aten/src/ATen/native/native_functions.yaml | 85 +-- .../native/zoom/BinaryBitwiseOpsKernels.cu | 78 ++ .../zoom/DistributionExponentialKernel.cu | 16 + .../ATen/native/zoom/DistributionTemplates.h | 671 ++++++++++++++++++ aten/src/ATen/native/zoom/IndexKernel.cu | 463 ++++++++++++ aten/src/ATen/native/zoom/IndexKernel.h | 16 + aten/src/ATen/native/zoom/LaunchUtils.h | 18 + .../src/ATen/native/zoom/MultinomialKernel.cu | 462 ++++++++++++ .../ATen/native/zoom/ReduceAMinMaxKernel.cu | 45 ++ .../ATen/native/zoom/ReduceArgMaxKernel.cu | 46 ++ .../ATen/native/zoom/ReduceArgMinKernel.cu | 46 ++ .../src/ATen/native/zoom/ReduceLogicKernel.cu | 2 +- 
.../ATen/native/zoom/ReduceMaxValuesKernel.cu | 61 ++ .../ATen/native/zoom/ReduceMinValuesKernel.cu | 58 ++ aten/src/ATen/native/zoom/ReduceNormKernel.cu | 51 ++ aten/src/ATen/native/zoom/ReduceOps.cpp | 102 +++ aten/src/ATen/native/zoom/ReduceOps.h | 20 + .../ATen/native/zoom/ReduceSumProdKernel.cu | 215 ++++++ .../ATen/native/zoom/ScatterGatherKernel.cu | 573 +++++++++++++++ aten/src/ATen/native/zoom/UnaryOpsKernel.cu | 286 ++++++++ 20 files changed, 3271 insertions(+), 43 deletions(-) create mode 100644 aten/src/ATen/native/zoom/BinaryBitwiseOpsKernels.cu create mode 100644 aten/src/ATen/native/zoom/DistributionExponentialKernel.cu create mode 100644 aten/src/ATen/native/zoom/DistributionTemplates.h create mode 100644 aten/src/ATen/native/zoom/IndexKernel.cu create mode 100644 aten/src/ATen/native/zoom/IndexKernel.h create mode 100644 aten/src/ATen/native/zoom/LaunchUtils.h create mode 100644 aten/src/ATen/native/zoom/MultinomialKernel.cu create mode 100644 aten/src/ATen/native/zoom/ReduceAMinMaxKernel.cu create mode 100644 aten/src/ATen/native/zoom/ReduceArgMaxKernel.cu create mode 100644 aten/src/ATen/native/zoom/ReduceArgMinKernel.cu create mode 100644 aten/src/ATen/native/zoom/ReduceMaxValuesKernel.cu create mode 100644 aten/src/ATen/native/zoom/ReduceMinValuesKernel.cu create mode 100644 aten/src/ATen/native/zoom/ReduceNormKernel.cu create mode 100644 aten/src/ATen/native/zoom/ReduceOps.cpp create mode 100644 aten/src/ATen/native/zoom/ReduceOps.h create mode 100644 aten/src/ATen/native/zoom/ReduceSumProdKernel.cu create mode 100644 aten/src/ATen/native/zoom/ScatterGatherKernel.cu create mode 100644 aten/src/ATen/native/zoom/UnaryOpsKernel.cu diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index a5876201f7e9c6..1664a6642b4cc4 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -816,7 +816,7 @@ - func: argmax.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) structured: True dispatch: - CPU, CUDA: argmax_out + CPU, CUDA, PrivateUse1: argmax_out MPS: argmax_out_mps - func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor @@ -828,7 +828,7 @@ - func: argmin.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) 
structured: True dispatch: - CPU, CUDA: argmin_out + CPU, CUDA, PrivateUse1: argmin_out MPS: argmin_out_mps - func: acosh(Tensor self) -> Tensor @@ -1194,7 +1194,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: bitwise_not_out + CPU, CUDA, PrivateUse1: bitwise_not_out MPS: bitwise_not_out_mps tags: pointwise @@ -2564,7 +2564,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: exp_out + CPU, CUDA, PrivateUse1: exp_out MPS: exp_out_mps tags: pointwise @@ -2609,7 +2609,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: expm1_out + CPU, CUDA, PrivateUse1: expm1_out MPS: expm1_out_mps SparseCPU, SparseCUDA: expm1_sparse_out SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr_out @@ -3061,7 +3061,7 @@ precomputed: - indices -> DimVector sizes, DimVector strides dispatch: - CPU, CUDA, MPS: index_out + CPU, CUDA, MPS, PrivateUse1: index_out # Used by inductor to signal indexing without bounds checks # Note that we don't support boolean indexing, to avoid dynamic output shapes @@ -3076,7 +3076,7 @@ precomputed: - dim -> int dim dispatch: - CPU, CUDA: index_copy_out + CPU, CUDA, PrivateUse1: index_copy_out - func: index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) variants: method @@ -3298,7 +3298,7 @@ - func: nan_to_num.out(Tensor self, float? nan=None, float? posinf=None, float? neginf=None, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU, CUDA: nan_to_num_out + CPU, CUDA, PrivateUse1: nan_to_num_out MPS: nan_to_num_out_mps SparseCPU, SparseCUDA: nan_to_num_sparse_out tags: pointwise @@ -3797,7 +3797,7 @@ device_check: NoCheck # TensorIterator structured: True dispatch: - CPU, CUDA: aminmax_out + CPU, CUDA, PrivateUse1: aminmax_out MPS: aminmax_out_mps - func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor @@ -3822,7 +3822,7 @@ precomputed: - dim -> int dim dispatch: - CPU, CUDA: max_out + CPU, CUDA, PrivateUse1: max_out MPS: max_out_mps - func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) @@ -4013,7 +4013,7 @@ precomputed: - dim -> int dim dispatch: - CPU, CUDA: min_out + CPU, CUDA, PrivateUse1: min_out MPS: min_out_mps - func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) @@ -5132,7 +5132,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: rsqrt_out + CPU, CUDA, PrivateUse1: rsqrt_out MPS: rsqrt_out_mps tags: pointwise @@ -5764,7 +5764,7 @@ structured: True device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: sum_out + CPU, CUDA, PrivateUse1: sum_out MPS: sum_out_mps - func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) @@ -5778,12 +5778,12 @@ - func: nansum(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method dispatch: - CPU, CUDA: nansum + CPU, CUDA, PrivateUse1: nansum MPS: nansum_mps - func: nansum.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) 
dispatch: - CPU, CUDA: nansum_out + CPU, CUDA, PrivateUse1: nansum_out MPS: nansum_out_mps - func: sum_to_size(Tensor self, SymInt[] size) -> Tensor @@ -5816,7 +5816,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: sqrt_out + CPU, CUDA, PrivateUse1: sqrt_out MPS: sqrt_out_mps SparseCPU, SparseCUDA: sqrt_sparse_out SparseCsrCPU, SparseCsrCUDA: sqrt_sparse_csr_out @@ -5911,7 +5911,7 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: - CPU, CUDA: prod + CPU, CUDA, PrivateUse1: prod MPS: prod_mps autogen: prod.out tags: core @@ -5926,7 +5926,7 @@ structured: True device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: prod_out + CPU, CUDA, PrivateUse1: prod_out MPS: prod_out_mps - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor @@ -6107,7 +6107,7 @@ - func: flip(Tensor self, int[] dims) -> Tensor variants: function, method dispatch: - CPU, QuantizedCPU, CUDA, QuantizedCUDA: flip + CPU, QuantizedCPU, CUDA, QuantizedCUDA, PrivateUse1: flip MPS: flip_mps autogen: flip.out tags: core @@ -6770,7 +6770,7 @@ - func: frexp.Tensor_out(Tensor self, *, Tensor(a!) mantissa, Tensor(b!) exponent) -> (Tensor(a!) mantissa, Tensor(b!) exponent) dispatch: - CPU, CUDA: frexp_out + CPU, CUDA, PrivateUse1: frexp_out tags: pointwise # Deprecated (v.1.12) @@ -8048,7 +8048,7 @@ - func: put_(Tensor(a!) self, Tensor index, Tensor source, bool accumulate=False) -> Tensor(a!) variants: method dispatch: - CPU, CUDA: put_ + CPU, CUDA, PrivateUse1: put_ autogen: put.out - func: put(Tensor self, Tensor index, Tensor source, bool accumulate=False) -> Tensor @@ -8102,6 +8102,7 @@ dispatch: CPU: index_fill_ CUDA: index_fill_ + PrivateUse1: index_fill_ MPS: index_fill_mps_ autogen: index_fill.int_Scalar_out @@ -8115,7 +8116,7 @@ device_check: NoCheck # TensorIterator variants: method dispatch: - CPU, CUDA: index_fill_ + CPU, CUDA, PrivateUse1: index_fill_ MPS: index_fill_mps_ autogen: index_fill.int_Tensor_out @@ -8154,7 +8155,7 @@ structured: True variants: function dispatch: - CPU, CUDA: scatter_src_out + CPU, CUDA, PrivateUse1: scatter_src_out MPS: scatter_src_out_mps - func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor @@ -8170,7 +8171,7 @@ structured: True variants: function dispatch: - CPU, CUDA: scatter_value_out + CPU, CUDA, PrivateUse1: scatter_value_out MPS: scatter_value_out_mps - func: scatter.reduce(Tensor self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor @@ -8185,7 +8186,7 @@ structured: True variants: function dispatch: - CPU, CUDA: scatter_reduce_out + CPU, CUDA, PrivateUse1: scatter_reduce_out MPS: scatter_reduce_out_mps - func: scatter.value_reduce(Tensor self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor @@ -8200,7 +8201,7 @@ structured: True variants: function dispatch: - CPU, CUDA: scatter_value_reduce_out + CPU, CUDA, PrivateUse1: scatter_value_reduce_out MPS: scatter_value_reduce_out_mps - func: scatter.dimname_src(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor @@ -8222,7 +8223,7 @@ structured: True variants: function dispatch: - CPU, CUDA: scatter_add + CPU, CUDA, PrivateUse1: scatter_add MPS: scatter_add_mps_out - func: scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor @@ -8241,7 +8242,7 @@ structured: True variants: function dispatch: - CPU, CUDA: scatter_reduce_two + CPU, CUDA, PrivateUse1: scatter_reduce_two - func: eq_.Scalar(Tensor(a!) 
self, Scalar other) -> Tensor(a!) structured_delegate: eq.Scalar_out @@ -8259,7 +8260,7 @@ structured_inherits: TensorIteratorBase variants: function dispatch: - CPU, CUDA: bitwise_and_out + CPU, CUDA, PrivateUse1: bitwise_and_out MPS: bitwise_and_out_mps tags: pointwise @@ -8326,7 +8327,7 @@ structured_inherits: TensorIteratorBase variants: function dispatch: - CPU, CUDA: bitwise_or_out + CPU, CUDA, PrivateUse1: bitwise_or_out MPS: bitwise_or_out_mps tags: pointwise @@ -8393,7 +8394,7 @@ structured_inherits: TensorIteratorBase variants: function dispatch: - CPU, CUDA: bitwise_xor_out + CPU, CUDA, PrivateUse1: bitwise_xor_out MPS: bitwise_xor_out_mps tags: pointwise @@ -8718,7 +8719,7 @@ tags: nondeterministic_seeded variants: method dispatch: - CPU, CUDA: exponential_ + CPU, CUDA, PrivateUse1: exponential_ MPS: exponential_mps_ autogen: exponential, exponential.out @@ -9150,12 +9151,12 @@ - func: take.out(Tensor self, Tensor index, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU, CUDA: take_out + CPU, CUDA, PrivateUse1: take_out - func: take(Tensor self, Tensor index) -> Tensor variants: method, function dispatch: - CPU, CUDA: take + CPU, CUDA, PrivateUse1: take - func: take_along_dim.out(Tensor self, Tensor indices, int? dim=None, *, Tensor(a!) out) -> Tensor(a!) @@ -9248,7 +9249,7 @@ - func: gather.out(Tensor self, int dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!) structured: True dispatch: - CPU, CUDA: gather_out + CPU, CUDA, PrivateUse1: gather_out MPS: gather_out_mps - func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor @@ -9464,13 +9465,13 @@ - func: multinomial.out(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) tags: nondeterministic_seeded dispatch: - CPU, CUDA: multinomial_out + CPU, CUDA, PrivateUse1: multinomial_out MPS: multinomial_out_mps - func: multinomial(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor variants: method, function dispatch: - CPU, CUDA: multinomial + CPU, CUDA, PrivateUse1: multinomial MPS: multinomial_mps tags: nondeterministic_seeded @@ -9905,14 +9906,14 @@ device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: min + CPU, CUDA, PrivateUse1: min MPS: min_mps QuantizedCPU: min_quantized_cpu - func: min.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: min_unary_out + CPU, CUDA, PrivateUse1: min_unary_out QuantizedCPU: min_quantized_unary_out - func: fmin(Tensor self, Tensor other) -> Tensor @@ -9933,7 +9934,7 @@ device_check: NoCheck # TensorIterator variants: method, function dispatch: - CPU, CUDA: max + CPU, CUDA, PrivateUse1: max MPS: max_mps QuantizedCPU: max_quantized_cpu @@ -9980,7 +9981,7 @@ - func: max.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: max_unary_out + CPU, CUDA, PrivateUse1: max_unary_out QuantizedCPU: max_quantized_unary_out - func: minimum(Tensor self, Tensor other) -> Tensor @@ -14092,7 +14093,7 @@ python_module: linalg structured: True dispatch: - CPU, CUDA: linalg_vector_norm_out + CPU, CUDA, PrivateUse1: linalg_vector_norm_out MPS: linalg_vector_norm_out_mps - func: linalg_matrix_norm(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? 
dtype=None) -> Tensor diff --git a/aten/src/ATen/native/zoom/BinaryBitwiseOpsKernels.cu b/aten/src/ATen/native/zoom/BinaryBitwiseOpsKernels.cu new file mode 100644 index 00000000000000..fbd3657a48b6fd --- /dev/null +++ b/aten/src/ATen/native/zoom/BinaryBitwiseOpsKernels.cu @@ -0,0 +1,78 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include + +namespace at::native { + +template +struct BitwiseAndFunctor { + __device__ __forceinline__ scalar_t operator()(scalar_t a, scalar_t b) const { + return a & b; + } +}; + +template<> +struct BitwiseAndFunctor { + __device__ __forceinline__ bool operator()(bool a, bool b) const { + return a && b; + } +}; + +void bitwise_and_kernel_zoom(TensorIteratorBase& iter) { + AT_DISPATCH_INTEGRAL_TYPES_AND(kBool, iter.dtype(), "bitwise_and_zoom", [&]() { + BitwiseAndFunctor f; + opmath_symmetric_gpu_kernel_with_scalars(iter, f); + }); +} + +template +struct BitwiseOrFunctor { + __device__ __forceinline__ scalar_t operator()(scalar_t a, scalar_t b) const { + return a | b; + } +}; + +template<> +struct BitwiseOrFunctor { + __device__ __forceinline__ bool operator()(bool a, bool b) const { + return a || b; + } +}; + +void bitwise_or_kernel_zoom(TensorIteratorBase& iter) { + AT_DISPATCH_INTEGRAL_TYPES_AND(kBool, iter.dtype(), "bitwise_or_zoom", [&]() { + BitwiseOrFunctor f; + opmath_symmetric_gpu_kernel_with_scalars(iter, f); + }); +} + +template +struct BitwiseXorFunctor { + __device__ __forceinline__ scalar_t operator()(scalar_t a, scalar_t b) const { + return a ^ b; + } +}; + +template<> +struct BitwiseXorFunctor { + __device__ __forceinline__ bool operator()(bool a, bool b) const { + return a != b; + } +}; + +void bitwise_xor_kernel_zoom(TensorIteratorBase& iter) { + AT_DISPATCH_INTEGRAL_TYPES_AND(kBool, iter.dtype(), "bitwise_xor_zoom", [&]() { + BitwiseXorFunctor f; + opmath_symmetric_gpu_kernel_with_scalars(iter, f); + }); +} + +REGISTER_PRIVATEUSE1_DISPATCH(bitwise_and_stub, &bitwise_and_kernel_zoom); +REGISTER_PRIVATEUSE1_DISPATCH(bitwise_or_stub, &bitwise_or_kernel_zoom); +REGISTER_PRIVATEUSE1_DISPATCH(bitwise_xor_stub, &bitwise_xor_kernel_zoom); + + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/DistributionExponentialKernel.cu b/aten/src/ATen/native/zoom/DistributionExponentialKernel.cu new file mode 100644 index 00000000000000..2dd9cece286995 --- /dev/null +++ b/aten/src/ATen/native/zoom/DistributionExponentialKernel.cu @@ -0,0 +1,16 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include + + +namespace at::native { + +void exponential_kernel(TensorIteratorBase& iter, double lambda, std::optional gen) { + auto generator = get_generator_or_default(gen, zoom::detail::getDefaultZoomGenerator()); + at::native::templates::zoom::exponential_kernel(iter, lambda, generator); +} + +REGISTER_PRIVATEUSE1_DISPATCH(exponential_stub, &exponential_kernel); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/DistributionTemplates.h b/aten/src/ATen/native/zoom/DistributionTemplates.h new file mode 100644 index 00000000000000..24981a26aa817b --- /dev/null +++ b/aten/src/ATen/native/zoom/DistributionTemplates.h @@ -0,0 +1,671 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +namespace { + +// launch bounds used for kernels utilizing TensorIterator +const uint32_t 
block_size_bound = 256; +const uint32_t grid_size_bound = 4; +// number of randoms given by distributions like hiprand_uniform4, hiprand_uniform2_double +// used in calculating philox offset. +const uint32_t hiprand4_engine_calls = 4; + +// utility function that calculates proper philox_offset +// for distributions utilizing TensorIterator. For distributions using +// TensorIterator, we are using a grid-stride loop with each +// thread yielding one element per thread. For the edge of the grid-stride +// loop, if the tensor size is large, the unroll loop will kick in and the float4 +// from hiprand4 will start getting utilized (for common tensor sizes, we end up +// using rand.x from each thread). Hence, the philox_offset is +// (number of elements per thread * number of engine calls), which makes +// sure that philox offset increment is not less than the number of randoms used +// in each thread. +std::tuple calc_execution_policy(int64_t total_elements) { + const uint64_t numel = static_cast(total_elements); + const uint32_t block_size = block_size_bound; + const uint32_t unroll = hiprand4_engine_calls; + dim3 dim_block(block_size); + dim3 grid((numel + block_size - 1) / block_size); + uint32_t blocks_per_sm = at::zoom::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / block_size; + grid.x = std::min( + static_cast(at::zoom::getCurrentDeviceProperties()->multiProcessorCount) * blocks_per_sm, + grid.x); + //number of times random will be generated per thread, to offset philox counter in thc random state + uint64_t counter_offset = ((numel - 1) / (block_size * grid.x * unroll) + 1) + * hiprand4_engine_calls; + return std::make_tuple(counter_offset, grid, dim_block); +} + +// grid stride loop kernel for distributions +template +C10_LAUNCH_BOUNDS_2(block_size_bound, grid_size_bound) +__global__ void distribution_elementwise_grid_stride_kernel(int numel, + PhiloxHIPState philox_args, + const dist_t dist_func, + const transform_t transform_func) { + auto seeds = at::zoom::philox::unpack(philox_args); + int idx = blockIdx.x * blockDim.x + threadIdx.x; + hiprandStatePhilox4_32_10_t state; + hiprand_init(std::get<0>(seeds), + idx, + std::get<1>(seeds), + &state); + + int rounded_size = ((numel - 1)/(blockDim.x * gridDim.x * unroll_factor)+1) * + blockDim.x * gridDim.x * unroll_factor; + for(int linear_index = idx; linear_index < rounded_size; linear_index += blockDim.x * gridDim.x * unroll_factor) { + auto rand = dist_func(&state); + #pragma unroll + for (int ii = 0; ii < unroll_factor; ii++) { + int li = linear_index + blockDim.x * gridDim.x * ii; + if (li < numel) { + transform_func(li, static_cast((&rand.x)[ii])); + } + } + __syncthreads(); + } +} + +/** + * distribution_nullary_kernel is analogous to gpu_kernel in + * ATen/native/cuda/Loops.cuh. Like gpu_kernel, it uses + * TensorIterator to launch a kernel. However, the differences are + * - it launches a grid-stride loop based kernel. The kernel is not + * generic like elementwise_kernel in Loops.cuh and is specialized + * for the distribution kernels here. + * - For big size tensors, we can launch multiple kernels recursively + * (i.e. if (!iter.can_use_32bit_indexing())) and hence, the philox + * offset calculation is done in this function. + * + * FIXME: Can we specialize elementwise_kernel and launch_kernel in Loops.cuh + * to have grid-stride loop kernel and then use that to launch our distribution + * kernels? 
Note that we need a grid-stride loop kernel because, we found by testing + * that it achieves peak effective bandwidth. + */ +template +void distribution_nullary_kernel(at::TensorIteratorBase& iter, + RNG gen, + const dist_t& dist_func, + const transform_t transform_func) { + static_assert(unroll_factor >= 1, "unroll_factor must be >= 1."); + int64_t numel = iter.numel(); + if (numel == 0) { + return; + } + + auto execution_policy = calc_execution_policy(numel); + auto counter_offset = std::get<0>(execution_policy); + auto grid = std::get<1>(execution_policy); + auto block = std::get<2>(execution_policy); + PhiloxHIPState rng_engine_inputs; + { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + rng_engine_inputs = gen->philox_hip_state(counter_offset); + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + distribution_nullary_kernel(sub_iter, + gen, dist_func, transform_func); + } + return; + } + + char* out_data = (char*)iter.data_ptr(0); + + auto stream = c10::zoom::getCurrentZoomStream(); + if (iter.is_trivial_1d()) { + auto strides = iter.get_inner_strides(); + int stride0 = strides[0]; + distribution_elementwise_grid_stride_kernel<<>>( + numel, + rng_engine_inputs, + dist_func, + [=]__device__(int idx, accscalar_t rand) { + scalar_t* out = (scalar_t*)&out_data[stride0 * idx]; + *out = transform_func(rand); + } + ); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } else { + auto offset_calc = make_offset_calculator<1>(iter); + distribution_elementwise_grid_stride_kernel<<>>( + numel, + rng_engine_inputs, + dist_func, + [=]__device__(int idx, accscalar_t rand) { + auto offsets = offset_calc.get(idx); + scalar_t* out = (scalar_t*)&out_data[offsets[0]]; + *out = transform_func(rand); + } + ); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } +} + +// Binary kernel +template +__global__ void distribution_binary_elementwise_kernel( + int numel, + func_t f, + PhiloxHIPState philox_args, + typename function_traits::result_type *output_data, + const typename function_traits::template arg<1>::type *input_data_1, + const typename function_traits::template arg<2>::type *input_data_2, + inp_offset_calc_t inp_calc, + out_offset_calc_t out_calc) { + auto seeds = at::zoom::philox::unpack(philox_args); + + using input_t_1 = typename function_traits::template arg<1>::type; + using input_t_2 = typename function_traits::template arg<2>::type; + + input_t_1 inputs_1[thread_work_size()]; + input_t_2 inputs_2[thread_work_size()]; + + int base_index = block_work_size() * blockIdx.x; + int remaining = std::min(numel - base_index, block_work_size()); + + hiprandStatePhilox4_32_10_t state; + hiprand_init(std::get<0>(seeds), + blockIdx.x * blockDim.x + threadIdx.x, + std::get<1>(seeds), + &state); + + // load data into registers + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < thread_work_size(); i++) { + if (thread_idx >= remaining) { + break; + } + int input_idx = thread_idx + base_index; + auto offsets = inp_calc.get(input_idx); + inputs_1[i] = input_data_1[offsets[0]]; + inputs_2[i] = input_data_2[offsets[1]]; + + thread_idx += num_threads(); + } + + // compute and store + thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < thread_work_size(); i++) { + if (thread_idx >= remaining) { + break; + } + int input_idx = thread_idx + base_index; + auto offsets = out_calc.get(input_idx); + output_data[offsets[0]] = f(state, inputs_1[i], inputs_2[i]); + thread_idx += num_threads(); + } +} + +template +void 
distribution_binary_kernel(TensorIteratorBase &iter, PhiloxHIPState philox_args, const func_t &f) { + static_assert(std::is_same::template arg<0>::type, hiprandStatePhilox4_32_10_t&>::value, "the first argument of functor must be hiprandStatePhilox4_32_10_t"); + using input_t_1 = typename function_traits::template arg<1>::type; + using input_t_2 = typename function_traits::template arg<2>::type; + using output_t = typename function_traits::result_type; + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + distribution_binary_kernel(sub_iter, philox_args, f); + } + return; + } + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(iter.can_use_32bit_indexing()); + + int64_t numel = iter.numel(); + if (numel == 0) { + return; + } + + output_t *output_data = static_cast(iter.data_ptr(0)); + const input_t_1 *input_data_1 = static_cast(iter.data_ptr(1)); + const input_t_2 *input_data_2 = static_cast(iter.data_ptr(2)); + + int64_t grid = (numel + block_work_size() - 1) / block_work_size(); + auto stream = c10::zoom::getCurrentZoomStream(); + + if (iter.is_contiguous()) { + distribution_binary_elementwise_kernel<<>>( + numel, f, philox_args, output_data, input_data_1, input_data_2, + TrivialOffsetCalculator<2>(), TrivialOffsetCalculator<1>()); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } else { + distribution_binary_elementwise_kernel<<>>( + numel, f, philox_args, output_data, input_data_1, input_data_2, + make_input_offset_calculator<2>(iter), make_output_offset_calculator(iter)); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } +} + +} // namespace +}} // namespace at::native + + +namespace at { +namespace native { +namespace templates { +namespace zoom { + +// ==================================================== Random ======================================================== + +template +void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, RNG gen) { + AT_DISPATCH_V2(iter.dtype(), "random_from_to_kernel_zoom", AT_WRAP([&] { + if (( + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value) && range >= 1ULL << 32) + { + // define lambda to mod with range and add base + auto random_func = [range, base] __device__ (uint64_t rand) { + return transformation::uniform_int_from_to(rand, range, base); + }; + distribution_nullary_kernel(iter, + gen, + [] __device__ (hiprandStatePhilox4_32_10_t* state) -> ulonglong2 { + ulonglong2 ret; + uint4 rand_val = hiprand4(state); + ret.x = (static_cast(rand_val.x) << 32) | rand_val.y; + ret.y = (static_cast(rand_val.z) << 32) | rand_val.w; + return ret; + }, + random_func); + } else { + auto random_func = [range, base] __device__ (uint32_t rand) { + return transformation::uniform_int_from_to(rand, range, base); + }; + distribution_nullary_kernel(iter, + gen, + [] __device__ (hiprandStatePhilox4_32_10_t* state) { + return hiprand4(state); + }, + random_func); + } + }), AT_EXPAND(AT_ALL_TYPES), kBool, kHalf, kBFloat16, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); +} + +// This is the special kernel to handle single specific case: +// from(inclusive) = std::numeric_limits::lowest() +// to(exclusive) = None (= std::numeric_limits::max() + 1) +template +void random_full_64_bits_range_kernel(TensorIteratorBase& iter, RNG gen) { + AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::BFloat16, iter.dtype(), "random_full_64_bits_range_kernel_zoom", [&] { + if (std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value) { + auto random_func = [] __device__ (uint64_t rand) { + 
return transformation::uniform_int_full_range(rand); + }; + distribution_nullary_kernel(iter, + gen, + [] __device__ (hiprandStatePhilox4_32_10_t* state) -> ulonglong2 { + ulonglong2 ret; + uint4 rand_val = hiprand4(state); + ret.x = (static_cast(rand_val.x) << 32) | rand_val.y; + ret.y = (static_cast(rand_val.z) << 32) | rand_val.w; + return ret; + }, + random_func); + } else { + TORCH_CHECK(false, "random_full_64_bits_range_kernel_zoom handles only int64, double, float and bfloat16"); + } + }); +} + +template +struct RandomFromToKernel { + void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional gen) { + random_from_to_kernel(iter, range, base, check_generator(gen)); + } + void operator()(TensorIteratorBase& iter, std::optional gen) { + random_full_64_bits_range_kernel(iter, check_generator(gen)); + } +}; + +template +void random_kernel(TensorIteratorBase& iter, RNG gen) { + AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, iter.dtype(), "random_kernel_zoom", [&] { + if (std::is_same::value || std::is_same::value) { + auto random_func = [] __device__ (uint64_t rand) { + return transformation::uniform_int(rand); + }; + distribution_nullary_kernel(iter, gen, + [] __device__ (hiprandStatePhilox4_32_10_t* state) -> ulonglong2 { + ulonglong2 ret; + uint4 rand_val = hiprand4(state); + ret.x = (static_cast(rand_val.x) << 32) | rand_val.y; + ret.y = (static_cast(rand_val.z) << 32) | rand_val.w; + return ret; + }, + random_func); + } else { + auto random_func = [] __device__ (uint32_t rand) { + return transformation::uniform_int(rand); + }; + distribution_nullary_kernel(iter, + gen, + [] __device__ (hiprandStatePhilox4_32_10_t* state) { + return hiprand4(state); + }, + random_func); + } + }); +} + +template +struct RandomKernel { + void operator()(TensorIteratorBase& iter, RNG gen) { + random_kernel(iter, gen); + } +}; + +// ==================================================================================================================== + +template +void uniform_and_transform(TensorIteratorBase& iter, RNG gen, transform_t transform) { + if (std::is_same::value) { + distribution_nullary_kernel(iter, + gen, + [] __device__ (hiprandStatePhilox4_32_10_t* state) { return hiprand_uniform2_double(state); }, + transform); + } else { + distribution_nullary_kernel(iter, + gen, + [] __device__ (hiprandStatePhilox4_32_10_t* state) { return hiprand_uniform4(state); }, + transform); + } +} + +template +void normal_and_transform(TensorIteratorBase& iter, RNG gen, transform_t transform) { + if (std::is_same::value) { + distribution_nullary_kernel(iter, + gen, + [] __device__ (hiprandStatePhilox4_32_10_t* state) { return hiprand_normal2_double(state); }, + transform); + } else { + distribution_nullary_kernel(iter, + gen, + [] __device__ (hiprandStatePhilox4_32_10_t* state) { return hiprand_normal4(state); }, + transform); + } +} + +// ==================================================== Normal ======================================================== + +template +void normal_kernel(const TensorBase &self, double mean_, double std_, RNG gen) { + auto iter = TensorIterator::borrowing_nullary_op(self); + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "normal_kernel_zoom", [&] { + using accscalar_t = at::acc_type; + auto mean = static_cast(mean_); + auto std = static_cast(std_); + // define lambda to multiply std and add mean + auto normal_func = [mean, std] __device__ (accscalar_t rand) { + 
return static_cast(transformation::normal(rand, mean, std)); + }; + normal_and_transform(iter, gen, normal_func); + }); +} + +template +struct NormalKernel { + void operator()(const TensorBase &self, double mean, double std, std::optional gen) { + normal_kernel(self, mean, std, check_generator(gen)); + } +}; + +// ==================================================== Uniform ======================================================== + +template +void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gen) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "uniform_kernel_zoom", [&] { + auto from = static_cast(from_); + auto to = static_cast(to_); + using opmath_t = at::opmath_type; + auto range = static_cast(to-from); + // define lambda to reverse bounds, multiply 'range' and add 'from_' + auto uniform_func = [range, from, to] __device__ (opmath_t rand) { + // Compute output value before reversing the bounds + // BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/96947 + auto value = static_cast(rand * range + from); + // reverse the bounds of hiprand4 from (0, 1] to [0, 1) + // Note that this method is from legacy THCTensorRandom and is likely to give + // you more 0-s, since, the probability of gettings 1-s is higher than 0-s and + // by reversing the bounds, we are flipping the probabilities of 1-s and 0-s. + // BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/16706 + auto reverse_bound_value = value == to ? from : value; + return reverse_bound_value; + }; + uniform_and_transform(iter, gen, uniform_func); + }); +} + +template +struct UniformKernel { + void operator()(TensorIteratorBase& iter, double from, double to, std::optional gen) { + uniform_kernel(iter, from, to, check_generator(gen)); + } +}; + +// ================================================== LogNormal ======================================================= + +template +void log_normal_kernel(TensorIteratorBase& iter, double mean_, double std_, RNG gen) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "log_normal_zoom", [&] { + using accscalar_t = at::acc_type; + auto mean = static_cast(mean_); + auto std = static_cast(std_); + // define lambda for log_normal transformation + auto log_normal_func = [mean, std] __device__ (accscalar_t rand) { + return static_cast(transformation::log_normal(transformation::normal(rand, mean, std))); + }; + normal_and_transform(iter, gen, log_normal_func); + }); +} + +template +struct LogNormalKernel { + void operator()(TensorIteratorBase& iter, double mean, double std, std::optional gen) { + log_normal_kernel(iter, mean, std, check_generator(gen)); + } +}; + +// =================================================== Geometric ====================================================== + +template +void geometric_kernel(TensorIteratorBase& iter, double p, RNG gen) { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "geometric_zoom", [&] { + using accscalar_t = at::DiscreteDistributionType::type; + // define lambda for geometric transformation + auto geometric_func = [p] __device__ (accscalar_t rand) { + return static_cast(transformation::geometric(rand, p)); + }; + uniform_and_transform(iter, gen, geometric_func); + }); +} + +template +struct GeometricKernel { + void operator()(TensorIteratorBase& iter, double p, std::optional gen) { + geometric_kernel(iter, p, check_generator(gen)); + } +}; + +// 
================================================== Exponential ===================================================== + +template +void exponential_kernel(TensorIteratorBase& iter, double lambda_, RNG gen) { + TORCH_CHECK(isFloatingType(iter.dtype()), "Exponential distribution is a continuous probability distribution. dtype must be a floating point but you specified ", iter.dtype()); + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "exponential_zoom", [&] { + using accscalar_t = at::acc_type; + auto lambda = static_cast(lambda_); + // define lambda for exponential transformation + auto exponential_func = [lambda] __device__ (accscalar_t rand) { + return static_cast(transformation::exponential(rand, lambda)); + }; + uniform_and_transform(iter, gen, exponential_func); + }); +} + +template +struct ExponentialKernel { + void operator()(TensorIteratorBase& iter, double lambda, std::optional gen) { + exponential_kernel(iter, lambda, check_generator(gen)); + } +}; + +// ==================================================== Cauchy ======================================================== + +template +void cauchy_kernel(TensorIteratorBase& iter, double median_, double sigma_, RNG gen) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "cauchy_zoom", [&] { + using accscalar_t = at::acc_type; + auto median = static_cast(median_); + auto sigma = static_cast(sigma_); + // define lambda for cauchy transformation + auto cauchy_func = [median, sigma] __device__ (accscalar_t rand) { + return static_cast(transformation::cauchy(rand, median, sigma)); + }; + uniform_and_transform(iter, gen, cauchy_func); + }); +} + +template +struct CauchyKernel { + void operator()(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { + cauchy_kernel(iter, median, sigma, check_generator(gen)); + } +}; + +// ==================================================== Bernoulli ===================================================== + +template +void bernoulli_tensor_zoom_kernel( + const TensorBase &ret, const at::TensorBase &p, + PhiloxHIPState philox_args) { + auto functor = [philox_args] __device__( + int n, scalar_t& v1, scalar_t& v2, scalar_t& v3, scalar_t& v4, + const prob_t& p1, const prob_t& p2, const prob_t& p3, const prob_t& p4) { + auto seeds = at::zoom::philox::unpack(philox_args); + hiprandStatePhilox4_32_10_t state; + hiprand_init(std::get<0>(seeds), + blockIdx.x * blockDim.x + threadIdx.x, + std::get<1>(seeds), + &state); + + // See Note [Register spilling in curand call for CUDA < 10] + float4 rand = hiprand_uniform4(&state); + switch (n) { + case 4: { + ZOOM_KERNEL_ASSERT(0 <= p4 && p4 <= 1); + v4 = static_cast(rand.w <= p4); + // fallthrough + } + case 3: { + ZOOM_KERNEL_ASSERT(0 <= p3 && p3 <= 1); + v3 = static_cast(rand.z <= p3); + // fallthrough + } + case 2: { + ZOOM_KERNEL_ASSERT(0 <= p2 && p2 <= 1); + v2 = static_cast(rand.y <= p2); + // fallthrough + } + case 1: { + ZOOM_KERNEL_ASSERT(0 <= p1 && p1 <= 1); + v1 = static_cast(rand.x <= p1); + } + } + }; + // The template argument `4` below indicates that we want to operate on four + // element at each time. See NOTE [ CUDA_tensor_applyN helpers ] for details. 
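+ // Concretely, Zoom_tensor_apply2 hands each participating thread up to four (ret, p) element
+ // pairs together with the count `n` of valid pairs, so a single hiprand_uniform4 call yields
+ // one uniform sample per pair and the switch above falls through to write exactly `n` outputs.
+ // Roughly the following chunking (illustrative sketch, not the actual apply implementation):
+ //   for (int64_t base = first_elem_of_this_thread; base < numel; base += total_stride) {
+ //     int n = min<int64_t>(4, numel - base);
+ //     functor(n, v1, v2, v3, v4, p1, p2, p3, p4);  // only the first n of v1..v4 are written
+ //   }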
+ at::zoom::Zoom_tensor_apply2(ret, p, functor); +} + +template +void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG gen) { + PhiloxHIPState rng_engine_inputs; + { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + rng_engine_inputs = gen->philox_hip_state(10); + } + TORCH_CHECK(at::isFloatingType(p_.scalar_type()), "expected probabilities tensor to have floating type, got ", p_.scalar_type()); + // cast probabilities tensor to double for double `self` tensor, and to `float` for everything else + const auto p_type = self.dtype() == at::kDouble ? at::kDouble : at::kFloat; + auto p_zoom = p_.to(TensorOptions().device(self.device()).dtype(p_type)); + auto p = expand_inplace(self, p_zoom); + AT_DISPATCH_ALL_TYPES_AND3( + at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, self.scalar_type(), "bernoulli_tensor_zoom_self_", [&] { + if (std::is_same::value) { + return bernoulli_tensor_zoom_kernel(self, *p, rng_engine_inputs); + } else { + return bernoulli_tensor_zoom_kernel(self, *p, rng_engine_inputs); + } + }); +} + +template +void bernoulli_kernel(TensorIteratorBase& iter, double p, RNG gen) { + AT_DISPATCH_ALL_TYPES_AND3( + at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, iter.dtype(), "bernoulli_scalar_zoom_", [&] { + using accscalar_t = at::DiscreteDistributionType::type; + // define lambda for bernoulli transformation + auto bernoulli_func = [p] __device__ (accscalar_t rand) { + return static_cast(transformation::bernoulli(rand, p)); + }; + uniform_and_transform(iter, gen, bernoulli_func); + }); +} + +template +struct BernoulliKernel { + void operator()(TensorIteratorBase& iter, double p, std::optional gen) { + bernoulli_kernel(iter, p, check_generator(gen)); + } + void operator()(const TensorBase &self, const TensorBase &p_, std::optional gen) { + bernoulli_kernel(self, p_, check_generator(gen)); + } +}; + +}}}} \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/IndexKernel.cu b/aten/src/ATen/native/zoom/IndexKernel.cu new file mode 100644 index 00000000000000..3df2d1bb120407 --- /dev/null +++ b/aten/src/ATen/native/zoom/IndexKernel.cu @@ -0,0 +1,463 @@ +#include +// #define TORCH_ASSERT_NO_OPERATORS +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at::native { + +static constexpr int launch_bound2 = 4; + +static constexpr int launch_size_nd = 128; + +template +C10_LAUNCH_BOUNDS_2(nt, launch_bound2) +__global__ void index_elementwise_kernel(const int64_t N, const func_t f) { + const auto tid = threadIdx.x; + const auto nv = nt * vt; + auto idx = nv * blockIdx.x + tid; + #pragma unroll + for (int i = 0; i < vt; i++) { + if (idx < N) { + f(idx); + idx += nt; + } + } +} + +template +static void launch_kernel(const int64_t N, const func_t& f) { + TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits::max()); + if (N == 0) { + return; + } + const dim3 block(nt); + const dim3 grid((N + block.x * vt - 1) / (block.x * vt)); + const auto stream = c10::zoom::getCurrentZoomStream(); + hipLaunchKernelGGL(( index_elementwise_kernel), dim3(grid), dim3(block), 0, stream, N, f); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); +} + +template +void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, const IntArrayRef index_stride, const func_t& f) { + const auto num_indices = index_size.size(); + AT_ASSERT(num_indices == index_stride.size()); + 
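+  // Overview for readers: the TensorIterator carries (output, input, index_0..index_{n-1});
+  // for every output element the lambda below gathers the n index values, wraps negative
+  // entries, and folds them into a single byte offset
+  //   offset = sum_i index_i * index_stride[i]
+  // which `f` then applies to the raw out/in pointers. launch_kernel dispatches it with nt
+  // threads per block and vt elements per thread, i.e. grid = ceil(numel / (nt * vt)); assuming
+  // the constants above are the instantiation values (nt = launch_size_nd = 128,
+  // vt = launch_bound2 = 4), thread 0 of block 0 visits linear indices 0, 128, 256, 384.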
AT_ASSERT(static_cast(num_indices) == iter.ntensors() - 2); + + if (iter.numel() == 0) { + return; + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + gpu_index_kernel(sub_iter, index_size, index_stride, f); + } + return; + } + + auto sizes = at::detail::Array(0); + auto strides = at::detail::Array(0); + auto index_ptrs = at::detail::Array(nullptr); + for (unsigned i = 0; i < num_indices; i++) { + sizes[i] = index_size[i]; + strides[i] = index_stride[i]; + index_ptrs[i] = (char*)iter.data_ptr(i + 2); + } + + char* const out_ptr = static_cast(iter.data_ptr(0)); + char* const in_ptr = static_cast(iter.data_ptr(1)); + + auto offset_calc = make_offset_calculator<3>(iter); + launch_kernel(iter.numel(), [=]__device__(int idx) { + const auto offsets = offset_calc.get(idx); + char* const out_data = out_ptr + offsets[0]; + const char* const in_data = in_ptr + offsets[1]; + + int64_t offset = 0; + #pragma unroll + for (int i = 0; i < num_indices; i++) { + int64_t index = *reinterpret_cast(index_ptrs[i] + offsets[2]); + ZOOM_KERNEL_ASSERT(-sizes[i] <= index && index < sizes[i] && "index out of bounds"); + if (index < 0) { + index += sizes[i]; + } + offset += index * strides[i]; + } + + f(out_data, in_data, offset); + }); +} + +// The kernels are templated on an opaque, self-aligned type of the correct +// size to avoid redundant kernels for different types of the same size. +template struct alignas(N) OpaqueType { char data[N]; }; + +template +void index_fill_kernel_impl( + TensorIterator& iter, + const int64_t dim, + const int64_t self_dim_size, + const int64_t self_dim_stride, + const scalar_t fill_val) { + if (0 == iter.numel()) { + return; + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + index_fill_kernel_impl(sub_iter, dim, self_dim_size, self_dim_stride, fill_val); + } + return; + } + + char* const __restrict__ self_ptr = reinterpret_cast(iter.data_ptr(0)); + char* const __restrict__ idx_ptr = reinterpret_cast(iter.data_ptr(1)); + + const auto offset_calc = make_offset_calculator<2>(iter); + + const auto loop = [=]C10_DEVICE(int i) { + const auto offsets = offset_calc.get(i); + + auto* __restrict__ self_data = reinterpret_cast(self_ptr + offsets[0]); + auto idx = *reinterpret_cast(idx_ptr + offsets[1]); + ZOOM_KERNEL_ASSERT(idx >= -self_dim_size && idx < self_dim_size && "index out of bounds"); + if (idx < 0) { + idx += self_dim_size; + } + + self_data[idx * self_dim_stride] = fill_val; + }; + launch_kernel(iter.numel(), loop); +} + +template +void index_copy_kernel_impl( + TensorIterator& iter, + const int64_t dim, + const int64_t self_dim_size, + const int64_t self_dim_stride) { + if (iter.numel() == 0) { + return; + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + index_copy_kernel_impl(sub_iter, dim, self_dim_size, self_dim_stride); + } + return; + } + + char* const __restrict__ self_ptr = reinterpret_cast(iter.data_ptr(0)); + char* const __restrict__ idx_ptr = reinterpret_cast(iter.data_ptr(1)); + char* const __restrict__ source_ptr = reinterpret_cast(iter.data_ptr(2)); + + const auto offset_calc = make_offset_calculator<3>(iter); + + const auto loop = [=]C10_DEVICE(int i) { + const auto offsets = offset_calc.get(i); + + auto* const __restrict__ self_data = reinterpret_cast(self_ptr + offsets[0]); + auto idx = *reinterpret_cast(idx_ptr + offsets[1]); + const auto* const __restrict__ source_data = reinterpret_cast(source_ptr + offsets[2]); 
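+    // Unlike gpu_index_kernel and index_fill_kernel_impl above, negative
+    // indices are not wrapped here; anything outside [0, self_dim_size)
+    // trips the device assert below.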
+ ZOOM_KERNEL_ASSERT(idx >= 0 && idx < self_dim_size && "index_copy_(): index out of bounds"); + + self_data[idx * self_dim_stride] = *source_data; + }; + launch_kernel(iter.numel(), loop); +} + +template +void index_kernel_impl(TensorIteratorBase& iter, const IntArrayRef index_size, const IntArrayRef index_stride) { + gpu_index_kernel(iter, index_size, index_stride, []C10_DEVICE(char* const out_data, const char* const in_data, const int64_t offset) { + *reinterpret_cast(out_data) = *reinterpret_cast(in_data + offset); + }); +} + +template +void index_put_kernel_impl(TensorIterator& iter, const IntArrayRef index_size, const IntArrayRef index_stride) { + gpu_index_kernel(iter, index_size, index_stride, []C10_DEVICE(char* const out_data, const char* const in_data, const int64_t offset) { + *reinterpret_cast(out_data + offset) = *reinterpret_cast(in_data); + }); +} + +static void index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, const IntArrayRef index_stride) { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kHalf, kBool, kBFloat16, iter.dtype(), "index_zoom", [&] { + using dtype = OpaqueType; + index_kernel_impl(iter, index_size, index_stride); + }); +} + +static void index_fill_kernel( + TensorIterator& iter, + const int64_t dim, + const int64_t self_dim_size, + const int64_t self_dim_stride, + const Scalar& source) { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, kComplexHalf, + iter.dtype(), "index_fill_zoom", [&] { + using dtype = OpaqueType; + const auto fill_val = source.to(); + const auto fill_val_opaque = *reinterpret_cast(&fill_val); + index_fill_kernel_impl(iter, dim, self_dim_size, self_dim_stride, fill_val_opaque); + }); +} + +static void index_copy_kernel( + TensorIterator& iter, + const int64_t dim, + const int64_t self_dim_size, + const int64_t self_dim_stride) { + // See note [Writing Nondeterministic Operations] + // Nondeterministic when index contains duplicate entries + // this kernel will not be called when torch.use_deterministic_algorithms(True) + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, kComplexHalf, + iter.dtype(), "index_copy_zoom", [&] { + using dtype = OpaqueType; + index_copy_kernel_impl(iter, dim, self_dim_size, self_dim_stride); + }); +} + +static void index_put_kernel(TensorIterator& iter, const IntArrayRef index_size, const IntArrayRef index_stride, const bool accumulate) { + TORCH_CHECK(!accumulate, "index_put does not support accumulate=true"); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kHalf, kBool, kBFloat16, iter.dtype(), "index_put", [&] { + using dtype = OpaqueType; + index_put_kernel_impl(iter, index_size, index_stride); + }); +} + +void index_put_kernel_quantized_zoom(TensorIterator& iter, const IntArrayRef index_size, const IntArrayRef index_stride, const bool accumulate, const double scale, const int zero_point) { + TORCH_CHECK(!accumulate, "index_put does not support accumulate=true"); + AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES(iter.dtype(), "index_put", [&] { + constexpr int64_t qmin = std::numeric_limits::min(); + constexpr int64_t qmax = std::numeric_limits::max(); + const float inv_scale = 1.0f / static_cast(scale); + + gpu_index_kernel(iter, index_size, index_stride, [inv_scale, zero_point, qmin, qmax]C10_DEVICE(char* const out_data, const char* const in_data, const int64_t offset) { + int64_t qvalue = static_cast(zero_point + nearbyintf(*(float*)in_data * inv_scale)); + 
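+      // Affine quantization of the gathered float value, roughly
+      //   q = clamp(round(x / scale) + zero_point, qmin, qmax),
+      // with the division folded into the precomputed inv_scale.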
qvalue = std::clamp(qvalue, qmin, qmax); + *(scalar_t*)(out_data + offset) = static_cast(qvalue); + }); + }); +} + +template +void zoom_take_put_kernel( + TensorIterator& iter, + const TensorBase& indexed, + const func_t& f) { + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + zoom_take_put_kernel(sub_iter, indexed, f); + } + return; + } + + const auto numel = indexed.numel(); + const bool is_contiguous = indexed.is_contiguous(); + + char* const __restrict__ iterated_ptr = reinterpret_cast(iter.data_ptr(0)); + char* const __restrict__ idx_ptr = reinterpret_cast(iter.data_ptr(1)); + + const auto offset_calc = make_offset_calculator<2>(iter); + using uindex_t = std::make_unsigned_t; + + // OffsetCalculator needs the sizes and strides reveresed + const auto indexed_sizes = std::vector(indexed.sizes().rbegin(), indexed.sizes().rend()); + const auto indexed_strides = std::vector(indexed.strides().rbegin(), indexed.strides().rend()); + const auto* indexed_strides_data = indexed_strides.data(); + const auto offset_indexed = OffsetCalculator<1, uindex_t>(indexed.dim(), + indexed_sizes.data(), + &indexed_strides_data); + + const auto loop = [=]C10_DEVICE(int i) { + const auto offsets = offset_calc.get(i); + + auto& iterated = *reinterpret_cast(iterated_ptr + offsets[0]); + const auto idx = *reinterpret_cast(idx_ptr + offsets[1]); + ZOOM_KERNEL_ASSERT(idx < numel && idx >= -numel && "zoom_take_put_kernel() index out of bounds"); + index_t offset = static_cast(idx); + if (offset < 0) { + offset += numel; + } + if (!is_contiguous) { + offset = offset_indexed.get(offset)[0]; + } + + f(iterated, offset); + }; + launch_kernel(iter.numel(), loop); +} + +void put_kernel(TensorIterator& iter, const TensorBase& output, const bool accumulate) { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, iter.dtype(), "put_zoom", [&] { + // Cannot use `OpaqueType`, as we need the actual type for `fastSpecializedgpuAtomicAdd` + AT_DISPATCH_INDEX_TYPES(zoom::detail::canUse32BitIndexMath(output) ? ScalarType::Int : ScalarType::Long, + "put_zoom_index", [&] { + auto* __restrict__ indexed_ptr = output.template data_ptr(); + if (accumulate) { + index_t numel = output.numel(); + zoom_take_put_kernel(iter, output, + [numel, indexed_ptr] __device__(scalar_t& iterated, const index_t offset) { + fastSpecializedAtomicAdd(indexed_ptr, offset, numel, iterated); + }); + } + else { + zoom_take_put_kernel(iter, output, + [indexed_ptr] __device__(scalar_t& iterated, const index_t offset) { + indexed_ptr[offset] = iterated; + }); + } + }); + }); +} + +void take_kernel( + TensorIterator& iter, + const TensorBase& input) { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, iter.dtype(), "take_zoom", [&] { + // Cannot use `OpaqueType`, as Tensor::data_ptr> is not implemented + AT_DISPATCH_INDEX_TYPES(zoom::detail::canUse32BitIndexMath(input) ? 
ScalarType::Int : ScalarType::Long, + "take_zoom_index", [&] { + const auto* __restrict__ indexed_ptr = input.template const_data_ptr(); + zoom_take_put_kernel(iter, input, + [indexed_ptr] __device__(scalar_t& iterated, const index_t offset) { + iterated = indexed_ptr[offset]; + }); + }); + }); +} + +namespace { + +__global__ void masked_scatter_size_check( + const int64_t* const mask_exclusive_sum, + const bool* const mask, + const int64_t srcSize) { + // Convert exclusive sum to inclusive sum + const auto totalElements = *mask_exclusive_sum + *mask; + ZOOM_KERNEL_ASSERT(totalElements <= srcSize); +} + +} // anonymous namespace + +void launch_masked_scatter_kernel( + const TensorBase &self, const TensorBase &mask, + const TensorBase &maskPrefixSum, const TensorBase &source) { + const auto srcSize = source.numel(); + const auto mask_cont = mask.contiguous(); + const auto mask_numel = mask.numel(); + + // Use a prefix sum to determine the output locations of the masked elements + auto maskPrefixSum_data = maskPrefixSum.mutable_data_ptr(); + auto mask_data = mask_cont.const_data_ptr(); + + at::zoom::hipcub::mask_exclusive_sum( + mask_data, maskPrefixSum_data, mask_numel); + + // Asynchronously check that the number of `1` elements present in the mask + // must be <= the number of elements available in `src`. + hipLaunchKernelGGL(( masked_scatter_size_check), dim3(1), dim3(1), 0, c10::zoom::getCurrentZoomStream(), + &maskPrefixSum_data[mask_numel - 1], &mask_data[mask_numel - 1], srcSize); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + + // We are getting elements from `src` based on an offset from + // `maskPrefixSum`, so that should be made contiguous too + auto source_contig = source.contiguous(); + + auto iter = TensorIteratorConfig() + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .resize_outputs(false) + .add_output(self) + .add_input(self) + .add_const_input(mask_cont) + .add_input(maskPrefixSum) + .build(); + + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + ScalarType::Bool, + ScalarType::BFloat16, + ScalarType::Half, + self.scalar_type(), + "masked_scatter_", + [&]() { + auto source_ptr = source_contig.const_data_ptr(); + gpu_kernel( + iter, [=] GPU_LAMBDA(const scalar_t a, const bool mask, const int64_t maskPrefixSum) -> scalar_t { + if (mask) { + return source_ptr[maskPrefixSum]; + } + return a; + }); + C10_ZOOM_CHECK(hipGetLastError()); + }); +} + +template +void flip_kernel_impl(TensorIterator& iter) { + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + flip_kernel_impl(sub_iter); + } + return; + } + + char* const __restrict__ out_ptr = reinterpret_cast(iter.data_ptr(0)); + const char* const __restrict__ in_ptr = reinterpret_cast(iter.data_ptr(1)); + + const auto offset_calc = make_offset_calculator<2, /*signed_strides=*/true>(iter); + + const auto loop = [=]C10_DEVICE(const int i) { + const auto offsets = offset_calc.get(i); + // offsets can be negative here, but it's fine + scalar_t* const __restrict__ out_data = reinterpret_cast(out_ptr + offsets[0]); + const scalar_t* const __restrict__ in_data = reinterpret_cast(in_ptr + offsets[1]); + *out_data = *in_data; + }; + launch_kernel(iter.numel(), loop); +} + +void flip_kernel(TensorIterator& iter, const bool quantized) { + if (quantized) { + AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES(iter.dtype(), "flip_quantized_zoom", + [&] { + using dtype = OpaqueType; + flip_kernel_impl(iter); + }); + } else { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Half, at::ScalarType::Bool, 
at::ScalarType::BFloat16, + iter.dtype(), "flip_zoom", + [&] { + using dtype = OpaqueType; + flip_kernel_impl(iter); + }); + } +} + +REGISTER_PRIVATEUSE1_DISPATCH(index_stub, &index_kernel); +REGISTER_PRIVATEUSE1_DISPATCH(index_fill_stub, &index_fill_kernel); +REGISTER_PRIVATEUSE1_DISPATCH(index_copy_stub, &index_copy_kernel); +REGISTER_PRIVATEUSE1_DISPATCH(index_put_stub, &index_put_kernel); +REGISTER_PRIVATEUSE1_DISPATCH(put_stub, &put_kernel); +REGISTER_PRIVATEUSE1_DISPATCH(take_stub, &take_kernel); +REGISTER_PRIVATEUSE1_DISPATCH(flip_stub, &flip_kernel); + +REGISTER_PRIVATEUSE1_DISPATCH(index_put_kernel_quantized_stub, &index_put_kernel_quantized_zoom); + + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/IndexKernel.h b/aten/src/ATen/native/zoom/IndexKernel.h new file mode 100644 index 00000000000000..edd9190deb0dba --- /dev/null +++ b/aten/src/ATen/native/zoom/IndexKernel.h @@ -0,0 +1,16 @@ +#pragma once +#include +#include + +namespace at { +struct TensorIteratorBase; +class TensorBase; +} + +namespace at { +namespace native { +/// @param maskPrefixSum[in,out] +void launch_masked_scatter_kernel( + const TensorBase &self, const TensorBase &mask, + const TensorBase &maskPrefixSum, const TensorBase &source); +}} diff --git a/aten/src/ATen/native/zoom/LaunchUtils.h b/aten/src/ATen/native/zoom/LaunchUtils.h new file mode 100644 index 00000000000000..4d2f35a56a5837 --- /dev/null +++ b/aten/src/ATen/native/zoom/LaunchUtils.h @@ -0,0 +1,18 @@ +#pragma once +#include + +namespace at { +namespace native { + +// returns 2**floor(log2(n)) +static int lastPow2(unsigned int n) { + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); + return std::max(1, n - (n >> 1)); +} + +} // namespace native +} // namespace at \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/MultinomialKernel.cu b/aten/src/ATen/native/zoom/MultinomialKernel.cu new file mode 100644 index 00000000000000..ca9709637cf030 --- /dev/null +++ b/aten/src/ATen/native/zoom/MultinomialKernel.cu @@ -0,0 +1,462 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + +#include +#include +#include + +namespace at::native { + +namespace { + +template < + typename T, + typename = std::enable_if_t< + std::is_floating_point_v || std::is_convertible_v>> +inline __device__ bool _isinf(T x) { + if constexpr (std::is_floating_point_v) { + return ::isinf(x); + } else { + return ::isinf(static_cast(x)); + } +} + +#define MAX_NUM_BLOCKS 200 + +// Normalizes the L1 norm of every row to 1; used by multinomial +template +C10_LAUNCH_BOUNDS_1(zoom::detail::HIP_NUM_THREADS) +__global__ void renormRowsL1(scalar_t* dist, long rows, long cols) { + extern __shared__ unsigned char my_smem[]; + scalar_t *smem = reinterpret_cast(my_smem); + scalar_t zero = static_cast(0); + scalar_t val; + for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) { + scalar_t sum = static_cast(0); + for (int64_t col = threadIdx.x; col < cols; col += blockDim.x) { + val = dist[row * cols + col]; + ZOOM_KERNEL_ASSERT(!(val < zero)); // ! < 0 for NaN handling + sum = sum + val; + } + + sum = zoom_utils::BlockReduceSum(sum, smem); + if (threadIdx.x == 0) { + ZOOM_KERNEL_ASSERT(!(val < zero)); // ! 
< 0 for NaN handling + smem[0] = sum; + } + __syncthreads(); + + sum = smem[0]; + if (sum > zero) { + for (int64_t col = threadIdx.x; col < cols; col += blockDim.x) { + dist[row * cols + col] = dist[row * cols + col] / sum; + } + } + } +} + +void renormRows(Tensor& t) { + TORCH_CHECK(t.dim() == 2); + int64_t rows = t.size(0); + int64_t cols = t.size(1); + + auto props = at::zoom::getCurrentDeviceProperties(); + TORCH_CHECK(props != nullptr); + int numSM = props->multiProcessorCount; + const int64_t maxThreads = std::min( + props->maxThreadsPerBlock, zoom_utils::kHIPBlockReduceMaxThreads); + + int warp_size = at::zoom::warp_size(); + dim3 grid(rows < numSM * 4 ? rows : numSM * 4); + dim3 block(std::min(maxThreads, warp_size * ceil_div(cols, int64_t{warp_size}))); + + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, t.scalar_type(), "renormRows_zoom", [&] { + renormRowsL1 + <<>>(t.mutable_data_ptr(), + rows, cols); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + }); +} + +template +__device__ int binarySearchForMultinomial(const scalar_t* cumdist, + const scalar_t* dist, + int size, + scalar_t val) { + int start = 0; + int end = size; + // cumdist[size - 1] = 0 => all zero prob dist + ZOOM_KERNEL_ASSERT(cumdist[size - 1] > static_cast(0)); + + while (end - start > 0) { + int mid = start + (end - start) / 2; + + scalar_t midVal = cumdist[mid]; + if (midVal < val) { + start = mid + 1; + } else { + end = mid; + } + } + + if (start == size) { + // No probability mass or precision problems; just return the + // first non-zero element by setting start to size-1 here, + // the code below will move it to the last non-zero probability + // this actually can happen when the random number is 1 + // (github pytorch issue #4858). + start = size - 1; + } + + while(start >= 1 && dist[start] == 0) start--; + + return start; +} + +template +__global__ void +sampleMultinomialWithReplacement(PhiloxHIPState philox_args, + int totalSamples, + int64_t* dest, + int64_t distributions, + int categories, + const scalar_t* normDistPrefixSum, + const scalar_t* normDist) { + // At the moment, each warp computes one sample value in the binary + // search due to divergence. It seems possible to compute multiple + // values and limit divergence though later on. 
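+  // Sampling scheme, in short: for every requested sample draw
+  // u ~ Uniform(0, 1], then binary-search the row's normalized prefix sum
+  // (its CDF) for the first bucket whose cumulative mass reaches u; that
+  // bucket index is the sampled category. Philox gets a distinct subsequence
+  // per thread index so the per-thread streams do not overlap.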
+ + auto seeds = at::zoom::philox::unpack(philox_args); + + // global index formula for 2D grid of 1D blocks + int idx = blockIdx.y * gridDim.x * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; + + hiprandStatePhilox4_32_10_t state; + hiprand_init(std::get<0>(seeds), + idx, + std::get<1>(seeds), + &state); + + // The block determines the distribution for which we generate a point + for (int64_t curDist = blockIdx.y; + curDist < distributions; + curDist += gridDim.y) { + for (int sample = blockIdx.x*blockDim.x + threadIdx.x; + sample < totalSamples; sample += blockDim.x*gridDim.x) { + + //we are losing 3 out of 4 generated numbers but it's ok + //this kernel is not very efficient anyway + auto rand = hiprand_uniform4(&state); + scalar_t r = static_cast(rand.x); + + // Find the bucket that a uniform sample lies in + int choice = binarySearchForMultinomial( + normDistPrefixSum + curDist * categories, + normDist + curDist * categories, + categories, + r); + + dest[curDist * totalSamples + sample] = choice; + + } + } +} + +template +C10_LAUNCH_BOUNDS_1(zoom::detail::HIP_NUM_THREADS) +__global__ void sampleMultinomialOnce( + int64_t* dest, + int64_t distributions, + int categories, + const scalar_t* sampled, + const scalar_t* dist, + int stride_dist, // dist->stride(0) + int stride_categories // dist->stride(1) +) { + extern __shared__ unsigned char my_smem[]; + __shared__ bool found; + __shared__ unsigned foundPos; + + accscalar_t *smem = reinterpret_cast(my_smem); + + accscalar_t accZero = static_cast(0); + scalar_t zero = static_cast(0); + + for (int64_t curDist = blockIdx.x; + curDist < distributions; curDist += gridDim.x) { + // Each block handles one distribution + // First pass, find the total sum of the distribution + accscalar_t sum = accZero; + scalar_t val; + for (int cat = threadIdx.x; cat < categories; cat += blockDim.x) { + val = dist[curDist * stride_dist + cat * stride_categories]; + ZOOM_KERNEL_ASSERT(!at::_isnan(val)); + ZOOM_KERNEL_ASSERT(!_isinf(val)); + ZOOM_KERNEL_ASSERT(!(val < zero)); + sum = sum + static_cast(val); + } + + // threadIdx.x == 0 has the sum value from this + sum = zoom_utils::BlockReduceSum(sum, smem); + + // Broadcast sum and sample value + if (threadIdx.x == 0) { + // Make sure the sum of our distribution didn't overflow + ZOOM_KERNEL_ASSERT(!_isinf(val)); + ZOOM_KERNEL_ASSERT(sum > accZero); + + foundPos = 0; + smem[0] = sum; + smem[1] = sampled[curDist]; + } + __syncthreads(); + + sum = smem[0]; + scalar_t sample = static_cast(smem[1]); + __syncthreads(); + + if (sum == accZero) { + // Choose the first element + if (threadIdx.x == 0) { + dest[curDist] = 0; + } + + continue; + } + + int chunks = (categories + (int)blockDim.x - 1) / blockDim.x; + accscalar_t prevHighProb = accZero; + found = false; + + for (int chunk = 0; chunk < chunks && !found; ++chunk) { + // All threads in bounds load a value + int cat = chunk * blockDim.x + threadIdx.x; + + accscalar_t dist_val = cat < categories ? 
+ static_cast(dist[curDist * stride_dist + cat * stride_categories]) / sum : + accZero; + + smem[threadIdx.x] = dist_val; + __syncthreads(); + + // Perform an inclusive prefix sum of the shared memory contents + for (int offset = 1; offset < blockDim.x; offset *= 2) { + accscalar_t val = accZero; + + if (threadIdx.x >= offset) { + val = smem[threadIdx.x - offset] + smem[threadIdx.x]; + } + + __syncthreads(); + if (threadIdx.x >= offset) { + smem[threadIdx.x] = val; + } + __syncthreads(); + } + + // Each thread will check to see if the sample falls in its + // bucket + scalar_t curBucket = + static_cast(smem[threadIdx.x] + prevHighProb); + scalar_t prevBucket = static_cast( + threadIdx.x == 0 ? prevHighProb + : smem[threadIdx.x - 1] + prevHighProb); + bool inBucket = + (cat < categories) && + (!(sample >= curBucket) && + (sample >= prevBucket) && + (dist_val > zero)); + + if (inBucket) { + // We're done; we have the sample + // Torch indices are 1-based + atomicMax(&foundPos, cat); + found = true; + } + + // Store the previous scan's high value for future use + prevHighProb = prevHighProb + smem[blockDim.x - 1]; + + __syncthreads(); + } + + if (threadIdx.x == 0) { + if (found) { + dest[curDist] = foundPos; + } else { + // This should address a rare bug where we don't select a valid index. This likely occurs when + // due to floating point arithmetic rounding errors, our cumulative sum does not add up to 1, but + // and our uniform sample is greater than this value. In this case we likely have unitialized memory + // in dest[curDist]. So basically we will loop through the distribution and pick the largest index + // where the distribution is non-zero. This is obviously terribly inefficient, but due to the + // rarity in which this occurs, this should not be an issue. + for (int cat = categories - 1; cat >= 0; --cat) { + if (dist[curDist * stride_dist + cat * stride_categories] > zero) { + dest[curDist] = cat; + break; + } + } + } + } + } +} + +void multinomial_with_replacement_kernel_impl( + Tensor& result, + const Tensor& self, + const int64_t n_sample, + std::optional generator) { + auto gen = get_generator_or_default(generator, zoom::detail::getDefaultZoomGenerator()); + + int inputSize = self.dim(); + int64_t numDist = + inputSize == 1 ? 1 : self.size(0); + int numCategories = + inputSize == 1 ? self.size(0) : self.size(1); + + // Restructure data for 2d + auto self_v = inputSize == 1 ? self.view({numDist, numCategories}) : self; + + result.resize_({numDist, n_sample}); + + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, self_v.scalar_type(), "multinomial_kernel_zoom", [&] { + using accscalar_t = at::acc_type; + auto props = at::zoom::getCurrentDeviceProperties(); + TORCH_CHECK(props != nullptr); + int numSM = props->multiProcessorCount; + int maxThreads = props->maxThreadsPerBlock; + int maxShared = props->sharedMemPerBlock; + + int warp_size = at::zoom::warp_size(); + int requiredWarps = at::ceil_div(numCategories, warp_size); + int requiredThreads = std::min(maxThreads, requiredWarps * warp_size); + int requiredShared = requiredThreads * sizeof(accscalar_t); + + if (n_sample == 1 && maxShared >= requiredShared) { + // Optimized allocation-free implementation + // To exploit greater parallelism for the sampling, generate the + // Uniform random samples in a separate kernel launch, into + // temporarily allocated memory. 
The device RNG is thread-limited + Tensor sampled = at::detail::empty_zoom({numDist, n_sample}, self_v.options()); + at::native::uniform_(sampled, 0.0, 1.0, generator); + + dim3 block(requiredThreads); + dim3 grid(std::min(static_cast(numDist), numSM * 4)); + + sampleMultinomialOnce + <<>>( + result.mutable_data_ptr(), + numDist, + numCategories, + sampled.const_data_ptr(), + self_v.const_data_ptr(), + self_v.stride(0), + self_v.stride(1) + ); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } else { + // Generic, slow implementation with memory allocations + + // For sampling without replacement, we modify the distribution + // for subsequent samples in this space + Tensor origDist = native::empty_like( + self_v, + c10::nullopt /* dtype */, + c10::nullopt /* layout */, + c10::nullopt /* device */, + c10::nullopt /* pin_memory */, + LEGACY_CONTIGUOUS_MEMORY_FORMAT); + origDist.copy_(self_v); + + Tensor normDist = native::empty_like( + self_v, + c10::nullopt /* dtype */, + c10::nullopt /* layout */, + c10::nullopt /* device */, + c10::nullopt /* pin_memory */, + LEGACY_CONTIGUOUS_MEMORY_FORMAT); + + Tensor prefixSum = native::empty_like( + self_v, + c10::nullopt /* dtype */, + c10::nullopt /* layout */, + c10::nullopt /* device */, + c10::nullopt /* pin_memory */, + LEGACY_CONTIGUOUS_MEMORY_FORMAT); + + // Renorm along rows + normDist.copy_(origDist); + renormRows(normDist); + + // Prefix sum along rows + at::privateuse1::cumsum_out(prefixSum, normDist, 1); + + PhiloxHIPState rng_engine_inputs; + + // Binary search is warp divergent (so effectively we're running + // with just a single thread), but for better utilization, + // we need each block to have at least 4 warps. + dim3 block(128); + + // Each block will generate a sample from one + // distribution concurrently. + int grid_y=std::min(numDist, at::zoom::getCurrentDeviceProperties()->maxGridSize[1]); + dim3 grid((n_sample-1)/block.x+1, grid_y); + { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + + // each thread generates a single sample for (numdist/numblocks.y) distributions, however, since we have to use + // curand_uniform4 (See Note [Register spilling in curand call for CUDA < 10]), + // offset is 4 times that. 
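+      // i.e. offset = ceil(numDist / grid.y) * 4. For example, numDist = 10
+      // with grid.y = 4 gives at most 3 distributions per thread, so the
+      // generator state is advanced by 3 * 4 = 12 Philox increments per launch.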
+ auto offset = ((numDist-1)/grid.y+1)*4; + rng_engine_inputs = gen->philox_hip_state(offset); + } + // Sample with replacement + + sampleMultinomialWithReplacement + <<>>( + rng_engine_inputs, + n_sample, + result.mutable_data_ptr(), + numDist, numCategories, + prefixSum.const_data_ptr(), + normDist.const_data_ptr()); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + } + }); + + if (inputSize == 1) { + result.resize_({n_sample}); + } +} +} + +REGISTER_PRIVATEUSE1_DISPATCH( + multinomial_with_replacement_stub, + &multinomial_with_replacement_kernel_impl); +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/ReduceAMinMaxKernel.cu b/aten/src/ATen/native/zoom/ReduceAMinMaxKernel.cu new file mode 100644 index 00000000000000..e31cfc186de1ba --- /dev/null +++ b/aten/src/ATen/native/zoom/ReduceAMinMaxKernel.cu @@ -0,0 +1,45 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { + +template +void _min_max_values_kernel_zoom_impl(TensorIterator& iter) { + gpu_reduce_kernel( + iter, + MinMaxOps{}, + thrust::pair( + at::numeric_limits::upper_bound(), + at::numeric_limits::lower_bound())); +} + +void aminmax_allreduce_launch_kernel(TensorIterator& iter) { + AT_DISPATCH_ALL_TYPES_AND3( + kBFloat16, kHalf, kBool, iter.input_dtype(), "aminmax_all_zoom", [&] { + _min_max_values_kernel_zoom_impl(iter); + }); +} + +void aminmax_launch_kernel(TensorIterator& iter) { + AT_DISPATCH_ALL_TYPES_AND3( + kBFloat16, kHalf, kBool, iter.input_dtype(), "aminmax_zoom", [&]() { + gpu_reduce_kernel( + iter, + MinMaxOps{}, + thrust::pair( + at::numeric_limits::upper_bound(), + at::numeric_limits::lower_bound())); + }); +} + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ReduceArgMaxKernel.cu b/aten/src/ATen/native/zoom/ReduceArgMaxKernel.cu new file mode 100644 index 00000000000000..b5f526ebff9ef2 --- /dev/null +++ b/aten/src/ATen/native/zoom/ReduceArgMaxKernel.cu @@ -0,0 +1,46 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace at::native { + +template +void argmax_kernel_zoom_impl(TensorIterator& iter) { + gpu_reduce_kernel( + iter, + ArgMaxOps{}, + thrust::pair( + at::numeric_limits::lower_bound(), 0)); +}; + +void argmax_kernel_zoom(TensorIterator& iter) { + // For float16 & bfloat16, instead of implementing is_nan and warp_shfl_down, + // we can convert float16 & bfloat16 to float and do all the operations in + // float. 
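+  // Note: iter.dtype(1) is the input dtype; the argmax output itself is
+  // always an int64 index, so the dispatch keys on the input dtype rather
+  // than on iter.dtype(0).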
+ if (iter.dtype(1) == kHalf) { + argmax_kernel_zoom_impl(iter); + } else if (iter.dtype(1) == kBFloat16) { + argmax_kernel_zoom_impl(iter); + } else { + AT_DISPATCH_ALL_TYPES(iter.dtype(1), "argmax_zoom", [&]() { + argmax_kernel_zoom_impl(iter); + }); + } +} + +REGISTER_PRIVATEUSE1_DISPATCH(argmax_stub, &argmax_kernel_zoom); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ReduceArgMinKernel.cu b/aten/src/ATen/native/zoom/ReduceArgMinKernel.cu new file mode 100644 index 00000000000000..5007d0abeeca3f --- /dev/null +++ b/aten/src/ATen/native/zoom/ReduceArgMinKernel.cu @@ -0,0 +1,46 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace at::native { + +template +void argmin_kernel_zoom_impl(TensorIterator& iter) { + gpu_reduce_kernel( + iter, + ArgMinOps{}, + thrust::pair( + at::numeric_limits::upper_bound(), 0)); +}; + +void argmin_kernel_zoom(TensorIterator& iter) { + // For float16 & bfloat16, instead of implementing is_nan and warp_shfl_down, + // we can convert float16 & bfloat16 to float and do all the operations in + // float. + if (iter.dtype(1) == kHalf) { + argmin_kernel_zoom_impl(iter); + } else if (iter.dtype(1) == kBFloat16) { + argmin_kernel_zoom_impl(iter); + } else { + AT_DISPATCH_ALL_TYPES(iter.dtype(1), "argmin_zoom", [&]() { + argmin_kernel_zoom_impl(iter); + }); + } +} + +REGISTER_PRIVATEUSE1_DISPATCH(argmin_stub, &argmin_kernel_zoom); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ReduceLogicKernel.cu b/aten/src/ATen/native/zoom/ReduceLogicKernel.cu index fb6bb731781358..fafe22cc4b1fd3 100644 --- a/aten/src/ATen/native/zoom/ReduceLogicKernel.cu +++ b/aten/src/ATen/native/zoom/ReduceLogicKernel.cu @@ -35,4 +35,4 @@ void or_kernel_zoom(TensorIterator& iter) { REGISTER_PRIVATEUSE1_DISPATCH(and_stub, &and_kernel_zoom); REGISTER_PRIVATEUSE1_DISPATCH(or_stub, &or_kernel_zoom); -} // namespace at::native \ No newline at end of file +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ReduceMaxValuesKernel.cu b/aten/src/ATen/native/zoom/ReduceMaxValuesKernel.cu new file mode 100644 index 00000000000000..7da6d4a0e0855b --- /dev/null +++ b/aten/src/ATen/native/zoom/ReduceMaxValuesKernel.cu @@ -0,0 +1,61 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace at::native { + +template +struct MaxNanFunctor { + __device__ __forceinline__ acc_t operator()(acc_t a, acc_t b) const { + return (at::_isnan(a) || a > b) ? 
a : b; + } +}; + +template +void max_values_kernel_zoom_impl(TensorIterator& iter) { + gpu_reduce_kernel( + iter, + func_wrapper(MaxNanFunctor()), + at::numeric_limits::lower_bound()); +} + +void max_values_kernel_zoom(TensorIterator& iter) { + AT_DISPATCH_ALL_TYPES_AND3( + kBFloat16, kHalf, kBool, iter.dtype(), "max_values_zoom", [&]() { + max_values_kernel_zoom_impl(iter); + }); +} + +void max_launch_kernel(TensorIterator& iter) { + AT_DISPATCH_ALL_TYPES_AND3( + kBFloat16, kHalf, kBool, iter.input_dtype(), "max_zoom", [&]() { + gpu_reduce_kernel( + iter, + MaxOps{}, + thrust::pair( + at::numeric_limits::lower_bound(), 0)); + }); +} + +void max_all_launch_kernel(TensorIterator &iter) { + AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "max_all_zoom", [&] { + max_values_kernel_zoom_impl(iter); + }); +} + +REGISTER_PRIVATEUSE1_DISPATCH(max_values_stub, &max_values_kernel_zoom); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ReduceMinValuesKernel.cu b/aten/src/ATen/native/zoom/ReduceMinValuesKernel.cu new file mode 100644 index 00000000000000..e5acf1a000207c --- /dev/null +++ b/aten/src/ATen/native/zoom/ReduceMinValuesKernel.cu @@ -0,0 +1,58 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + + +namespace at::native { + +template +struct MinNanFunctor { + __device__ __forceinline__ acc_t operator()(acc_t a, acc_t b) const { + return (at::_isnan(a) || a < b) ? a : b; + } +}; + +template +void min_values_kernel_zoom_impl(TensorIterator& iter) { + gpu_reduce_kernel( + iter, func_wrapper (MinNanFunctor()), + at::numeric_limits::upper_bound()); +} + +void min_values_kernel_zoom(TensorIterator& iter) { + AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "min_values_zoom", [&]() { + min_values_kernel_zoom_impl(iter); + }); +} + +void min_launch_kernel(TensorIterator &iter) { + AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "min_zoom", [&]() { + gpu_reduce_kernel( + iter, + MinOps{}, + thrust::pair(at::numeric_limits::upper_bound(), 0)); + }); +} + +void min_all_launch_kernel(TensorIterator &iter) { + AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "min_all_zoom", [&] { + min_values_kernel_zoom_impl(iter); + }); +} + +REGISTER_PRIVATEUSE1_DISPATCH(min_values_stub, &min_values_kernel_zoom); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ReduceNormKernel.cu b/aten/src/ATen/native/zoom/ReduceNormKernel.cu new file mode 100644 index 00000000000000..71b1db37634943 --- /dev/null +++ b/aten/src/ATen/native/zoom/ReduceNormKernel.cu @@ -0,0 +1,51 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { + +// This reduction accumulates results as the type `acc_t`. By default, when +// `scalar_t` is complex, `acc_t` is the downgraded real number type. +// Otherwise, `acc_t` and `scalar_t` are the same type. 
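+// For a general p the reduction computes (sum_i |x_i|^p)^(1/p); p = 0 counts
+// non-zero entries and p = +/-inf reduce to max/min over |x_i|. The dedicated
+// NormZeroOps/NormOneOps/NormTwoOps branches below avoid the generic pow()
+// path for the common p = 0, 1, 2 cases.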
+template ::type, typename out_t=typename scalar_value_type::type> +void norm_kernel_zoom_impl(TensorIterator& iter, double p) { + if (p == static_cast(0)) { + gpu_reduce_kernel(iter, NormZeroOps(), 0); + } else if (p == static_cast(1)) { + gpu_reduce_kernel(iter, NormOneOps(), 0); + } else if (p == static_cast(2)) { + gpu_reduce_kernel(iter, NormTwoOps(), 0); + } else if (p == static_cast(INFINITY)) { + gpu_reduce_kernel(iter, AbsMaxOps(), 0); + } else if (p == static_cast(-INFINITY)) { + gpu_reduce_kernel(iter, AbsMinOps(), std::numeric_limits::infinity()); + } else { + gpu_reduce_kernel(iter, NormOps{acc_t(p)}, 0); + } +} + +void norm_launch_kernel(TensorIterator& iter, double ord) { + if (iter.dtype(0) == kHalf) { + return norm_kernel_zoom_impl(iter, ord); + } else if (iter.input_dtype() == kHalf && iter.dtype(0) == kFloat) { + // type promotion that does cast and reduction in a single kernel + return norm_kernel_zoom_impl(iter, ord); + } + else if(iter.dtype(0) == kBFloat16) { + return norm_kernel_zoom_impl(iter, ord); + } else if (iter.input_dtype() == kBFloat16 && iter.dtype(0) == kFloat) { + // type promotion that does cast and reduction in a single kernel + return norm_kernel_zoom_impl(iter, ord); + } + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.input_dtype(), "norm_zoom", [&] { + norm_kernel_zoom_impl(iter, ord); + }); +} + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ReduceOps.cpp b/aten/src/ATen/native/zoom/ReduceOps.cpp new file mode 100644 index 00000000000000..c57c4303ccea69 --- /dev/null +++ b/aten/src/ATen/native/zoom/ReduceOps.cpp @@ -0,0 +1,102 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#endif + +namespace at::native { +namespace { + +void norm_kernel_zoom(TensorIterator& iter, const Scalar& val) { + double p; + if (val.isIntegral(false)) { + p = val.to(); + } else if (val.isFloatingPoint()) { + p = val.to(); + } else { + TORCH_CHECK(false, "norm_kernel_zoom_impl expects norm to be integer or float"); + } + if (iter.numel() == 0) { + iter.output().fill_((p < 0) ? 
INFINITY : 0); + return; + } + + norm_launch_kernel(iter, p); + + if (isComplexType(iter.output().scalar_type())) { + at::imag(iter.output()).zero_(); + } + +} + +void min_kernel_impl(const Tensor& result, const Tensor& indice, const Tensor& self, int64_t dim, bool keepdim) { + auto iter = meta::make_reduction(self, result, indice, dim, keepdim, self.scalar_type(), kLong); + min_launch_kernel(iter); +} + +void max_kernel_impl(const Tensor& result, const Tensor& indice, const Tensor& self, int64_t dim, bool keepdim) { + auto iter = meta::make_reduction(self, result, indice, dim, keepdim, self.scalar_type(), kLong); + max_launch_kernel(iter); +} + +void aminmax_kernel_impl( + const Tensor& self, int64_t dim, bool keepdim, Tensor& min_result, Tensor& max_result) { + at::TensorIterator iter = make_reduction("aminmax_zoom", min_result, + max_result, self, dim, keepdim, self.scalar_type()); + if (iter.numel() != 0) { + aminmax_launch_kernel(iter); + } +} + +void min_all_kernel_impl(Tensor& result, const Tensor& input) { + auto dtype = input.scalar_type(); + auto iter = make_reduction("min_all", result, input, IntArrayRef{}, false, dtype); + min_all_launch_kernel(iter); +} + +void max_all_kernel_impl(Tensor& result, const Tensor& input) { + auto dtype = input.scalar_type(); + auto iter = make_reduction("max_all", result, input, IntArrayRef{}, false, dtype); + max_all_launch_kernel(iter); +} + +void aminmax_allreduce_kernel_impl(const Tensor& input, Tensor& min_result, Tensor& max_result) { + auto dtype = input.scalar_type(); + auto iter = make_reduction("aminmax_zoom", min_result, max_result, input, + IntArrayRef{}, false, dtype); + TORCH_CHECK(iter.numel() > 0, "min_max on a tensor with no elements is not defined."); + aminmax_allreduce_launch_kernel(iter); +} + +} // namespace (anonymous) + +REGISTER_PRIVATEUSE1_DISPATCH(min_stub, &min_kernel_impl); +REGISTER_PRIVATEUSE1_DISPATCH(max_stub, &max_kernel_impl); +REGISTER_PRIVATEUSE1_DISPATCH(min_all_stub, &min_all_kernel_impl); +REGISTER_PRIVATEUSE1_DISPATCH(max_all_stub, &max_all_kernel_impl); +REGISTER_PRIVATEUSE1_DISPATCH(aminmax_allreduce_stub, &aminmax_allreduce_kernel_impl); +REGISTER_PRIVATEUSE1_DISPATCH(aminmax_stub, &aminmax_kernel_impl); + +REGISTER_PRIVATEUSE1_DISPATCH(norm_stub, &norm_kernel_zoom); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ReduceOps.h b/aten/src/ATen/native/zoom/ReduceOps.h new file mode 100644 index 00000000000000..a67a019ae49e2e --- /dev/null +++ b/aten/src/ATen/native/zoom/ReduceOps.h @@ -0,0 +1,20 @@ + +namespace at { +struct TensorIterator; +} + +namespace c10 { +class Scalar; +} + +namespace at { namespace native { + +void norm_launch_kernel(TensorIterator &iter, double val); +void min_launch_kernel(TensorIterator &iter); +void max_launch_kernel(TensorIterator &iter); +void aminmax_launch_kernel(TensorIterator &iter); +void min_all_launch_kernel(TensorIterator &iter); +void max_all_launch_kernel(TensorIterator &iter); +void aminmax_allreduce_launch_kernel(TensorIterator &iter); + +}} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ReduceSumProdKernel.cu b/aten/src/ATen/native/zoom/ReduceSumProdKernel.cu new file mode 100644 index 00000000000000..815dfb35ac0252 --- /dev/null +++ b/aten/src/ATen/native/zoom/ReduceSumProdKernel.cu @@ -0,0 +1,215 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +// #include +#include + +namespace at::native { + +template +struct sum_functor { + void operator()(TensorIterator& iter) { + 
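+    // Plain sum: combine with a + b, accumulating in acc_t (float when
+    // reduce_dispatch below routes half/bfloat16 inputs here); func_wrapper
+    // lifts the binary lambda into the ops interface gpu_reduce_kernel expects.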
gpu_reduce_kernel( + iter, func_wrapper([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t { + return a + b; + })); + } +}; + +// jiterated specialization for `complex` +CONSTEXPR_EXCEPT_WIN_CUDA char sum_name[] = "sum"; +template <> +struct sum_functor> { +// jiterator reduction fails on windows +// Ref: https://github.com/pytorch/pytorch/issues/77305 +#if AT_USE_JITERATOR() && !defined(_MSC_VER) + void operator()(TensorIterator& iter) { + using scalar_t = c10::complex; + std::string func = jiterator_stringify( + arg_t combine(arg_t a, arg_t b) { + return a + b; + } + ); + jitted_gpu_reduce_kernel( + iter, func, 0.); + } +#else + void operator()(TensorIterator& iter) { + using scalar_t = c10::complex; + using acc_t = at::opmath_type; + gpu_reduce_kernel( + iter, func_wrapper([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t { + return a + b; + }), acc_t{0.}); + } +#endif +}; + +template +struct nansum_functor { + void operator()(TensorIterator& iter) { + gpu_reduce_kernel( + iter, NanSumOps{}); + } +}; + +CONSTEXPR_EXCEPT_WIN_CUDA char nansum_name[] = "nansum"; +template +struct nansum_functor_complex { +#if AT_USE_JITERATOR() + void operator()(TensorIterator& iter) { + std::string func = jiterator_stringify( + arg_t combine(arg_t a, scalar_t b) { + return a + (std::isnan(b) ? arg_t{0.} : arg_t{b}); + } + ); + jitted_gpu_reduce_kernel( + iter, func, 0.); + } +#else + void operator()(TensorIterator& iter) { + using acc_t = at::opmath_type; + gpu_reduce_kernel( + iter, NanSumOps{}); + } +#endif +}; + +CONSTEXPR_EXCEPT_WIN_CUDA char prod_name[] = "prod"; +template +struct prod_functor { + // jiterator reduction fails on windows + // Ref: https://github.com/pytorch/pytorch/issues/77305 + #if AT_USE_JITERATOR() && !defined(_MSC_VER) + void operator()(TensorIterator& iter) { + std::string func = jiterator_stringify( + arg_t combine(arg_t a, arg_t b) { + return a * b; + } + ); + jitted_gpu_reduce_kernel( + iter, func, 1.); + } + #else + void operator()(TensorIterator& iter) { + gpu_reduce_kernel( + iter, func_wrapper([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t { + return a * b; + }), 1.); + } + #endif +}; + +// Workaround for the error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] +template <> +struct prod_functor { + void operator()(TensorIterator& iter) { + gpu_reduce_kernel( + iter, func_wrapper([] GPU_LAMBDA(bool a, bool b) -> bool { + return a && b; + }), 1); + } +}; + +// jiterated specialization for `complex` +template <> +struct prod_functor> { +// jiterator reduction fails on windows +// Ref: https://github.com/pytorch/pytorch/issues/77305 +#if AT_USE_JITERATOR() && !defined(_MSC_VER) + void operator()(TensorIterator& iter) { + using scalar_t = c10::complex; + std::string func = + jiterator_stringify(arg_t combine(arg_t a, arg_t b) { return a * b; }); + jitted_gpu_reduce_kernel(iter, func, 1.); + } +#else + void operator()(TensorIterator& iter) { + using scalar_t = c10::complex; + using acc_t = at::opmath_type; + gpu_reduce_kernel( + iter, + func_wrapper( + [] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t { return a * b; }), + acc_t{1.}); + } +#endif +}; + +// The function `reduce_dispatch` below dispatches to the kernel based +// on the type of `iter`. It takes care of the common logic +// for handling Half-Precision floating types. +// Otherwise the functor `op` is called to dispatch to the kernel +// of relevant type. +// +// Note: Functor `op` should take care of all the types to be supported +// except for `at::Half` and `at::BFloat16`. 
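+// For example, a kHalf input reduced into a kFloat output selects
+// OpFunctor<at::Half, float, float>, doing the cast and the reduction in a
+// single kernel instead of materializing a float copy of the input first.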
+template < + template < + typename scalar_t, + typename acc_t = scalar_t, + typename out_t = scalar_t> + typename OpFunctor, + typename GeneralDispatcher> +static void reduce_dispatch(TensorIterator& iter, GeneralDispatcher op) { + if (iter.dtype() == kHalf) { + return OpFunctor{}(iter); + } else if (iter.dtype(1) == kHalf && iter.dtype() == kFloat) { + // type promotion that does cast and reduction in a single kernel + return OpFunctor{}(iter); + } else if (iter.dtype() == kBFloat16) { + return OpFunctor{}(iter); + } else if (iter.dtype(1) == kBFloat16 && iter.dtype() == kFloat) { + // type promotion that does cast and reduction in a single kernel + return OpFunctor{}(iter); + } + op(iter); +} + +static void sum_kernel_zoom(TensorIterator& iter){ + auto general_dispatcher = [](TensorIterator& iter) { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + kBool, kComplexHalf, iter.dtype(), "sum_zoom", [&]() { + sum_functor{}(iter); + }); + }; + + reduce_dispatch(iter, general_dispatcher); +} + +static void nansum_kernel_zoom(TensorIterator& iter) { + auto general_dispatcher = [](TensorIterator& iter) { + auto dtype = iter.dtype(); + if (at::isComplexType(dtype)) { + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "nansum_zoom", [&]() { + nansum_functor_complex{}(iter); + }); + } else { + AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "nansum_zoom", [&]() { + nansum_functor{}(iter); + }); + } + }; + + reduce_dispatch(iter, general_dispatcher); +} + +static void prod_kernel_zoom(TensorIterator& iter) { + auto general_dispatcher = [](TensorIterator& iter) { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kComplexHalf, kBool, iter.dtype(), "prod_zoom", [&]() { + prod_functor{}(iter); + }); + }; + + reduce_dispatch(iter, general_dispatcher); +} + +REGISTER_PRIVATEUSE1_DISPATCH(sum_stub, &sum_kernel_zoom); +REGISTER_PRIVATEUSE1_DISPATCH(nansum_stub, &nansum_kernel_zoom); +REGISTER_PRIVATEUSE1_DISPATCH(prod_stub, &prod_kernel_zoom); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ScatterGatherKernel.cu b/aten/src/ATen/native/zoom/ScatterGatherKernel.cu new file mode 100644 index 00000000000000..4d0121d83e5204 --- /dev/null +++ b/aten/src/ATen/native/zoom/ScatterGatherKernel.cu @@ -0,0 +1,573 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace at::native { + +// Implement as functors since lambdas don't get optimized. 
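+// Each reduction mode is backed by an atomic primitive so scatter with
+// duplicate indices stays race-free: multiply uses gpuAtomicMul, add (and
+// mean, which accumulates like add here, with the averaging expected to be
+// done by the caller) uses fastAtomicAdd, amin/amax use gpuAtomicMin/Max,
+// and plain scatter is an ordinary assignment with no guarantee about which
+// duplicate wins.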
+class ReduceMultiply { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + (void)numel; // suppress unused warning + gpuAtomicMul(self_data_start + index, *src_data); + } +}; +static ReduceMultiply reduce_multiply; + +class ReduceAdd { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + fastAtomicAdd(self_data_start, index, numel, *src_data, true); + } +}; +static ReduceAdd reduce_add; + +class ReduceMean { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + fastAtomicAdd(self_data_start, index, numel, *src_data, true); + } +}; +static ReduceMean reduce_mean; + +class ReduceMinimum { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + (void)numel; // suppress unused warning + gpuAtomicMin(self_data_start + index, *src_data); + } +}; +static ReduceMinimum reduce_minimum; + +class ReduceMaximum { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + (void)numel; // suppress unused warning + gpuAtomicMax(self_data_start + index, *src_data); + } +}; +static ReduceMaximum reduce_maximum; + +class TensorAssign { +public: + template + constexpr C10_DEVICE void operator() (scalar_t* self_data_start, int64_t index, int64_t numel, const scalar_t * src_data) const { + (void)numel; // suppress unused warning + *(self_data_start + index) = *src_data; + } +}; +static TensorAssign tensor_assign; + +// The kernels are implemented on an opaque, +// self-aligned type of the correct size, +// to avoid redundant kernels for different types +// of the same size. +template struct alignas(N) OpaqueType { char data[N]; }; + +// essentially rewritten related to legacy::launch_kernel parts +template +C10_LAUNCH_BOUNDS_2(nt, vt) +__global__ void _scatter_gather_elementwise_kernel(int N, func_t f) { + constexpr int nv = nt * vt; + int idx = nv * blockIdx.x + threadIdx.x; + + #pragma unroll + for (int i = 0; i < vt; ++i) { + if (idx < N) { + f(idx); + idx += nt; + } + } +} + +template +static void _launch_scatter_gather_kernel(int64_t N, const func_t& f) { + TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits::max()); + if (N == 0) { + return; + } + + const dim3 block(nt); + const dim3 grid((N + block.x * vt - 1) / (block.x * vt)); + const auto stream = c10::zoom::getCurrentZoomStream(); + _scatter_gather_elementwise_kernel<<>>(N, f); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); +} + + +template +struct _zoom_scatter_gather_internal_kernel { + template + void operator() ( + TensorIterator& iter, + int64_t index_size, + int64_t index_stride, + int64_t numel, // Do not use `const` qualifier here as it may cause issue in cuda 11.6.x. 
See #75434, #75545 + const func_t& f + ) { + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + _zoom_scatter_gather_internal_kernel()( + sub_iter, index_size, index_stride, numel, f + ); + } + return; + } + + char* self_ptr = (char*)iter.data_ptr(0); + char* src_ptr = (char*)iter.data_ptr(1); + char* index_ptr = (char*)iter.data_ptr(2); + + auto offset_calc = make_offset_calculator<3>(iter); + auto loop = [=]C10_DEVICE(int i) { + auto offsets = offset_calc.get(i); + + int64_t idx_dim = *(int64_t*)(index_ptr + offsets[2]); + ZOOM_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size + && "index out of bounds"); + + f( + (scalar_t*)(self_ptr + offsets[0]), + is_scatter_like ? idx_dim * index_stride : 0, + numel, + (scalar_t*)(src_ptr + offsets[1]) + (is_scatter_like ? 0 : idx_dim * index_stride) + ); + }; + + _launch_scatter_gather_kernel(iter.numel(), loop); + } +}; // struct _zoom_scatter_fill_internal_kernel + +template +struct zoom_scatter_gather_base_kernel { + void operator()( + const Tensor& self, int64_t dim, + const Tensor& index, const Tensor& src, + const std::string& method_name, + const ReduceAdd& f + ) { + at::assert_no_internal_overlap(self); + + auto index_sizes = ensure_nonempty_vec(index.sizes().vec()); + auto self_strides = ensure_nonempty_vec(self.strides().vec()); + auto src_strides = ensure_nonempty_vec(src.strides().vec()); + + // restride self and src such that + // self.shape = src.shape = index.shape + // + // restride stride[dim] such that + // if (is_scatter_like) self.stride[dim] = 0 + // else src.stride[dim] = 0 + auto self_restrided = is_scatter_like ? + restride_dim(self, dim, index_sizes) + : self.as_strided(index_sizes, self_strides); + auto src_restrided = is_scatter_like ? + src.as_strided(index_sizes, src_strides) + : restride_dim(src, dim, index_sizes); + + auto iter = TensorIteratorConfig() + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .resize_outputs(false) + .add_output(self_restrided) + .add_const_input(src_restrided) + .add_const_input(index) + .build(); + + auto self_dim_stride = ensure_nonempty_stride(self, dim); + auto self_dim_size = ensure_nonempty_size(self, dim); + + auto src_dim_stride = ensure_nonempty_stride(src, dim); + auto src_dim_size = ensure_nonempty_size(src, dim); + + auto index_size = is_scatter_like ? self_dim_size : src_dim_size; + auto index_stride = is_scatter_like ? self_dim_stride : src_dim_stride; + + + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, + iter.dtype(), + "zoom_scatter_gather_base_kernel_func", [&] { + using dtype = typename std::conditional, scalar_t>::type; + + _zoom_scatter_gather_internal_kernel()( + iter, index_size, index_stride, self.numel(), f + ); + } + ); + } + + void operator()( + const Tensor& self, int64_t dim, + const Tensor& index, const Tensor& src, + const std::string& method_name, + const TensorAssign& f + ) { + at::assert_no_internal_overlap(self); + + auto index_sizes = ensure_nonempty_vec(index.sizes().vec()); + auto self_strides = ensure_nonempty_vec(self.strides().vec()); + auto src_strides = ensure_nonempty_vec(src.strides().vec()); + + // restride self and src such that + // self.shape = src.shape = index.shape + // + // restride stride[dim] such that + // if (is_scatter_like) self.stride[dim] = 0 + // else src.stride[dim] = 0 + auto self_restrided = is_scatter_like ? 
+ restride_dim(self, dim, index_sizes) + : self.as_strided(index_sizes, self_strides); + auto src_restrided = is_scatter_like ? + src.as_strided(index_sizes, src_strides) + : restride_dim(src, dim, index_sizes); + + auto iter = TensorIteratorConfig() + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .resize_outputs(false) + .add_output(self_restrided) + .add_const_input(src_restrided) + .add_const_input(index) + .build(); + + auto self_dim_stride = ensure_nonempty_stride(self, dim); + auto self_dim_size = ensure_nonempty_size(self, dim); + + auto src_dim_stride = ensure_nonempty_stride(src, dim); + auto src_dim_size = ensure_nonempty_size(src, dim); + + auto index_size = is_scatter_like ? self_dim_size : src_dim_size; + auto index_stride = is_scatter_like ? self_dim_stride : src_dim_stride; + + + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, + iter.dtype(), + "zoom_scatter_gather_base_kernel_func", [&] { + using dtype = typename std::conditional, scalar_t>::type; + + _zoom_scatter_gather_internal_kernel()( + iter, index_size, index_stride, self.numel(), f + ); + } + ); + } + + template + void operator()( + const Tensor& self, int64_t dim, + const Tensor& index, const Tensor& src, + const std::string& method_name, + const func_t& f + ) { + at::assert_no_internal_overlap(self); + + auto index_sizes = ensure_nonempty_vec(index.sizes().vec()); + auto self_strides = ensure_nonempty_vec(self.strides().vec()); + auto src_strides = ensure_nonempty_vec(src.strides().vec()); + + // restride self and src such that + // self.shape = src.shape = index.shape + // + // restride stride[dim] such that + // if (is_scatter_like) self.stride[dim] = 0 + // else src.stride[dim] = 0 + auto self_restrided = is_scatter_like ? + restride_dim(self, dim, index_sizes) + : self.as_strided(index_sizes, self_strides); + auto src_restrided = is_scatter_like ? + src.as_strided(index_sizes, src_strides) + : restride_dim(src, dim, index_sizes); + + auto iter = TensorIteratorConfig() + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .resize_outputs(false) + .add_output(self_restrided) + .add_const_input(src_restrided) + .add_const_input(index) + .build(); + + auto self_dim_stride = ensure_nonempty_stride(self, dim); + auto self_dim_size = ensure_nonempty_size(self, dim); + + auto src_dim_stride = ensure_nonempty_stride(src, dim); + auto src_dim_size = ensure_nonempty_size(src, dim); + + auto index_size = is_scatter_like ? self_dim_size : src_dim_size; + auto index_stride = is_scatter_like ? self_dim_stride : src_dim_stride; + + + AT_DISPATCH_ALL_TYPES_AND2( + at::ScalarType::Half, at::ScalarType::BFloat16, + iter.dtype(), + "zoom_scatter_gather_base_kernel_func", [&] { + using dtype = typename std::conditional, scalar_t>::type; + + _zoom_scatter_gather_internal_kernel()( + iter, index_size, index_stride, self.numel(), f + ); + } + ); + } +}; // struct zoom_scatter_gather_base_kernel + +template +struct _zoom_scatter_fill_internal_kernel { + template + void operator()( + TensorIterator& iter, + scalar_t src_val, + int64_t index_size, + int64_t index_stride, + int64_t numel, // Do not use `const` qualifier here as it may cause issue in cuda 11.6.x. 
See #75434, #75545 + const func_t& f + ) { + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + _zoom_scatter_fill_internal_kernel()( + sub_iter, src_val, index_size, index_stride, numel, f + ); + } + return; + } + + char* self_ptr = (char*)iter.data_ptr(0); + char* index_ptr = (char*)iter.data_ptr(1); + + auto offset_calc = make_offset_calculator<2>(iter); + auto loop = [=]C10_DEVICE(int i) { + auto offsets = offset_calc.get(i); + + int64_t idx_dim = *(int64_t*)(index_ptr + offsets[1]); + ZOOM_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size + && "index out of bounds" + ); + + f( + (scalar_t*)(self_ptr + offsets[0]), + idx_dim * index_stride, + numel, + (scalar_t*)&src_val + ); + }; + + _launch_scatter_gather_kernel(iter.numel(), loop); + } +}; // struct _zoom_scatter_fill_internal_kernel + +template +struct zoom_scatter_fill_base_kernel { + template + void operator()( + const Tensor& self, int64_t dim, + const Tensor& index, Scalar src, + const std::string& method_name, + const func_t& f + ) { + at::assert_no_internal_overlap(self); + + auto index_sizes = ensure_nonempty_vec(index.sizes().vec()); + + // restride self such that + // self.shape = index.shape and + // self.stride[dim] = 0 + auto self_restrided = restride_dim(self, dim, index_sizes); + + auto iter = TensorIteratorConfig() + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .resize_outputs(false) + .add_output(self_restrided) + .add_const_input(index) + .build(); + + auto index_size = ensure_nonempty_size(self, dim); + auto index_stride = ensure_nonempty_stride(self, dim); + + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, + iter.dtype(), + "zoom_scatter_fill_base_kernel_func", [&] { + using dtype = typename std::conditional, scalar_t>::type; + + auto src_scalar_val = src.to(); + auto src_val = *(dtype*)&src_scalar_val; + + _zoom_scatter_fill_internal_kernel()( + iter, src_val, index_size, index_stride, self.numel(), f + ); + } + ); + } + + void operator()( + const Tensor& self, int64_t dim, + const Tensor& index, Scalar src, + const std::string& method_name, + const ReduceMultiply& f + ) { + at::assert_no_internal_overlap(self); + + auto index_sizes = ensure_nonempty_vec(index.sizes().vec()); + + // restride self such that + // self.shape = index.shape and + // self.stride[dim] = 0 + auto self_restrided = restride_dim(self, dim, index_sizes); + + auto iter = TensorIteratorConfig() + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .resize_outputs(false) + .add_output(self_restrided) + .add_const_input(index) + .build(); + + auto index_size = ensure_nonempty_size(self, dim); + auto index_stride = ensure_nonempty_stride(self, dim); + + AT_DISPATCH_ALL_TYPES_AND2( + at::ScalarType::Half, at::ScalarType::BFloat16, + iter.dtype(), + "zoom_scatter_fill_base_kernel_reduce_multiply", [&] { + using dtype = typename std::conditional, scalar_t>::type; + + auto src_scalar_val = src.to(); + auto src_val = *(dtype*)&src_scalar_val; + + _zoom_scatter_fill_internal_kernel()( + iter, src_val, index_size, index_stride, self.numel(), f + ); + } + ); + } +}; // struct zoom_scatter_fill_base_kernel + +void gather_zoom_kernel(const Tensor& result, const Tensor& self, int64_t dim, const Tensor& index) { + zoom_scatter_gather_base_kernel()( + result, dim, index, self, + "gather_out_zoom", tensor_assign); +} + +void scatter_zoom_kernel(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) { + // 
When indices are not unique, the behavior is non-deterministic
+  globalContext().alertNotDeterministic("scatter_zoom_");
+  zoom_scatter_gather_base_kernel<>()(
+    self, dim, index, src,
+    "scatter_zoom_", tensor_assign);
+}
+
+void scatter_fill_zoom_kernel(const Tensor& self, int64_t dim, const Tensor& index, const Scalar& src) {
+  zoom_scatter_fill_base_kernel<>()(
+    self, dim, index, src,
+    "scatter_fill_zoom_", tensor_assign);
+}
+
+void scatter_add_zoom_kernel(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) {
+  // See Note [Writing Nondeterministic Operations]
+  // Nondeterministic because of atomicAdd usage
+  globalContext().alertNotDeterministic("scatter_add_zoom_kernel");
+  zoom_scatter_gather_base_kernel<true>()(
+    self, dim, index, src,
+    "scatter_add_zoom_", reduce_add);
+}
+
+void scatter_reduce_zoom_kernel(const Tensor& self, const int64_t dim, const Tensor& index,
+                                const Tensor& src, const ReductionType& reduce) {
+  // See Note [Writing Nondeterministic Operations]
+  // Nondeterministic because of atomicAdd/AtomicMul usage
+  globalContext().alertNotDeterministic("scatter_reduce_zoom_kernel");
+  switch (reduce) {
+  case ReductionType::SUM :
+    zoom_scatter_gather_base_kernel<true>()(self, dim, index, src,
+                                            "scatter_reduce_zoom_add_", reduce_add);
+    break;
+  case ReductionType::PROD :
+    zoom_scatter_gather_base_kernel<true>()(self, dim, index, src,
+                                            "scatter_reduce_zoom_multiply_", reduce_multiply);
+    break;
+  default :
+    break;
+  }
+}
+
+void scatter_reduce_two_zoom_kernel(const Tensor& self, const int64_t dim, const Tensor& index,
+                                    const Tensor& src, const ReductionType& reduce) {
+  switch (reduce) {
+  case ReductionType::SUM :
+    globalContext().alertNotDeterministic("scatter_reduce_zoom_sum_");
+    zoom_scatter_gather_base_kernel<true>()(self, dim, index, src,
+                                            "scatter_reduce_zoom_sum_", reduce_add);
+    break;
+  case ReductionType::PROD :
+    globalContext().alertNotDeterministic("scatter_reduce_zoom_prod_");
+    zoom_scatter_gather_base_kernel<true>()(self, dim, index, src,
+                                            "scatter_reduce_zoom_prod_", reduce_multiply);
+    break;
+  case ReductionType::MAX :
+    zoom_scatter_gather_base_kernel<true>()(self, dim, index, src,
+                                            "scatter_reduce_zoom_amax_", reduce_maximum);
+    break;
+  case ReductionType::MIN :
+    zoom_scatter_gather_base_kernel<true>()(self, dim, index, src,
+                                            "scatter_reduce_zoom_amin_", reduce_minimum);
+    break;
+  case ReductionType::MEAN :
+    globalContext().alertNotDeterministic("scatter_reduce_zoom_mean_");
+    zoom_scatter_gather_base_kernel<true>()(self, dim, index, src,
+                                            "scatter_reduce_zoom_mean_", reduce_mean);
+    break;
+  }
+}
+
+void scatter_scalar_reduce_zoom_kernel(const Tensor& self, const int64_t dim, const Tensor& index,
+                                       const Scalar& value, const ReductionType& reduce) {
+  switch (reduce) {
+  case ReductionType::SUM :
+    zoom_scatter_fill_base_kernel<false>()(self, dim, index, value,
+                                           "scatter_fill_zoom_add_", reduce_add);
+    break;
+  case ReductionType::PROD :
+    zoom_scatter_fill_base_kernel<false>()(self, dim, index, value,
+                                           "scatter_fill_zoom_multiply_", reduce_multiply);
+    break;
+  default :
+    break;
+  }
+}
+
+
+REGISTER_PRIVATEUSE1_DISPATCH(gather_stub, &gather_zoom_kernel);
+REGISTER_PRIVATEUSE1_DISPATCH(scatter_stub, &scatter_zoom_kernel);
+REGISTER_PRIVATEUSE1_DISPATCH(scatter_fill_stub, &scatter_fill_zoom_kernel);
+REGISTER_PRIVATEUSE1_DISPATCH(scatter_add_stub, &scatter_add_zoom_kernel);
+REGISTER_PRIVATEUSE1_DISPATCH(scatter_reduce_stub, &scatter_reduce_zoom_kernel);
+REGISTER_PRIVATEUSE1_DISPATCH(scatter_scalar_reduce_stub, &scatter_scalar_reduce_zoom_kernel);
+REGISTER_PRIVATEUSE1_DISPATCH(scatter_reduce_two_stub, &scatter_reduce_two_zoom_kernel); + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/UnaryOpsKernel.cu b/aten/src/ATen/native/zoom/UnaryOpsKernel.cu new file mode 100644 index 00000000000000..49ed65a45004ce --- /dev/null +++ b/aten/src/ATen/native/zoom/UnaryOpsKernel.cu @@ -0,0 +1,286 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { + +void bitwise_not_kernel_zoom(TensorIteratorBase& iter) { + if (iter.dtype() == ScalarType::Bool) { + gpu_kernel(iter, []GPU_LAMBDA(bool a) { + return !a; + }); + } else { + AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "bitwise_not_zoom", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return ~a; + }); + }); + } +} + +CONSTEXPR_EXCEPT_WIN_CUDA char exp_name[] = "exp_kernel"; +void exp_kernel_zoom(TensorIteratorBase& iter) { + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { + #if AT_USE_JITERATOR() + static const auto exp_string = jiterator_stringify( + template + T exp_kernel(T x) { + return std::exp(x); + }); // exp_string + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "exp_zoom", [&]() { + jitted_gpu_kernel< + /*name=*/exp_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, exp_string); + }); + #else + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "exp_zoom", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + using opmath_t = at::opmath_type; + return std::exp(static_cast(a)); + }); + }); + #endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, common_dtype, "exp_zoom", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return std::exp(a); + }); + }); + } +} + +void expm1_kernel_zoom(TensorIteratorBase& iter) { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + ScalarType::BFloat16, ScalarType::Half, + iter.common_dtype(), "expm1_zoom", + [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return ::expm1(a); + }); + }); +} + +// We manually overload rsqrt because std::rsqrt does not work with complex types. 
+template +C10_HOST_DEVICE static inline scalar_t rsqrt_wrapper(scalar_t v) { + return ::rsqrt(v); +} + +template +C10_HOST_DEVICE static inline c10::complex rsqrt_wrapper(c10::complex v) { + const c10::complex one = c10::complex(1.0, 0); + // std::sqrt for c10::complex is overloaded in c10/util/complex_math.h + return one / ::sqrt(v); +} + +CONSTEXPR_EXCEPT_WIN_CUDA char rsqrt_name[] = "rsqrt_kernel"; +void rsqrt_kernel_zoom(TensorIteratorBase& iter) { + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { + #if AT_USE_JITERATOR() + static const auto rsqrt_string = jiterator_stringify( + template + T rsqrt_kernel(T x) { + const T one = T{1}; + return one / std::sqrt(x); + }); // rsqrt_string + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "rsqrt_zoom", [&]() { + jitted_gpu_kernel< + /*name=*/rsqrt_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, rsqrt_string); + }); + #else + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "rsqrt_zoom", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + using opmath_t = at::opmath_type; + return rsqrt_wrapper(static_cast(a)); + }); + }); + #endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::BFloat16, ScalarType::Half, + iter.common_dtype(), "rsqrt_zoom", + [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + // In CUDA, ::rsqrt is overloaded for float and at::Half here is implicitly cast to float. + return rsqrt_wrapper(a); + }); + }); + } +} + +CONSTEXPR_EXCEPT_WIN_CUDA char sqrt_name[] = "sqrt_kernel"; +void sqrt_kernel_zoom(TensorIteratorBase& iter) { + auto common_dtype = iter.common_dtype(); + if (at::isComplexType(common_dtype)) { + #if AT_USE_JITERATOR() + static const auto sqrt_string = jiterator_stringify( + template + T sqrt_kernel(T x) { + return std::sqrt(x); + }); // sqrt_string + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "sqrt_zoom", [&]() { + jitted_gpu_kernel< + /*name=*/sqrt_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/1>(iter, sqrt_string); + }); + #else + AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "sqrt_zoom", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + using opmath_t = at::opmath_type; + return ::sqrt(static_cast(a)); + }); + }); + #endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, common_dtype, "sqrt_zoom", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return std::sqrt(a); + }); + }); + } +} + +void clamp_kernel_zoom(TensorIteratorBase& iter, const Scalar& min_value, const Scalar& max_value) { + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "clamp_zoom", [&]() { + auto lower = min_value.to(); + auto upper = max_value.to(); + gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t v) -> scalar_t { + // Propagate nan, which doesn't propagate automatically for ROCm + if (_isnan(v)) { + return v; + } else { + return ::min(::max(v, lower), upper); + } + }); + }); +} + +void clamp_min_kernel_zoom(TensorIteratorBase& iter, const Scalar& min_value) { + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "clamp_min_zoom", [&]() { + auto lower = min_value.to(); + gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t v) -> scalar_t { + // Propagate nan, which doesn't propagate automatically for ROCm + if (_isnan(v)) { + return v; + } else { + return ::max(v, lower); + } + }); + }); +} + +void clamp_max_kernel_zoom(TensorIteratorBase& iter, const Scalar& max_value) { + 
AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "clamp_max_zoom", [&]() { + auto upper = max_value.to(); + gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t v) -> scalar_t { + // Propagate nan, which doesn't propagate automatically for ROCm + if (_isnan(v)) { + return v; + } else { + return ::min(v, upper); + } + }); + }); +} + +template +C10_HOST_DEVICE static inline scalar_t _nan_to_num_replace(scalar_t a, scalar_t nan_replacement, scalar_t pos_inf_replacement, scalar_t neg_inf_replacement) { + return at::_isnan(a) + ? nan_replacement + : (a == std::numeric_limits::infinity() + ? pos_inf_replacement + : (a == -std::numeric_limits::infinity() + ? neg_inf_replacement + : a)); +} + +void nan_to_num_kernel_zoom( + TensorIteratorBase& iter, + std::optional nan, + std::optional pos_inf, + std::optional neg_inf) { + if (isComplexType(iter.dtype())) { + AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "nan_to_num", [&]() { + using value_t = scalar_t::value_type; + value_t nan_replacement = static_cast(nan.value_or(0.)); + value_t pos_inf_replacement = pos_inf.has_value() + ? static_cast(pos_inf.value()) + : std::numeric_limits::max(); + value_t neg_inf_replacement = neg_inf.has_value() + ? static_cast(neg_inf.value()) + : std::numeric_limits::lowest(); + + gpu_kernel(iter, [=] GPU_LAMBDA(scalar_t a) -> scalar_t { + value_t res_real = _nan_to_num_replace( + a.real(), nan_replacement, pos_inf_replacement, neg_inf_replacement); + value_t res_imag = _nan_to_num_replace( + a.imag(), nan_replacement, pos_inf_replacement, neg_inf_replacement); + return scalar_t(res_real, res_imag); + }); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "nan_to_num_zoom", [&]() { + scalar_t nan_replacement = static_cast(nan.value_or(0.)); + scalar_t pos_inf_replacement = pos_inf.has_value() + ? static_cast(pos_inf.value()) + : std::numeric_limits::max(); + scalar_t neg_inf_replacement = neg_inf.has_value() + ? static_cast(neg_inf.value()) + : std::numeric_limits::lowest(); + + gpu_kernel(iter, [=] GPU_LAMBDA(scalar_t a) -> scalar_t { + return _nan_to_num_replace( + a, nan_replacement, pos_inf_replacement, neg_inf_replacement); + }); + }); + } +} + +void frexp_kernel_zoom(TensorIteratorBase& iter) { + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::Half, + // The iter.dtype() here is the dtype of mantissa output. + // It's a floating point type and must be the same as the input's dtype. 
+ iter.dtype(), + "frexp_zoom", [&]() { + gpu_kernel_multiple_outputs(iter, [=] GPU_LAMBDA (scalar_t a) -> thrust::tuple { + int32_t exponent; + scalar_t mantissa = std::frexp(a, &exponent); + return {mantissa, exponent}; + }); + }); +} + +REGISTER_PRIVATEUSE1_DISPATCH(bitwise_not_stub, &bitwise_not_kernel_zoom); +REGISTER_PRIVATEUSE1_DISPATCH(exp_stub, &exp_kernel_zoom); +REGISTER_PRIVATEUSE1_DISPATCH(expm1_stub, &expm1_kernel_zoom); +REGISTER_PRIVATEUSE1_DISPATCH(rsqrt_stub, &rsqrt_kernel_zoom); +REGISTER_PRIVATEUSE1_DISPATCH(sqrt_stub, &sqrt_kernel_zoom); +REGISTER_PRIVATEUSE1_DISPATCH(nan_to_num_stub, &nan_to_num_kernel_zoom); +REGISTER_PRIVATEUSE1_DISPATCH(frexp_stub, &frexp_kernel_zoom); + +} // namespace at::native From aaef6b9087992179551d3e3f6503b75b770cf196 Mon Sep 17 00:00:00 2001 From: 123epsilon Date: Fri, 17 Jan 2025 00:02:02 +0000 Subject: [PATCH 07/23] remove deps on hipblas, hipblaslt, hipsparse, hipsolver, hipfft, roctx, miopen --- cmake/Dependencies.cmake | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index e29c89479f9dad..bc0d184cb8fd98 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1240,12 +1240,17 @@ if(USE_ROCM OR USE_ZOOM) # This is needed for library added by hip_add_library (same for hip_add_executable) hip_include_directories(${Caffe2_HIP_INCLUDE}) - set(Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS + if(USE_ZOOM) + set(Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS + ${PYTORCH_HIP_LIBRARIES} ${hipcub_LIBRARIES} ${ROCM_HIPRTC_LIB}) + list(APPEND Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS hip::hiprand) + else() + set(Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS ${PYTORCH_HIP_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipcub_LIBRARIES} ${ROCM_HIPRTC_LIB} ${ROCM_ROCTX_LIB}) - list(APPEND Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS ${hipblaslt_LIBRARIES}) - - list(APPEND Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS - roc::hipblas hip::hipfft hip::hiprand roc::hipsparse roc::hipsolver) + list(APPEND Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS ${hipblaslt_LIBRARIES}) + list(APPEND Caffe2_PUBLIC_HIP_DEPENDENCY_LIBS + roc::hipblas hip::hipfft hip::hiprand roc::hipsparse roc::hipsolver) + endif() # ---[ Kernel asserts # Kernel asserts is disabled for ROCm by default. 
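The REGISTER_PRIVATEUSE1_DISPATCH calls above hook the stock ATen stubs (exp, sqrt, frexp, the scatter/gather kernels, and so on) to the Zoom implementations through the PrivateUse1 key, so ordinary tensor ops reach them once the backend has been renamed. Below is a minimal smoke test of that path; it is a sketch only, and assumes a build of this branch with USE_ZOOM enabled, the 'zoom' rename applied as in the BuildingZoom.md instructions later in this series, and working host-to-device copies. The shapes and op choices are illustrative and not part of the patch.

```python
# Minimal smoke test (sketch): exercises a few of the stubs registered above.
# Assumes a USE_ZOOM build of this branch and that CPU <-> zoom copies work.
import torch

torch.utils.rename_privateuse1_backend('zoom')

x = torch.arange(16, dtype=torch.float32).reshape(4, 4).to('zoom')
index = torch.tensor([[0, 1, 2, 3]]).to('zoom')   # int64 indices, shape (1, 4)
src = torch.ones(1, 4).to('zoom')

y = x.sqrt().exp()                                # sqrt_stub / exp_stub
g = torch.gather(x, 0, index)                     # gather_zoom_kernel
s = torch.zeros(4, 4).to('zoom').scatter_add_(0, index, src)  # scatter_add_zoom_kernel

print(y.cpu())
print(g.cpu())
print(s.cpu())
```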
From ac54e3e7aa427ff6737d9911f1962a9dbaebcd4a Mon Sep 17 00:00:00 2001 From: 123epsilon Date: Sun, 26 Jan 2025 05:30:36 +0000 Subject: [PATCH 08/23] llama example working, bmm triton kernel --- .github/workflows/build_zoom_backend.yml | 124 ++++++++++++ aten/src/ATen/native/native_functions.yaml | 36 ++-- aten/src/ATen/native/zoom/Bmm.cpp | 122 ------------ .../native/zoom/DistributionRandomKernel.cu | 27 +++ .../ATen/native/zoom/DistributionUniform.cu | 15 ++ aten/src/ATen/native/zoom/HIPbmm.cu | 132 ------------- aten/src/ATen/native/zoom/TensorCompare.cu | 133 +++++++++++++ test/test_ops.py | 3 +- torch/zoom/__init__.py | 2 +- torch/zoom/zoom_triton_mm.py | 182 ++++++++++++++++++ 10 files changed, 501 insertions(+), 275 deletions(-) create mode 100644 .github/workflows/build_zoom_backend.yml delete mode 100644 aten/src/ATen/native/zoom/Bmm.cpp create mode 100644 aten/src/ATen/native/zoom/DistributionRandomKernel.cu create mode 100644 aten/src/ATen/native/zoom/DistributionUniform.cu delete mode 100644 aten/src/ATen/native/zoom/HIPbmm.cu create mode 100644 aten/src/ATen/native/zoom/TensorCompare.cu create mode 100644 torch/zoom/zoom_triton_mm.py diff --git a/.github/workflows/build_zoom_backend.yml b/.github/workflows/build_zoom_backend.yml new file mode 100644 index 00000000000000..aa7053cafe8379 --- /dev/null +++ b/.github/workflows/build_zoom_backend.yml @@ -0,0 +1,124 @@ +name: "Build PyTorch" + +on: + workflow_dispatch: + inputs: + force_debug_with_tmate: + type: boolean + description: 'Run the build with tmate session' + required: false + default: false + debug_with_tmate: + type: boolean + description: 'Run the build with a tmate session ONLY in case of failure' + required: false + default: false + pull_request: + push: + branches: + - main + +concurrency: + group: ${{ github.workflow }}-${{ github.event.number || github.sha }} + cancel-in-progress: true + +jobs: + build: + + strategy: + fail-fast: false + matrix: + include: + - name: "ubuntu-22.04" + runs-on: "mi300" + # container: "rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0" + # runs-on: "nod-ai-shared-cpubuilder-manylinux-x86_64" + + runs-on: ${{ matrix.runs-on }} + + name: ${{ matrix.name }} + + env: + CACHE_DIR: ${{ github.workspace }}/.container-cache + # either the PR number or `branch-N` where N always increments + CACHE_KEY: linux-build-test-cpp-asserts-manylinux-v2-${{ format('{0}-{1}', github.ref_name, github.run_number) }} + + defaults: + run: + shell: bash + + permissions: + id-token: write + contents: write + + container: + image: ${{ matrix.container }} + + steps: + - name: "Check out repository" + uses: actions/checkout@v4.2.2 + with: + submodules: true + + - name: Enable cache + uses: actions/cache/restore@v3 + with: + path: ${{ env.CACHE_DIR }} + key: ${{ env.CACHE_KEY }} + restore-keys: linux-build-test-cpp- + + - name: "Build PyTorch" + id: build + run: | + + export CCACHE_DIR="${{ env.CACHE_DIR }}" + export CMAKE_C_COMPILER_LAUNCHER=ccache + export CMAKE_CXX_COMPILER_LAUNCHER=ccache + export CCACHE_SLOPPINESS=include_file_ctime,include_file_mtime,time_macros + + python -m venv venv + source venv/bin/activate + pip install -r requirements.txt + ./build.sh + + - name: "Audit" + id: audit + run: | + + sudo apt install patchelf + source venv/bin/activate + pip install auditwheel + auditwheel repair -w dist --plat manylinux_2_39_x86_64 dist/torch* + + - name: Save cache + uses: actions/cache/save@v3 + if: ${{ !cancelled() }} + with: + path: ${{ env.CACHE_DIR }} + key: ${{ env.CACHE_KEY }} + + - 
name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: ${{ matrix.name }}_artifact + path: dist + if-no-files-found: warn + + - name: Release current commit + uses: ncipollo/release-action@v1.12.0 + with: + artifacts: "dist/torch*.whl" + token: "${{ secrets.GITHUB_TOKEN }}" + tag: "latest" + name: "latest" + removeArtifacts: false + allowUpdates: true + replacesArtifacts: true + makeLatest: true + + - name: "Setup tmate session" + if: ${{ (failure() && inputs.debug_with_tmate) || inputs.force_debug_with_tmate }} + uses: mxschmitt/action-tmate@v3.18 + with: + limit-access-to-actor: true + install-dependencies: ${{ startsWith(matrix.runs-on, 'macos') || startsWith(matrix.runs-on, 'windows') }} \ No newline at end of file diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 1664a6642b4cc4..6271e79c453abf 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1352,7 +1352,6 @@ dispatch: CPU: bmm_out_cpu CUDA: bmm_out_cuda - PrivateUse1: bmm_out_zoom MPS: bmm_out_mps SparseCPU: bmm_out_sparse_cpu SparseCUDA: bmm_out_sparse_cuda @@ -1513,7 +1512,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: clamp_out + CPU, CUDA, PrivateUse1: clamp_out MPS: clamp_out_mps tags: pointwise @@ -1522,7 +1521,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: clamp_Tensor_out + CPU, CUDA, PrivateUse1: clamp_Tensor_out MPS: clamp_Tensor_out_mps tags: pointwise @@ -1553,7 +1552,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: clamp_max_out + CPU, CUDA, PrivateUse1: clamp_max_out MPS: clamp_max_out_mps tags: pointwise @@ -1562,7 +1561,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: clamp_max_Tensor_out + CPU, CUDA, PrivateUse1: clamp_max_Tensor_out MPS: clamp_max_Tensor_out_mps tags: pointwise @@ -1593,7 +1592,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: clamp_min_out + CPU, CUDA, PrivateUse1: clamp_min_out MPS: clamp_min_out_mps tags: pointwise @@ -1602,7 +1601,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: clamp_min_Tensor_out + CPU, CUDA, PrivateUse1: clamp_min_Tensor_out MPS: clamp_min_Tensor_out_mps tags: pointwise @@ -3168,7 +3167,7 @@ device_check: NoCheck device_guard: False dispatch: - CPU, CUDA, MPS: isnan + CPU, CUDA, MPS, PrivateUse1: isnan SparseCPU, SparseCUDA: isnan_sparse SparseCsrCPU, SparseCsrCUDA: isnan_sparse_csr autogen: isnan.out @@ -4121,7 +4120,6 @@ dispatch: CPU: mm_out_cpu CUDA: mm_out_cuda - PrivateUse1: mm_out_zoom MPS: mm_out_mps SparseCPU, SparseCUDA: _sparse_mm_out SparseCsrCPU, SparseCsrCUDA: _sparse_csr_mm_out @@ -6463,13 +6461,13 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: - CPU, CUDA, MPS: where + CPU, CUDA, MPS, PrivateUse1: where tags: [core, pointwise] - func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA, MPS: where_self_out + CPU, CUDA, MPS, PrivateUse1: where_self_out - func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor variants: function @@ -7874,7 +7872,7 @@ device_check: NoCheck device_guard: False dispatch: - CPU, CUDA, Meta, MPS: set_ + CPU, CUDA, Meta, MPS, PrivateUse1: set_ autogen: set.source_Storage, set.source_Storage_out tags: inplace_view @@ -7905,7 +7903,7 @@ device_check: NoCheck device_guard: False dispatch: - CPU, CUDA, Meta, MPS: set_tensor_ + CPU, CUDA, Meta, MPS, PrivateUse1: set_tensor_ autogen: set.source_Tensor, set.source_Tensor_out tags: inplace_view @@ -8663,7 +8661,7 @@ variants: method tags: nondeterministic_seeded dispatch: - CPU, CUDA: random_ + CPU, CUDA, PrivateUse1: random_ Meta: random_meta_ MPS: random_mps_ autogen: random.from, random.from_out @@ -8673,7 +8671,7 @@ tags: nondeterministic_seeded variants: method dispatch: - CPU, CUDA: random_ + CPU, CUDA, PrivateUse1: random_ Meta: random_meta_ MPS: random_mps_ autogen: random.to, random.to_out @@ -8683,7 +8681,7 @@ tags: nondeterministic_seeded variants: method dispatch: - CPU, CUDA: random_ + CPU, CUDA, PrivateUse1: random_ MPS: random_mps_ Meta: random_meta_ autogen: random, random.out @@ -8693,7 +8691,7 @@ tags: nondeterministic_seeded variants: method dispatch: - CPU, CUDA: uniform_ + CPU, CUDA, PrivateUse1: uniform_ MPS: uniform_mps_ Meta: uniform_meta_ autogen: uniform, uniform.out @@ -13077,7 +13075,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: isposinf_out + CPU, CUDA, PrivateUse1: isposinf_out SparseCPU, SparseCUDA: isposinf_sparse_out SparseCsrCPU, SparseCsrCUDA: isposinf_sparse_csr_out tags: pointwise @@ -13094,7 +13092,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: isneginf_out + CPU, CUDA, PrivateUse1: isneginf_out SparseCPU, SparseCUDA: isneginf_sparse_out SparseCsrCPU, SparseCsrCUDA: isneginf_sparse_csr_out tags: pointwise diff --git a/aten/src/ATen/native/zoom/Bmm.cpp b/aten/src/ATen/native/zoom/Bmm.cpp deleted file mode 100644 index 53e87a7eb3913e..00000000000000 --- a/aten/src/ATen/native/zoom/Bmm.cpp +++ /dev/null @@ -1,122 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef AT_PER_OPERATOR_HEADERS -#include -#include -#else -#include -#include -#endif - - -namespace at::native { - // Forward decl, defined in HIPbmm.cu - template - void batched_matmul(const T* A, const T* B, T* C, int M, int N, int K, int batch_size); - - const Tensor& bmm_out_hip_impl(const Tensor& result, const Tensor& self, const Tensor& batch1, const Tensor& batch2) { - // handle pathological cases - if (result.numel() == 0) { - return result; - } else if (batch1.size(2) == 0) { - return result.zero_(); - } - TORCH_CHECK(batch1.sizes()[2] == batch2.sizes()[1], "batch1 dim 2 must match batch2 dim 1"); - - c10::MaybeOwned result_ = c10::MaybeOwned::borrowed(result); - IntArrayRef result_strides = result.strides(); - IntArrayRef result_sizes = result.sizes(); - - int m = batch1.sizes()[1]; - int n = batch1.sizes()[2]; - int k = batch2.sizes()[2]; - int num_batches = result_->sizes()[0]; - - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "bmm_hip", [&] { - const scalar_t* batch1_ptr = batch1.const_data_ptr(); - const scalar_t* batch2_ptr = batch2.const_data_ptr(); - scalar_t* result_ptr = result_->mutable_data_ptr(); - - batched_matmul(batch1_ptr, 
batch2_ptr, result_ptr, m, n, k, num_batches); - }); - if (!result.is_same(*result_)) { - result.copy_(*result_); - } - return result; - - } - - TORCH_IMPL_FUNC(bmm_out_zoom)(const Tensor& batch1, const Tensor& batch2, const Tensor &result) - { - NoNamesGuard guard; - bmm_out_hip_impl(result, result, batch1, batch2); - } - - Tensor& mm_out_hip_impl(Tensor& result, const Tensor& mat1, const Tensor& mat2) { - // Make sure to keep addmm_hip below in sync with this code; it - // preflights a check to try to avoid actually needing to call - // expand(). - TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); - TORCH_CHECK( - mat1.dtype() == mat2.dtype(), - "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() - ) - - TensorArg targs[]{{result, "out", 0}, {mat1, "mat1", 1}, {mat2, "mat2", 2}}; - checkAllSameGPU(__func__, targs); - - IntArrayRef mat1_sizes = mat1.sizes(); - IntArrayRef mat2_sizes = mat2.sizes(); - at::ScalarType scalar_type = mat1.scalar_type(); - TORCH_CHECK(result.dim() == 2, "tensors must be 2-D"); - TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0"); - - // resize result tensor - at::native::resize_output(result, {mat1_sizes[0], mat2_sizes[1]}); - IntArrayRef result_sizes = result.sizes(); - if ((result_sizes[0] == 0) || (result_sizes[1] == 0)) { - return result; - } - - if (mat1.numel() == 0) { - // By definition, values in self should be ignored. nans and infs - // should not propagate - return result.zero_(); - } - - int m = mat1_sizes[0]; - int n = mat1_sizes[1]; - int k = mat2_sizes[1]; - - // TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!result.is_conj()); - - AT_DISPATCH_FLOATING_TYPES_AND2( - at::ScalarType::Half, - at::ScalarType::BFloat16, - scalar_type, - "mm_zoom", - [&] { - const scalar_t* mat1_ptr = mat1.const_data_ptr(); - const scalar_t* mat2_ptr = mat2.const_data_ptr(); - scalar_t* result_ptr = result.mutable_data_ptr(); - batched_matmul(mat1_ptr, mat2_ptr, result_ptr, m, n, k, 1); - }); - - return result; - } - - TORCH_IMPL_FUNC(mm_out_zoom)(const Tensor& self, const Tensor& mat2, const Tensor& result) - { - mm_out_hip_impl(const_cast(result), self, mat2); - } - -} // at::native - - diff --git a/aten/src/ATen/native/zoom/DistributionRandomKernel.cu b/aten/src/ATen/native/zoom/DistributionRandomKernel.cu new file mode 100644 index 00000000000000..7e8aa20d652bae --- /dev/null +++ b/aten/src/ATen/native/zoom/DistributionRandomKernel.cu @@ -0,0 +1,27 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include + +namespace at::native { + +void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional gen_) { + auto gen = get_generator_or_default(gen_, zoom::detail::getDefaultZoomGenerator()); + at::native::templates::zoom::random_from_to_kernel(iter, range, base, gen); +} + +void random_full_64_bits_range_kernel(TensorIteratorBase& iter, std::optional gen_) { + auto gen = get_generator_or_default(gen_, zoom::detail::getDefaultZoomGenerator()); + at::native::templates::zoom::random_full_64_bits_range_kernel(iter, gen); +} + +void random_kernel(TensorIteratorBase& iter, std::optional gen_) { + auto gen = get_generator_or_default(gen_, zoom::detail::getDefaultZoomGenerator()); + at::native::templates::zoom::random_kernel(iter, gen); +} + +REGISTER_PRIVATEUSE1_DISPATCH(random_from_to_stub, &random_from_to_kernel); +REGISTER_PRIVATEUSE1_DISPATCH(random_stub, &random_kernel); +REGISTER_PRIVATEUSE1_DISPATCH(random_full_64_bits_range_stub, 
&random_full_64_bits_range_kernel); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/DistributionUniform.cu b/aten/src/ATen/native/zoom/DistributionUniform.cu new file mode 100644 index 00000000000000..25ed5e7b8b1148 --- /dev/null +++ b/aten/src/ATen/native/zoom/DistributionUniform.cu @@ -0,0 +1,15 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include + +namespace at::native { + +void uniform_kernel(TensorIteratorBase& iter, double from, double to, std::optional gen) { + auto generator = get_generator_or_default(gen, zoom::detail::getDefaultZoomGenerator()); + templates::zoom::uniform_kernel(iter, from, to, generator); +} + +REGISTER_PRIVATEUSE1_DISPATCH(uniform_stub, &uniform_kernel); + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/HIPbmm.cu b/aten/src/ATen/native/zoom/HIPbmm.cu deleted file mode 100644 index a77a31efaf1af6..00000000000000 --- a/aten/src/ATen/native/zoom/HIPbmm.cu +++ /dev/null @@ -1,132 +0,0 @@ -#include -#include -#include -#include -#include - -namespace at::native { - - int num_threads() { - return 32; - } - - // Helper function to convert hip_bfloat16 to float - __device__ float bfloat16_to_float(hip_bfloat16 a) { - union { - uint32_t int32; - float float32; - } u = {uint32_t(a.data) << 16}; - return u.float32; - } - - // Helper function to convert float to hip_bfloat16 - __device__ hip_bfloat16 float_to_bfloat16(float a) { - union { - float float32; - uint32_t int32; - } u = {a}; - hip_bfloat16 b; - b.data = uint16_t(u.int32 >> 16); - return b; - } - - template - __device__ float convert_to_float(T a) { - return a; - } - - template <> - __device__ float convert_to_float(hip_bfloat16 a) { - return bfloat16_to_float(a); - } - - template <> - __device__ float convert_to_float<__half>( __half a) { - return __half2float(a); - } - - template - __device__ T convert_from_float(float a) { - return static_cast(a); - } - - template <> - __device__ hip_bfloat16 convert_from_float(float a) { - return float_to_bfloat16(a); - } - - template <> - __device__ __half convert_from_float<__half>(float a) { - return __float2half(a); - } - - - template - __global__ void batched_matmul_kernel(const T* A, const T* B, T* C, - int M, int N, int K, int batch_size) { - int row = blockIdx.y * blockDim.y + threadIdx.y; - int col = blockIdx.x * blockDim.x + threadIdx.x; - int batch = blockIdx.z; - - if (row < M && col < K && batch < batch_size) { - float sum = 0.0f; - for (int n = 0; n < N; ++n) { - sum += convert_to_float(A[batch * M * N + row * N + n]) * - convert_to_float(B[batch * N * K + n * K + col]); - } - C[batch * M * K + row * K + col] = convert_from_float(sum); - } - } - - template - void batched_matmul(const T* A, const T* B, T* C, - int M, int N, int K, int batch_size) { - dim3 threadsPerBlock(num_threads(), num_threads()); - dim3 numBlocks((K + threadsPerBlock.x - 1) / threadsPerBlock.x, - (M + threadsPerBlock.y - 1) / threadsPerBlock.y, - batch_size); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(batched_matmul_kernel), numBlocks, threadsPerBlock, 0, 0, - A, B, C, M, N, K, batch_size); - C10_ZOOM_KERNEL_LAUNCH_CHECK(); - } - - // Specialization for at::Half - template <> - void batched_matmul(const at::Half* A, const at::Half* B, at::Half* C, - int M, int N, int K, int batch_size) { - dim3 threadsPerBlock(num_threads(), num_threads()); - dim3 numBlocks((K + threadsPerBlock.x - 1) / threadsPerBlock.x, - (M + threadsPerBlock.y - 1) / threadsPerBlock.y, - batch_size); - - 
hipLaunchKernelGGL(HIP_KERNEL_NAME(batched_matmul_kernel<__half>), numBlocks, threadsPerBlock, 0, 0, - reinterpret_cast(A), - reinterpret_cast(B), - reinterpret_cast<__half*>(C), - M, N, K, batch_size); - C10_ZOOM_KERNEL_LAUNCH_CHECK(); - } - - // Specialization for at::BFloat16 - template <> - void batched_matmul(const at::BFloat16* A, const at::BFloat16* B, at::BFloat16* C, - int M, int N, int K, int batch_size) { - dim3 threadsPerBlock(num_threads(), num_threads()); - dim3 numBlocks((K + threadsPerBlock.x - 1) / threadsPerBlock.x, - (M + threadsPerBlock.y - 1) / threadsPerBlock.y, - batch_size); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(batched_matmul_kernel), numBlocks, threadsPerBlock, 0, 0, - reinterpret_cast(A), - reinterpret_cast(B), - reinterpret_cast(C), - M, N, K, batch_size); - C10_ZOOM_KERNEL_LAUNCH_CHECK(); - } - - // Explicit instantiations for supported types - template void batched_matmul(const float*, const float*, float*, int, int, int, int); - template void batched_matmul(const double*, const double*, double*, int, int, int, int); - -} // at::native \ No newline at end of file diff --git a/aten/src/ATen/native/zoom/TensorCompare.cu b/aten/src/ATen/native/zoom/TensorCompare.cu new file mode 100644 index 00000000000000..e92d058c9b7222 --- /dev/null +++ b/aten/src/ATen/native/zoom/TensorCompare.cu @@ -0,0 +1,133 @@ +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include + + +namespace at::native { + +namespace { + +void where_kernel_impl(TensorIterator &iter) { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(kComplexHalf, kHalf, kBFloat16, kBool, iter.dtype(), "where_zoom", [&] { + gpu_kernel( + iter, + [=] GPU_LAMBDA (bool cond_val, scalar_t self_val, scalar_t other_val) -> scalar_t { + return cond_val ? 
self_val : other_val; + }); + }); +} + +void isposinf_kernel_impl(TensorIteratorBase &iter) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.input_dtype(), "isposinf_zoom", [&]() { + gpu_kernel( + iter, + [] GPU_LAMBDA (scalar_t a) -> bool { return a == std::numeric_limits::infinity(); } + ); + }); +} + +void isneginf_kernel_impl(TensorIteratorBase &iter) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.input_dtype(), "isneginf_zoom", [&]() { + gpu_kernel( + iter, + [] GPU_LAMBDA (scalar_t a) -> bool { return a == -std::numeric_limits::infinity(); } + ); + }); +} + +void clamp_kernel_impl(TensorIteratorBase& iter) { + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "clamp_zoom", [&] { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t v, scalar_t lower, scalar_t upper) -> scalar_t { + // Propagate nan, which doesn't propagate automatically for ROCm + if (at::_isnan(v)) { + return v; + } if (at::_isnan(lower)) { + return lower; + } if (at::_isnan(upper)) { + return upper; + } else { + return ::min(::max(v, lower), upper); + } + }); + }); +} + +void inline launch_clamp_scalar(TensorIteratorBase& iter, Scalar lim0, Scalar lim1, at::native::detail::ClampLimits minmax){ + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "clamp_scalar_zoom", [&] { + using opmath_t = at::opmath_type; + auto lim0_val = lim0.to(); + auto lim1_val = lim1.to(); + + gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t v) -> scalar_t { + // Propagate nan, which doesn't propagate automatically for ROCm + if (_isnan(static_cast(v))) { + return v; + } else if (minmax==at::native::detail::ClampLimits::Min){ + return ::max(static_cast(v), lim0_val); + } else if (minmax==at::native::detail::ClampLimits::Max){ + return ::min(static_cast(v), lim0_val); + } else { + return ::min(::max(static_cast(v), lim0_val), lim1_val); + } + }); + }); +} + + +void clamp_scalar_kernel_impl(TensorIteratorBase& iter, const Scalar& min, const Scalar& max) { + launch_clamp_scalar(iter, min, max, at::native::detail::ClampLimits::MinMax); +} + +void clamp_min_scalar_kernel_impl(TensorIteratorBase& iter, Scalar min) { + launch_clamp_scalar(iter, min, min, at::native::detail::ClampLimits::Min); +} + +void clamp_max_scalar_kernel_impl(TensorIteratorBase& iter, Scalar max) { + launch_clamp_scalar(iter, max, max, at::native::detail::ClampLimits::Max); +} + +} // anonymous namespace + + +REGISTER_PRIVATEUSE1_DISPATCH(where_kernel, &where_kernel_impl); +REGISTER_PRIVATEUSE1_DISPATCH(isposinf_stub, &isposinf_kernel_impl); +REGISTER_PRIVATEUSE1_DISPATCH(isneginf_stub, &isneginf_kernel_impl); +REGISTER_PRIVATEUSE1_DISPATCH(clamp_stub, &clamp_kernel_impl); +REGISTER_PRIVATEUSE1_DISPATCH(clamp_scalar_stub, &clamp_scalar_kernel_impl); +REGISTER_PRIVATEUSE1_DISPATCH(clamp_min_scalar_stub, &clamp_min_scalar_kernel_impl); +REGISTER_PRIVATEUSE1_DISPATCH(clamp_max_scalar_stub, &clamp_max_scalar_kernel_impl); + +template +__global__ void _assert_async_zoom_kernel(const scalar_t* input) { + ZOOM_KERNEL_ASSERT(input[0] != 0); +} + +__global__ void _assert_async_zoom_kernel(const c10::complex* input) { + ZOOM_KERNEL_ASSERT(input[0] != c10::complex(0, 0)); +} +__global__ void _assert_async_zoom_kernel(const c10::complex* input) { + ZOOM_KERNEL_ASSERT(input[0] != c10::complex(0, 0)); +} + +void _assert_async_zoom(const Tensor& self_tensor) { + const TensorBase &self = get_tensor_base(self_tensor); + auto n = self.numel(); + TORCH_CHECK(n != 0, "Boolean value of Tensor with no 
values is ambiguous"); + TORCH_CHECK(n < 2, "Boolean value of Tensor with more than one value is ambiguous"); + auto stream = c10::zoom::getCurrentZoomStream(); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "_assert_async_zoom", [&] { + _assert_async_zoom_kernel<<<1, 1, 0, stream>>>(self.const_data_ptr()); + C10_ZOOM_KERNEL_LAUNCH_CHECK(); + }); +} + +// TODO (tmanlaibaatar) Ignore assert msg for now +void _assert_async_msg_zoom(const Tensor& self_tensor, c10::string_view assert_msg) { + _assert_async_zoom(self_tensor); +} + +} // namespace at::native \ No newline at end of file diff --git a/test/test_ops.py b/test/test_ops.py index 44f503ae9b6ed8..cd473ac92c4f4f 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -32,6 +32,7 @@ instantiate_device_type_tests, onlyCPU, onlyCUDA, + onlyCUDAAndZOOM, onlyNativeDeviceTypes, OpDTypes, ops, @@ -283,7 +284,7 @@ def test_numpy_ref(self, device, dtype, op): ) # Tests that the cpu and gpu results are consistent - @onlyCUDA + @onlyCUDAAndZOOM @suppress_warnings @slowTest @ops(_ops_and_refs_with_no_numpy_ref, dtypes=OpDTypes.any_common_cpu_cuda_one) diff --git a/torch/zoom/__init__.py b/torch/zoom/__init__.py index 7b5a757d08520c..debc3c917f96ae 100644 --- a/torch/zoom/__init__.py +++ b/torch/zoom/__init__.py @@ -44,7 +44,7 @@ def _maybe_exchange_device(device: int) -> int: return -1 raise RuntimeError("PyTorch was compiled without Zoom support") - +from .zoom_triton_mm import * _initialized = False _tls = threading.local() diff --git a/torch/zoom/zoom_triton_mm.py b/torch/zoom/zoom_triton_mm.py new file mode 100644 index 00000000000000..6967ed7f8c1a77 --- /dev/null +++ b/torch/zoom/zoom_triton_mm.py @@ -0,0 +1,182 @@ +import torch +import triton +import triton.language as tl +from torch.library import register_kernel +torch.utils.rename_privateuse1_backend('zoom') + +@triton.heuristics({ + 'BLOCK_SIZE_M': lambda args: 128, + 'BLOCK_SIZE_N': lambda args: 64, + 'BLOCK_SIZE_K': lambda args: 32, + 'GROUP_SIZE_M': lambda args: 32, + 'EVEN_K': lambda args: args['K'] % args['BLOCK_SIZE_K'] == 0, +}) +@triton.jit +def batched_matmul_kernel( + a_ptr, + b_ptr, + c_ptr, + B, + M, + N, + K, + stride_ab, + stride_am, + stride_ak, + stride_bb, + stride_bk, + stride_bn, + stride_cb, + stride_cm, + stride_cn, + a_scale_ptr, + b_scale_ptr, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + EVEN_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + APPLY_SCALE: tl.constexpr, + ACTIVATION: tl.constexpr, +): + """Kernel for computing the batched matmul C = A x B. 
+ A has shape (B, M, K), B has shape (B, K, N) and C has shape (B, M, N) + """ + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_batch = num_pid_m * num_pid_n + batch_id = pid // num_pid_in_batch + pid_in_batch = pid % num_pid_in_batch + + if GROUP_SIZE_M == 1: + pid_m = pid_in_batch // num_pid_n + pid_n = pid_in_batch % num_pid_n + else: + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid_in_batch // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid_in_batch % group_size_m) + pid_n = (pid_in_batch % num_pid_in_group) // group_size_m + + offs_k = tl.arange(0, BLOCK_SIZE_K) + offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M + offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + a_ptrs = a_ptr + batch_id * stride_ab + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) + b_ptrs = b_ptr + batch_id * stride_bb + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) + if APPLY_SCALE: + a_scale = tl.load(a_scale_ptr) + b_scale = tl.load(b_scale_ptr) + + acc_dtype = tl.float32 if c_ptr.type.element_ty != tl.int8 else tl.int32 + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=acc_dtype) + + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + if EVEN_K: + a = tl.load(a_ptrs) + b = tl.load(b_ptrs) + else: + a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) + b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + accumulator += tl.dot(a, b) + + a_ptrs += BLOCK_SIZE_K * stride_ak + b_ptrs += BLOCK_SIZE_K * stride_bk + + if APPLY_SCALE: + accumulator = accumulator * a_scale * b_scale + + if ACTIVATION == "leaky_relu": + accumulator = leaky_relu(accumulator) + c = accumulator.to(c_ptr.type.element_ty) + + offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = c_ptr + batch_id * stride_cb + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) + tl.store(c_ptrs, c, mask=c_mask) + +# Wrapper for batched gemm kernel +def batched_matmul(a, b, c, a_scale, b_scale, scale_a8_b8=False, activation=""): + assert a.shape[2] == b.shape[1], "Incompatible matrix dimensions!!!" + assert a.shape[0] == b.shape[0], "Incompatible batch dimensions!!!" + assert a.dtype == b.dtype, "Mixed dtype GEMMs are not supported!!!" + B, M, K = a.shape + _, K, N = b.shape + grid = lambda META: (B * triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']),) + batched_matmul_kernel[grid]( + a, + b, + c, + B, + M, + N, + K, + a.stride(0), + a.stride(1), + a.stride(2), + b.stride(0), + b.stride(1), + b.stride(2), + c.stride(0), + c.stride(1), + c.stride(2), + a_scale, + b_scale, + APPLY_SCALE=scale_a8_b8, + ACTIVATION=activation, + ) + +# Activation function. 
+@triton.jit +def leaky_relu(x): + x = x + 1 + return tl.where(x >= 0, x, 0.01 * x) + +name_to_torch_types = { + 'int8': torch.int8, + 'int32': torch.int32, + 'fp16': torch.float16, + 'fp32': torch.float32, + 'bf16': torch.bfloat16, + 'fp8e5': torch.float8_e5m2fnuz, + 'fp8e4': torch.float8_e4m3fnuz, +} + +dtype_max = { + dtype: (torch.finfo(dtype) if dtype.is_floating_point else torch.iinfo(dtype)).max + for dtype in [ + torch.float8_e5m2fnuz, + torch.float8_e4m3fnuz, + torch.int8, + ] +} + +def mm_out_zoom(self, mat2, out): + batched_matmul(self.unsqueeze(0), mat2.unsqueeze(0), out.unsqueeze(0), None, None, False) + +def bmm_out_zoom(self, mat2, out): + batched_matmul(self, mat2, out, None, None, False) + +@register_kernel("aten::mm.out", "zoom") +def mm_out(self, mat2, out): + mm_out_zoom(self, mat2, out) + +@register_kernel("aten::mm", "zoom") +def mm(self, mat2): + out = self.new_empty((self.size(0), mat2.size(1))) + mm_out_zoom(self, mat2, out) + return out + +@register_kernel("aten::bmm.out", "zoom") +def bmm_out(self, mat2, out): + bmm_out_zoom(self, mat2, out) + +@register_kernel("aten::bmm", "zoom") +def bmm(self, mat2): + out = self.new_empty((self.size(0), self.size(1), mat2.size(2))) + bmm_out_zoom(self, mat2, out) + return out + \ No newline at end of file From 74b12b7cc39027ad97365ca528c28b680ed51acb Mon Sep 17 00:00:00 2001 From: 123epsilon Date: Mon, 27 Jan 2025 05:02:18 +0000 Subject: [PATCH 09/23] add build and llama3 demo instructions --- BuildingZoom.md | 136 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 BuildingZoom.md diff --git a/BuildingZoom.md b/BuildingZoom.md new file mode 100644 index 00000000000000..66918e23162819 --- /dev/null +++ b/BuildingZoom.md @@ -0,0 +1,136 @@ +# Setup Python Env + +To start out, we just need to follow the normal procedure to build PyTorch from source. For convenience I've included these steps here: + +```bash +conda create -n nod-pytorch python==3.10 +conda activate nod-pytorch +conda install cmake ninja +pip install -r requirements.txt +export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} +python setup.py develop +``` + +# CMake Build + +Using the `USE_ZOOM` flag with CMake will enable building with HIP for ROCm without requiring any of the "HIPify" scripts in order to build. This will include HIP libraries and populate `torch.version.hip` appropriately. This flag is NOT yet entered into the `setup.py` script, so for now it needs to be added manually via `cmake` or `ccmake`. + +You'll need to set the `ROCM_PATH` and `HIP_ROOT_DIR` environment variables appropriately, by default on linux these should be `/opt/rocm/` and `/opt/rocm/hip` respectively. + +If you're running on Linux you can just use `build.sh` to build: +```bash +cd pytorch/ +source build.sh +``` + +Alternatively, if you want to manually setup your CMake build you can use the following commands: + +```bash +cd build/ +export PYTORCH_ROCM_ARCH=gfx90a +export ROCM_PATH=/opt/rocm +export HIP_ROOT_DIR=/opt/rocm/hip +cmake -DUSE_ZOOM=ON --build . 
--target install +``` + +# Running PyTorch with Zoom + +Programs using the zoom backend must be prefaced with this stub until we register a proper dispatch key in pytorch + +```python +import torch +import torch.zoom +torch.utils.rename_privateuse1_backend('zoom') +torch.utils.generate_methods_for_privateuse1_backend(unsupported_dtype=None) +``` + +# Installing Triton + +Since main Triton currently treats ROCm as if its masquerading as `torch.cuda`, we need a custom installation: + +```bash +git clone https://github.com/123epsilon/triton.git +cd triton/ +git checkout zoom +pip install pybind11 +pip install python/ +``` + +# Running LLama3 with Triton using LigerKernels and HuggingFace + +```bash +pip install liger-kernel +``` + +```python +# Run Llama 3 +import torch +from transformers import AutoTokenizer +from liger_kernel.transformers import AutoLigerKernelForCausalLM +from time import perf_counter as pf +torch.utils.rename_privateuse1_backend('zoom') + +# Set up the model and tokenizer +model_id = "meta-llama/Meta-Llama-3-8B" +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoLigerKernelForCausalLM.from_pretrained( + model_id, + torch_dtype=torch.bfloat16, + device_map="zoom" +) + +# Function to generate text +def generate_text(prompt, max_length=30): + inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + outputs = model.generate(**inputs, max_new_tokens=max_length) + return tokenizer.decode(outputs[0], skip_special_tokens=True) + +# Example usage +prompt = "Hey, how are you doing today?" +s = pf() +response = generate_text(prompt) +e = pf() +print(f"Prompt: {prompt}") +print(f"Response: {response}") + +print(f"{e-s} seconds") +``` + +```python +# Or run the instruct-tuned variant +import torch +import transformers +from liger_kernel.transformers import apply_liger_kernel_to_llama +torch.utils.rename_privateuse1_backend('zoom') + +apply_liger_kernel_to_llama() +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" + +pipeline = transformers.pipeline( + "text-generation", + model=model_id, + model_kwargs={"torch_dtype": torch.bfloat16}, + device_map="zoom", +) + +messages = [ + {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"}, + {"role": "user", "content": "Who are you?"}, +] + +terminators = [ + pipeline.tokenizer.eos_token_id, + pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>") +] + +outputs = pipeline( + messages, + max_new_tokens=30, + eos_token_id=terminators, + do_sample=True, + temperature=0.6, + top_p=0.9, +) +print(outputs[0]["generated_text"][-1]) + +``` \ No newline at end of file From e7b9919f40145fe41e83da73a3f3dc1b5a17dc3e Mon Sep 17 00:00:00 2001 From: 123epsilon Date: Tue, 28 Jan 2025 00:10:35 +0000 Subject: [PATCH 10/23] add range factories --- aten/src/ATen/native/native_functions.yaml | 4 + aten/src/ATen/native/zoom/RangeFactories.cu | 270 ++++++++++++++++++++ 2 files changed, 274 insertions(+) create mode 100644 aten/src/ATen/native/zoom/RangeFactories.cu diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 6271e79c453abf..5af124fc7703fc 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -797,6 +797,7 @@ dispatch: CPU, Meta: arange_out CUDA: arange_cuda_out + PrivateUse1: arange_zoom_out MPS: arange_mps_out cpp_no_default_args: ['step'] @@ -3431,6 +3432,7 @@ dispatch: CPU, Meta: linspace_out CUDA: linspace_cuda_out + PrivateUse1: linspace_zoom_out MPS: linspace_out_mps - func: 
linspace.Tensor_Tensor_out(Tensor start, Tensor end, int steps, *, Tensor(a!) out) -> Tensor(a!)
@@ -3647,6 +3649,7 @@
   dispatch:
     CPU, Meta: logspace_out
     CUDA: logspace_cuda_out
+    PrivateUse1: logspace_zoom_out
 
 - func: logspace.Tensor_Tensor_out(Tensor start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
   category_override: factory
@@ -4795,6 +4798,7 @@
   dispatch:
     CPU, Meta: range_out
     CUDA: range_cuda_out
+    PrivateUse1: range_zoom_out
     MPS: range_mps_out
   cpp_no_default_args: ['step']
diff --git a/aten/src/ATen/native/zoom/RangeFactories.cu b/aten/src/ATen/native/zoom/RangeFactories.cu
new file mode 100644
index 00000000000000..5f7417703ca601
--- /dev/null
+++ b/aten/src/ATen/native/zoom/RangeFactories.cu
@@ -0,0 +1,270 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include
+#include
+#else
+#include
+#include
+#include
+#include
+#include
+#endif
+
+#define GPU_LAMBDA __device__ __host__
+
+namespace {
+
+
+constexpr int num_threads() {
+  return 128;
+}
+
+constexpr int thread_work_size = 1;
+constexpr int block_work_size = thread_work_size * num_threads();
+
+template <typename index_t, typename func_t>
+C10_LAUNCH_BOUNDS_1(num_threads())
+__global__ void elementwise_kernel_with_index(index_t N, func_t f, typename function_traits<func_t>::result_type *data) {
+  #pragma unroll
+  for (int i = 0; i < thread_work_size; i++) {
+    index_t idx = block_work_size * blockIdx.x + num_threads() * i + threadIdx.x;
+    if (idx < N) {
+      data[idx] = f(idx);
+    }
+  }
+}
+
+template <typename func_t>
+void gpu_kernel_with_index(at::Tensor &output, func_t f) {
+  int64_t N = output.numel();
+  if (N == 0) {
+    return;
+  }
+  int64_t grid = (N + block_work_size - 1) / block_work_size;
+  auto stream = c10::zoom::getCurrentZoomStream();
+  using scalar_t = typename function_traits<func_t>::result_type;
+  if (N <= std::numeric_limits<int>::max()) {
+    elementwise_kernel_with_index<int><<<grid, num_threads(), 0, stream>>>(N, f, output.mutable_data_ptr<scalar_t>());
+    C10_ZOOM_KERNEL_LAUNCH_CHECK();
+  } else {
+    elementwise_kernel_with_index<int64_t><<<grid, num_threads(), 0, stream>>>(N, f, output.mutable_data_ptr<scalar_t>());
+    C10_ZOOM_KERNEL_LAUNCH_CHECK();
+  }
+}
+
+} // namespace
+
+namespace at::native {
+
+Tensor& linspace_zoom_out(const Scalar& start, const Scalar& end, int64_t steps, Tensor& result) {
+  TORCH_CHECK(steps >= 0, "number of steps must be non-negative");
+
+  if (result.numel() != steps) {
+    result.resize_({steps});
+  }
+  bool is_contiguous = result.is_contiguous();
+  Tensor r = !is_contiguous ? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT) : result;
+
+  if (steps == 0) {
+    // skip
+  } else if (steps == 1) {
+    r.fill_(start);
+  } else if (isIntegralType(r.scalar_type(), 0)) {
+    AT_DISPATCH_INTEGRAL_TYPES(r.scalar_type(), "linspace_zoom", [&]() {
+      scalar_t scalar_start = start.to<scalar_t>();
+      scalar_t scalar_end = end.to<scalar_t>();
+      // Cast `end` and `start` to `float`, since range can be larger than scalar_t for integral types
+      float step = (static_cast<float>(scalar_end) - static_cast<float>(scalar_start)) / (steps - 1);
+      const int64_t halfway = steps / 2;
+      gpu_kernel_with_index(r, [scalar_start, scalar_end, steps, step, halfway]GPU_LAMBDA(int64_t ind) -> scalar_t {
+        if (ind < halfway) {
+          return scalar_start + (step * ind);
+        }
+
+        return scalar_end - step * (steps - ind - 1);
+      });
+    });
+  } else {
+    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, r.scalar_type(), "linspace_zoom", [&]() {
+      scalar_t scalar_start = start.to<scalar_t>();
+      scalar_t scalar_end = end.to<scalar_t>();
+      scalar_t step = (scalar_end - scalar_start) / static_cast<scalar_t>(steps - 1);
+      const int64_t halfway = steps / 2;
+      gpu_kernel_with_index(r, [scalar_start, scalar_end, steps, step, halfway]GPU_LAMBDA(int64_t ind) -> scalar_t {
+        if (ind < halfway) {
+          return scalar_start + (step * ind);
+        }
+
+        return scalar_end - step * (steps - ind - 1);
+      });
+    });
+  }
+
+  if (!is_contiguous) {
+    result.copy_(r);
+  }
+
+  return result;
+}
+
+Tensor& logspace_zoom_out(const Scalar& start, const Scalar& end, int64_t steps, double base, Tensor& result) {
+  TORCH_CHECK(steps >= 0, "number of steps must be non-negative");
+
+  if (result.numel() != steps) {
+    result.resize_({steps});
+  }
+  bool is_contiguous = result.is_contiguous();
+  Tensor r = !is_contiguous ? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT) : result;
+
+  if (steps == 0) {
+    // skip
+  } else if (steps == 1) {
+    if (isComplexType(r.scalar_type())){
+      r.fill_(std::pow(base, start.to<c10::complex<double>>()));
+    } else {
+      r.fill_(std::pow(base, start.to<double>()));
+    }
+  } else if (isIntegralType(r.scalar_type(), 0)) {
+    AT_DISPATCH_INTEGRAL_TYPES(r.scalar_type(), "logspace_zoom", [&]() {
+      float scalar_base = static_cast<float>(base); // Use float to avoid promotion to double
+      scalar_t scalar_start = start.to<scalar_t>();
+      scalar_t scalar_end = end.to<scalar_t>();
+      float step = static_cast<float>(scalar_end - scalar_start) / (steps - 1);
+      const int64_t halfway = steps / 2;
+      gpu_kernel_with_index(r, [scalar_start, scalar_end, scalar_base, steps, step, halfway]GPU_LAMBDA(int64_t ind) -> scalar_t {
+        if (ind < halfway) {
+          return std::pow(scalar_base, scalar_start + step * ind);
+        }
+        return std::pow(scalar_base, scalar_end - step * (steps - ind - 1));
+      });
+    });
+  } else {
+    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, r.scalar_type(), "logspace_zoom", [&]() {
+      scalar_t scalar_base = static_cast<scalar_t>(base);
+      scalar_t scalar_start = start.to<scalar_t>();
+      scalar_t scalar_end = end.to<scalar_t>();
+      scalar_t step = (scalar_end - scalar_start) / static_cast<scalar_t>(steps - 1);
+      const int64_t halfway = steps / 2;
+      gpu_kernel_with_index(r, [scalar_start, scalar_end, scalar_base, steps, step, halfway]GPU_LAMBDA(int64_t ind) -> scalar_t {
+        if (ind < halfway) {
+          return std::pow(scalar_base, scalar_start + step * ind);
+        }
+        return std::pow(scalar_base, scalar_end - step * (steps - ind - 1));
+      });
+    });
+  }
+
+  if (!is_contiguous) {
+    result.copy_(r);
+  }
+
+  return result;
+}
+
+Tensor& range_zoom_out(const Scalar& start, const Scalar& end, const Scalar& step, Tensor& result) {
+  AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, result.scalar_type(), "range_zoom", [&]() {
+    using accscalar_t = at::acc_type<scalar_t, true>;
+    auto xstart = start.to<accscalar_t>();
+    auto xend = end.to<accscalar_t>();
+    auto xstep = step.to<accscalar_t>();
+
+    TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
+    TORCH_CHECK(std::isfinite(static_cast<double>(xstart)) &&
+                std::isfinite(static_cast<double>(xend)),
+                "unsupported range: ", xstart, " -> ", xend);
+    TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)),
+                "upper bound and larger bound inconsistent with step sign");
+    int64_t size = static_cast<int64_t>(((xend - xstart) / xstep) + 1);
+
+    if (result.numel() != size) {
+      result.resize_({size});
+    }
+    bool is_contiguous = result.is_contiguous();
+    Tensor r = !is_contiguous ? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT) : result;
+
+    gpu_kernel_with_index(r, [xstart, xstep]GPU_LAMBDA(int64_t ind) -> scalar_t {
+      accscalar_t inc = xstep * static_cast<accscalar_t>(ind);
+      accscalar_t val = xstart + inc;
+      return static_cast<scalar_t>(val);
+    });
+
+    if(!is_contiguous) {
+      result.copy_(r);
+    }
+
+  });
+
+  return result;
+}
+
+Tensor& arange_zoom_out(const Scalar& start, const Scalar& end, const Scalar& step, Tensor& result) {
+  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, result.scalar_type(), "arange_zoom", [&]() {
+    using accscalar_t = at::acc_type<scalar_t, true>;
+    auto xstart = start.to<accscalar_t>();
+    auto xend = end.to<accscalar_t>();
+    auto xstep = step.to<accscalar_t>();
+
+    TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
+    TORCH_CHECK(std::isfinite(static_cast<double>(xstart)) &&
+                std::isfinite(static_cast<double>(xend)),
+                "unsupported range: ", xstart, " -> ", xend);
+    TORCH_CHECK(((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)),
+                "upper bound and larger bound inconsistent with step sign");
+
+    // we use double precision for (start - end) / step
+    // to compute size_d for consistency across devices.
+    // The problem with using accscalar_t is that accscalar_t might be float32 on gpu for a float32 scalar_t,
+    // but double on cpu for the same,
+    // and the effective output size starts differing on CPU vs GPU because of precision issues, which
+    // we dont want.
+    // the corner-case we do want to take into account is int64_t, which has higher precision than double
+    double size_d;
+    if constexpr (std::is_same_v<scalar_t, int64_t>) {
+      int64_t sgn = (xstep > 0) - (xstep < 0);
+      size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
+    } else {
+      size_d = std::ceil(static_cast<double>(end.to<double>() - start.to<double>())
+                         / step.to<double>());
+    }
+
+    TORCH_CHECK(size_d >= 0 && size_d <= static_cast<double>(std::numeric_limits<int64_t>::max()),
+                "invalid size, possible overflow?");
+    int64_t size = static_cast<int64_t>(size_d);
+    int64_t numel = result.numel();
+
+    if (numel != size) {
+      if(numel > 0){
+        TORCH_WARN("The number of elements in the out tensor of shape ", result.sizes(),
+                   " is ", numel, " which does not match the computed number of elements ", size,
+                   ". Note that this may occur as a result of rounding error. "
+                   "The out tensor will be resized to a tensor of shape (", size, ",).");
+      }
+      result.resize_({size});
+    }
+    bool is_contiguous = result.is_contiguous();
+    Tensor r = !is_contiguous ? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT) : result;
+
+    gpu_kernel_with_index(r, [xstart, xstep]GPU_LAMBDA(int64_t ind) -> scalar_t {
+      accscalar_t inc = xstep * static_cast<accscalar_t>(ind);
+      accscalar_t val = xstart + inc;
+      return static_cast<scalar_t>(val);
+    });
+
+    if(!is_contiguous) {
+      result.copy_(r);
+    }
+  });
+
+  return result;
+}
+
+} // namespace at::native
\ No newline at end of file

From 1eae71dac3ce6460dece13e121dc36824e79dbcd Mon Sep 17 00:00:00 2001
From: 123epsilon
Date: Tue, 28 Jan 2025 00:42:24 +0000
Subject: [PATCH 11/23] adjust find_package calls for zoom in cmake

---
 cmake/public/LoadHIP.cmake | 45 ++++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
index 107a6fbc15dac5..b7ab4c6d3d5aeb 100644
--- a/cmake/public/LoadHIP.cmake
+++ b/cmake/public/LoadHIP.cmake
@@ -151,23 +151,34 @@ if(HIP_FOUND)
   set(rocthrust_DIR ${ROCM_PATH}/lib/cmake/rocthrust)
   set(hipsolver_DIR ${ROCM_PATH}/lib/cmake/hipsolver)
-
-  find_package_and_print_version(hip REQUIRED)
-  find_package_and_print_version(hsa-runtime64 REQUIRED)
-  find_package_and_print_version(amd_comgr REQUIRED)
-  find_package_and_print_version(rocrand REQUIRED)
-  find_package_and_print_version(hiprand REQUIRED)
-  find_package_and_print_version(rocblas REQUIRED)
-  find_package_and_print_version(hipblas REQUIRED)
-  find_package_and_print_version(hipblaslt REQUIRED)
-  find_package_and_print_version(miopen REQUIRED)
-  find_package_and_print_version(hipfft REQUIRED)
-  find_package_and_print_version(hipsparse REQUIRED)
-  find_package_and_print_version(rccl)
-  find_package_and_print_version(rocprim REQUIRED)
-  find_package_and_print_version(hipcub REQUIRED)
-  find_package_and_print_version(rocthrust REQUIRED)
-  find_package_and_print_version(hipsolver REQUIRED)
+  if(USE_ROCM)
+    find_package_and_print_version(hip REQUIRED)
+    find_package_and_print_version(hsa-runtime64 REQUIRED)
+    find_package_and_print_version(amd_comgr REQUIRED)
+    find_package_and_print_version(rocrand REQUIRED)
+    find_package_and_print_version(hiprand REQUIRED)
+    find_package_and_print_version(rocblas REQUIRED)
+    find_package_and_print_version(hipblas REQUIRED)
+    find_package_and_print_version(hipblaslt REQUIRED)
+    find_package_and_print_version(miopen REQUIRED)
+    find_package_and_print_version(hipfft REQUIRED)
+    find_package_and_print_version(hipsparse REQUIRED)
+    find_package_and_print_version(rccl)
+    find_package_and_print_version(rocprim REQUIRED)
+    find_package_and_print_version(hipcub REQUIRED)
+    find_package_and_print_version(rocthrust REQUIRED)
+    find_package_and_print_version(hipsolver REQUIRED)
+  else() # USE_ZOOM
+    find_package_and_print_version(hip REQUIRED)
+    find_package_and_print_version(hsa-runtime64 REQUIRED)
+    find_package_and_print_version(amd_comgr REQUIRED)
+    find_package_and_print_version(rocrand REQUIRED)
+    find_package_and_print_version(hiprand REQUIRED)
+    find_package_and_print_version(miopen REQUIRED)
+    find_package_and_print_version(rocprim REQUIRED)
+    find_package_and_print_version(hipcub REQUIRED)
+    find_package_and_print_version(rocthrust REQUIRED)
+  endif()
 
   find_library(PYTORCH_HIP_LIBRARIES amdhip64 HINTS ${ROCM_PATH}/lib)

From 2ca34c835a82d7a06cc36112f7555f1bfbfcdf35 Mon Sep 17 00:00:00 2001
From: 123epsilon
Date: Tue, 4 Feb 2025 02:31:49 +0000
Subject: [PATCH 12/23] add sudo to build whl

---
 .github/workflows/build_zoom_backend.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git
a/.github/workflows/build_zoom_backend.yml b/.github/workflows/build_zoom_backend.yml index aa7053cafe8379..724b3f0e23e960 100644 --- a/.github/workflows/build_zoom_backend.yml +++ b/.github/workflows/build_zoom_backend.yml @@ -79,7 +79,7 @@ jobs: python -m venv venv source venv/bin/activate pip install -r requirements.txt - ./build.sh + sudo ./build.sh - name: "Audit" id: audit From 5d099e92698938775da93219124e553c256113b9 Mon Sep 17 00:00:00 2001 From: 123epsilon Date: Tue, 4 Feb 2025 22:27:14 +0000 Subject: [PATCH 13/23] chmod build script --- .github/workflows/build_zoom_backend.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_zoom_backend.yml b/.github/workflows/build_zoom_backend.yml index 724b3f0e23e960..3afa5e97be74ee 100644 --- a/.github/workflows/build_zoom_backend.yml +++ b/.github/workflows/build_zoom_backend.yml @@ -79,7 +79,8 @@ jobs: python -m venv venv source venv/bin/activate pip install -r requirements.txt - sudo ./build.sh + chmod +x ./build.sh + ./build.sh - name: "Audit" id: audit From 51f643283286ba60ee0269734391d98959844689 Mon Sep 17 00:00:00 2001 From: 123epsilon Date: Wed, 19 Feb 2025 00:49:36 +0000 Subject: [PATCH 14/23] CI checkout recursive --- .github/workflows/build_zoom_backend.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_zoom_backend.yml b/.github/workflows/build_zoom_backend.yml index 3afa5e97be74ee..5e8e27a7926805 100644 --- a/.github/workflows/build_zoom_backend.yml +++ b/.github/workflows/build_zoom_backend.yml @@ -58,7 +58,7 @@ jobs: - name: "Check out repository" uses: actions/checkout@v4.2.2 with: - submodules: true + submodules: recursive - name: Enable cache uses: actions/cache/restore@v3 From 6c373c534c5aa00c8ffb70bf9e77030ee68d5d1d Mon Sep 17 00:00:00 2001 From: 123epsilon Date: Wed, 19 Feb 2025 02:28:26 +0000 Subject: [PATCH 15/23] clang-19 compat in intrusive_ptr --- c10/util/intrusive_ptr.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/c10/util/intrusive_ptr.h b/c10/util/intrusive_ptr.h index 035f22e3c1867b..8f50e91d8295cd 100644 --- a/c10/util/intrusive_ptr.h +++ b/c10/util/intrusive_ptr.h @@ -379,7 +379,7 @@ class intrusive_ptr final { intrusive_ptr& operator=(intrusive_ptr&& rhs) & noexcept { // NOLINTNEXTLINE(*assign*) - return operator= (std::move(rhs)); + return this->template operator= (std::move(rhs)); } template @@ -397,7 +397,7 @@ class intrusive_ptr final { // NOLINTNEXTLINE(bugprone-unhandled-self-assignment) intrusive_ptr& operator=(const intrusive_ptr& rhs) & noexcept { // NOLINTNEXTLINE(*assign-operator, *assignment-signature) - return operator= (rhs); + return this->template operator= (rhs); } template @@ -769,7 +769,7 @@ class weak_intrusive_ptr final { weak_intrusive_ptr& operator=(weak_intrusive_ptr&& rhs) & noexcept { // NOLINTNEXTLINE(*assign*) - return operator= (std::move(rhs)); + return this->template operator= (std::move(rhs)); } template @@ -788,7 +788,7 @@ class weak_intrusive_ptr final { return *this; } // NOLINTNEXTLINE(*assign*) - return operator= (rhs); + return this->template operator= (rhs); } weak_intrusive_ptr& operator=( From 12f62e26c1008bc5adb74ee470d8a2f785173cff Mon Sep 17 00:00:00 2001 From: 123epsilon Date: Wed, 19 Feb 2025 02:47:30 +0000 Subject: [PATCH 16/23] add venv to audit build step --- .github/workflows/build_zoom_backend.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_zoom_backend.yml b/.github/workflows/build_zoom_backend.yml index 
5e8e27a7926805..8550d088aeb633 100644 --- a/.github/workflows/build_zoom_backend.yml +++ b/.github/workflows/build_zoom_backend.yml @@ -87,6 +87,7 @@ jobs: run: | sudo apt install patchelf + python -m venv venv source venv/bin/activate pip install auditwheel auditwheel repair -w dist --plat manylinux_2_39_x86_64 dist/torch* From a20c49366edd23bc8cb0c1ee1a63e68866f6ef67 Mon Sep 17 00:00:00 2001 From: 123epsilon Date: Tue, 18 Mar 2025 21:03:20 +0000 Subject: [PATCH 17/23] add more kernels for autograd examples --- aten/src/ATen/native/Activation.cpp | 23 + aten/src/ATen/native/native_functions.yaml | 224 +++++- aten/src/ATen/native/zoom/Activation.cpp | 108 +++ aten/src/ATen/native/zoom/Activation.h | 20 + .../ATen/native/zoom/ActivationEluKernel.cu | 86 +++ .../ATen/native/zoom/ActivationGeluKernel.cu | 88 +++ .../ATen/native/zoom/ActivationGluKernel.cu | 141 ++++ .../native/zoom/ActivationHardshrinkKernel.cu | 39 + .../zoom/ActivationHardsigmoidKernel.cu | 74 ++ .../native/zoom/ActivationHardswishKernel.cu | 63 ++ .../native/zoom/ActivationHardtanhKernel.cu | 45 ++ .../native/zoom/ActivationLeakyReluKernel.cu | 62 ++ .../native/zoom/ActivationLogSigmoidKernel.cu | 64 ++ .../ATen/native/zoom/ActivationMishKernel.cu | 64 ++ .../ATen/native/zoom/ActivationPreluKernel.cu | 48 ++ .../ATen/native/zoom/ActivationSiluKernel.cu | 60 ++ .../native/zoom/ActivationSoftplusKernel.cu | 74 ++ .../native/zoom/ActivationSoftshrinkKernel.cu | 58 ++ .../native/zoom/ActivationThresholdKernel.cu | 52 ++ .../ATen/native/zoom/ForeachBinaryOpList.cu | 295 ++++++++ .../ATen/native/zoom/ForeachBinaryOpScalar.cu | 247 +++++++ .../native/zoom/ForeachBinaryOpScalarList.cu | 241 +++++++ .../zoom/ForeachBinaryOpScalarTensor.cu | 206 ++++++ aten/src/ATen/native/zoom/ForeachFunctors.cuh | 681 ++++++++++++++++++ .../native/zoom/ForeachMinMaxFunctors.cuh | 22 + .../ATen/native/zoom/ForeachPointwiseOp.cu | 272 +++++++ aten/src/ATen/native/zoom/ForeachReduceOp.cu | 352 +++++++++ aten/src/ATen/native/zoom/ForeachTernaryOp.cu | 159 ++++ aten/src/ATen/native/zoom/ForeachUnaryOp.cu | 408 +++++++++++ aten/src/ATen/native/zoom/Loss.cu | 627 ++++++++++++++++ .../src/ATen/native/zoom/MultiTensorApply.cuh | 379 ++++++++++ aten/src/ATen/native/zoom/NLLLoss2d.cu | 537 ++++++++++++++ aten/src/ATen/native/zoom/Pow.cuh | 58 ++ aten/src/ATen/native/zoom/RecordStream.cu | 17 + aten/src/ATen/native/zoom/RreluWithNoise.cu | 195 +++++ 35 files changed, 6057 insertions(+), 32 deletions(-) create mode 100644 aten/src/ATen/native/zoom/Activation.cpp create mode 100644 aten/src/ATen/native/zoom/Activation.h create mode 100644 aten/src/ATen/native/zoom/ActivationEluKernel.cu create mode 100644 aten/src/ATen/native/zoom/ActivationGeluKernel.cu create mode 100644 aten/src/ATen/native/zoom/ActivationGluKernel.cu create mode 100644 aten/src/ATen/native/zoom/ActivationHardshrinkKernel.cu create mode 100644 aten/src/ATen/native/zoom/ActivationHardsigmoidKernel.cu create mode 100644 aten/src/ATen/native/zoom/ActivationHardswishKernel.cu create mode 100644 aten/src/ATen/native/zoom/ActivationHardtanhKernel.cu create mode 100644 aten/src/ATen/native/zoom/ActivationLeakyReluKernel.cu create mode 100644 aten/src/ATen/native/zoom/ActivationLogSigmoidKernel.cu create mode 100644 aten/src/ATen/native/zoom/ActivationMishKernel.cu create mode 100644 aten/src/ATen/native/zoom/ActivationPreluKernel.cu create mode 100644 aten/src/ATen/native/zoom/ActivationSiluKernel.cu create mode 100644 aten/src/ATen/native/zoom/ActivationSoftplusKernel.cu create mode 100644 
aten/src/ATen/native/zoom/ActivationSoftshrinkKernel.cu create mode 100644 aten/src/ATen/native/zoom/ActivationThresholdKernel.cu create mode 100644 aten/src/ATen/native/zoom/ForeachBinaryOpList.cu create mode 100644 aten/src/ATen/native/zoom/ForeachBinaryOpScalar.cu create mode 100644 aten/src/ATen/native/zoom/ForeachBinaryOpScalarList.cu create mode 100644 aten/src/ATen/native/zoom/ForeachBinaryOpScalarTensor.cu create mode 100644 aten/src/ATen/native/zoom/ForeachFunctors.cuh create mode 100644 aten/src/ATen/native/zoom/ForeachMinMaxFunctors.cuh create mode 100644 aten/src/ATen/native/zoom/ForeachPointwiseOp.cu create mode 100644 aten/src/ATen/native/zoom/ForeachReduceOp.cu create mode 100644 aten/src/ATen/native/zoom/ForeachTernaryOp.cu create mode 100644 aten/src/ATen/native/zoom/ForeachUnaryOp.cu create mode 100644 aten/src/ATen/native/zoom/Loss.cu create mode 100644 aten/src/ATen/native/zoom/MultiTensorApply.cuh create mode 100644 aten/src/ATen/native/zoom/NLLLoss2d.cu create mode 100644 aten/src/ATen/native/zoom/Pow.cuh create mode 100644 aten/src/ATen/native/zoom/RecordStream.cu create mode 100644 aten/src/ATen/native/zoom/RreluWithNoise.cu diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index a0141f974923e6..be525a961d9d6c 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -787,6 +787,18 @@ Tensor log_sigmoid_backward_cuda(const Tensor& grad_output, const Tensor& input, return iter.output(); } +Tensor log_sigmoid_backward_zoom(const Tensor& grad_output, const Tensor& input, const Tensor& buffer) { + auto grad_input = at::empty_like(grad_output); + // NOTE: buffer is only used by CPU dispatch, we just ignore it here + auto iter = at::TensorIteratorConfig() + .add_output(grad_input) + .add_const_input(input) + .add_const_input(grad_output) + .build(); + log_sigmoid_backward_stub(kPrivateUse1, iter); + return iter.output(); +} + Tensor log_sigmoid_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& buffer) { auto grad_input = at::empty_like(grad_output); auto iter = at::TensorIteratorConfig() @@ -810,6 +822,17 @@ Tensor& log_sigmoid_backward_cuda_out(const Tensor& grad_output, const Tensor& i return grad_input; } +Tensor& log_sigmoid_backward_zoom_out(const Tensor& grad_output, const Tensor& input, + const Tensor& buffer, Tensor& grad_input) { +auto iter = TensorIteratorConfig() +.add_output(grad_input) +.add_const_input(input) +.add_const_input(grad_output) +.build(); +log_sigmoid_backward_stub(kPrivateUse1, iter); +return grad_input; +} + Tensor& log_sigmoid_backward_cpu_out(const Tensor& grad_output, const Tensor& input, const Tensor& buffer, diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 5af124fc7703fc..fd33884a40b15a 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1135,6 +1135,7 @@ dispatch: CPU: binary_cross_entropy_cpu CUDA: binary_cross_entropy_cuda + PrivateUse1: binary_cross_entropy_zoom MPS: binary_cross_entropy_mps - func: binary_cross_entropy.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) @@ -1144,6 +1145,7 @@ dispatch: CPU: binary_cross_entropy_out_cpu CUDA: binary_cross_entropy_out_cuda + PrivateUse1: binary_cross_entropy_out_zoom MPS: binary_cross_entropy_out_mps - func: binary_cross_entropy_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? 
weight=None, int reduction=Mean) -> Tensor @@ -1152,6 +1154,7 @@ dispatch: CPU: binary_cross_entropy_backward_cpu CUDA: binary_cross_entropy_backward_cuda + PrivateUse1: binary_cross_entropy_backward_zoom MPS: binary_cross_entropy_backward_mps - func: binary_cross_entropy_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!) @@ -1160,6 +1163,7 @@ dispatch: CPU: binary_cross_entropy_backward_out_cpu CUDA: binary_cross_entropy_backward_out_cuda + PrivateUse1: binary_cross_entropy_backward_out_zoom MPS: binary_cross_entropy_backward_out_mps - func: binary_cross_entropy_with_logits(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean) -> Tensor @@ -4995,7 +4999,7 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: - CPU, CUDA: relu + CPU, CUDA, PrivateUse1: relu MPS: relu_mps MkldnnCPU: mkldnn_relu QuantizedCPU: relu_quantized_cpu @@ -5009,7 +5013,7 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: - CPU, CUDA: relu_ + CPU, CUDA, PrivateUse1: relu_ MPS: relu_mps_ MkldnnCPU: mkldnn_relu_ QuantizedCPU: relu_quantized_cpu_ @@ -5032,14 +5036,14 @@ - func: _prelu_kernel(Tensor self, Tensor weight) -> Tensor dispatch: - CPU, CUDA: _prelu_kernel + CPU, CUDA, PrivateUse1: _prelu_kernel QuantizedCPU: _prelu_kernel_quantized_cpu MkldnnCPU: mkldnn_prelu MPS: prelu_mps - func: _prelu_kernel_backward(Tensor grad_output, Tensor self, Tensor weight) -> (Tensor, Tensor) dispatch: - CPU, CUDA: _prelu_kernel_backward + CPU, CUDA, PrivateUse1: _prelu_kernel_backward MkldnnCPU: mkldnn_prelu_backward MPS: prelu_backward_mps @@ -5051,6 +5055,7 @@ dispatch: CPU: gelu_out_cpu CUDA: gelu_out_cuda + PrivateUse1: gelu_out_zoom MPS: gelu_out_mps - func: gelu_(Tensor(a!) self, *, str approximate='none') -> Tensor(a!) 
@@ -5079,6 +5084,7 @@ dispatch: CPU: gelu_backward_out_cpu CUDA: gelu_backward_out_cuda + PrivateUse1: gelu_backward_out_zoom MPS: gelu_backward_out_mps - func: gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor @@ -5100,7 +5106,7 @@ structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: hardshrink_out + CPU, CUDA, PrivateUse1: hardshrink_out - func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor structured_delegate: hardshrink.out @@ -5111,7 +5117,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: hardshrink_backward_out + CPU, CUDA, PrivateUse1: hardshrink_backward_out - func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor structured_delegate: hardshrink_backward.grad_input @@ -5204,7 +5210,7 @@ structured_inherits: TensorIteratorBase python_module: nn dispatch: - CPU, CUDA: silu_out + CPU, CUDA, PrivateUse1: silu_out MPS: silu_out_mps tags: pointwise @@ -5213,7 +5219,7 @@ structured_inherits: TensorIteratorBase python_module: nn dispatch: - CPU, CUDA: silu_backward_out + CPU, CUDA, PrivateUse1: silu_backward_out MPS: silu_backward_out_mps tags: pointwise @@ -5238,13 +5244,13 @@ structured_inherits: TensorIteratorBase python_module: nn dispatch: - CPU, CUDA: mish_out + CPU, CUDA, PrivateUse1: mish_out MPS: mish_out_mps - func: mish_backward(Tensor grad_output, Tensor self) -> Tensor python_module: nn dispatch: - CPU, CUDA: mish_backward + CPU, CUDA, PrivateUse1: mish_backward MPS: mish_backward_mps CompositeImplicitAutograd: math_mish_backward @@ -6040,14 +6046,14 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: threshold_out + CPU, CUDA, PrivateUse1: threshold_out MPS: threshold_out_mps - func: threshold_backward.grad_input(Tensor grad_output, Tensor self, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!) 
structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: threshold_backward_out + CPU, CUDA, PrivateUse1: threshold_backward_out MPS: threshold_backward_out_mps SparseCPU, SparseCUDA: threshold_backward_sparse_out SparseCsrCPU, SparseCsrCUDA: threshold_backward_sparse_compressed_out @@ -10367,6 +10373,7 @@ dispatch: CPU: foreach_tensor_add_scalar_kernel_slow CUDA: foreach_tensor_add_scalar_kernel_cuda + PrivateUse1: foreach_tensor_add_scalar_kernel_zoom - func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10374,6 +10381,7 @@ dispatch: CPU: foreach_tensor_add_scalar_kernel_slow_ CUDA: foreach_tensor_add_scalar_kernel_cuda_ + PrivateUse1: foreach_tensor_add_scalar_kernel_zoom_ autogen: _foreach_add.Scalar_out - func: _foreach_add.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[] @@ -10382,6 +10390,7 @@ dispatch: CPU: foreach_tensor_add_list_kernel_slow CUDA: foreach_tensor_add_list_kernel_cuda + PrivateUse1: foreach_tensor_add_list_kernel_zoom - func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10389,6 +10398,7 @@ dispatch: CPU: foreach_tensor_add_list_kernel_slow_ CUDA: foreach_tensor_add_list_kernel_cuda_ + PrivateUse1: foreach_tensor_add_list_kernel_zoom_ autogen: _foreach_add.List_out - func: _foreach_add.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] @@ -10397,6 +10407,7 @@ dispatch: CPU: foreach_tensor_add_scalarlist_kernel_slow CUDA: foreach_tensor_add_scalarlist_kernel_cuda + PrivateUse1: foreach_tensor_add_scalarlist_kernel_zoom - func: _foreach_add_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10404,6 +10415,7 @@ dispatch: CPU: foreach_tensor_add_scalarlist_kernel_slow_ CUDA: foreach_tensor_add_scalarlist_kernel_cuda_ + PrivateUse1: foreach_tensor_add_scalarlist_kernel_zoom_ autogen: _foreach_add.ScalarList_out - func: _foreach_add.Tensor(Tensor[] self, Tensor other, *, Scalar alpha=1) -> Tensor[] @@ -10412,6 +10424,7 @@ dispatch: CPU: foreach_tensor_add_tensor_kernel_slow CUDA: foreach_tensor_add_tensor_kernel_cuda + PrivateUse1: foreach_tensor_add_tensor_kernel_zoom - func: _foreach_add_.Tensor(Tensor(a!)[] self, Tensor other, *, Scalar alpha=1) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10419,6 +10432,7 @@ dispatch: CPU: foreach_tensor_add_tensor_kernel_slow_ CUDA: foreach_tensor_add_tensor_kernel_cuda_ + PrivateUse1: foreach_tensor_add_tensor_kernel_zoom_ autogen: _foreach_add.Tensor_out - func: _foreach_sub.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] @@ -10427,6 +10441,7 @@ dispatch: CPU: foreach_tensor_sub_scalar_kernel_slow CUDA: foreach_tensor_sub_scalar_kernel_cuda + PrivateUse1: foreach_tensor_sub_scalar_kernel_zoom - func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10434,6 +10449,7 @@ dispatch: CPU: foreach_tensor_sub_scalar_kernel_slow_ CUDA: foreach_tensor_sub_scalar_kernel_cuda_ + PrivateUse1: foreach_tensor_sub_scalar_kernel_zoom_ autogen: _foreach_sub.Scalar_out - func: _foreach_sub.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[] @@ -10442,6 
+10458,7 @@ dispatch: CPU: foreach_tensor_sub_list_kernel_slow CUDA: foreach_tensor_sub_list_kernel_cuda + PrivateUse1: foreach_tensor_sub_list_kernel_zoom - func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10449,6 +10466,7 @@ dispatch: CPU: foreach_tensor_sub_list_kernel_slow_ CUDA: foreach_tensor_sub_list_kernel_cuda_ + PrivateUse1: foreach_tensor_sub_list_kernel_zoom_ autogen: _foreach_sub.List_out - func: _foreach_sub.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] @@ -10457,6 +10475,7 @@ dispatch: CPU: foreach_tensor_sub_scalarlist_kernel_slow CUDA: foreach_tensor_sub_scalarlist_kernel_cuda + PrivateUse1: foreach_tensor_sub_scalarlist_kernel_zoom - func: _foreach_sub_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10464,6 +10483,7 @@ dispatch: CPU: foreach_tensor_sub_scalarlist_kernel_slow_ CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_ + PrivateUse1: foreach_tensor_sub_scalarlist_kernel_zoom_ autogen: _foreach_sub.ScalarList_out - func: _foreach_mul.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] @@ -10472,6 +10492,7 @@ dispatch: CPU: foreach_tensor_mul_scalar_kernel_slow CUDA: foreach_tensor_mul_scalar_kernel_cuda + PrivateUse1: foreach_tensor_mul_scalar_kernel_zoom - func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10479,6 +10500,7 @@ dispatch: CPU: foreach_tensor_mul_scalar_kernel_slow_ CUDA: foreach_tensor_mul_scalar_kernel_cuda_ + PrivateUse1: foreach_tensor_mul_scalar_kernel_zoom_ autogen: _foreach_mul.Scalar_out - func: _foreach_mul.List(Tensor[] self, Tensor[] other) -> Tensor[] @@ -10487,6 +10509,7 @@ dispatch: CPU: foreach_tensor_mul_list_kernel_slow CUDA: foreach_tensor_mul_list_kernel_cuda + PrivateUse1: foreach_tensor_mul_list_kernel_zoom - func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10494,6 +10517,7 @@ dispatch: CPU: foreach_tensor_mul_list_kernel_slow_ CUDA: foreach_tensor_mul_list_kernel_cuda_ + PrivateUse1: foreach_tensor_mul_list_kernel_zoom_ autogen: _foreach_mul.List_out - func: _foreach_mul.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] @@ -10502,6 +10526,7 @@ dispatch: CPU: foreach_tensor_mul_scalarlist_kernel_slow CUDA: foreach_tensor_mul_scalarlist_kernel_cuda + PrivateUse1: foreach_tensor_mul_scalarlist_kernel_zoom - func: _foreach_mul_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10509,6 +10534,7 @@ dispatch: CPU: foreach_tensor_mul_scalarlist_kernel_slow_ CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_ + PrivateUse1: foreach_tensor_mul_scalarlist_kernel_zoom_ autogen: _foreach_mul.ScalarList_out - func: _foreach_mul.Tensor(Tensor[] self, Tensor other) -> Tensor[] @@ -10517,6 +10543,7 @@ dispatch: CPU: foreach_tensor_mul_tensor_kernel_slow CUDA: foreach_tensor_mul_tensor_kernel_cuda + PrivateUse1: foreach_tensor_mul_tensor_kernel_zoom - func: _foreach_mul_.Tensor(Tensor(a!)[] self, Tensor other) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10524,6 +10551,7 @@ dispatch: CPU: 
foreach_tensor_mul_tensor_kernel_slow_ CUDA: foreach_tensor_mul_tensor_kernel_cuda_ + PrivateUse1: foreach_tensor_mul_tensor_kernel_zoom_ autogen: _foreach_mul.Tensor_out - func: _foreach_div.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] @@ -10532,6 +10560,7 @@ dispatch: CPU: foreach_tensor_div_scalar_kernel_slow CUDA: foreach_tensor_div_scalar_kernel_cuda + PrivateUse1: foreach_tensor_div_scalar_kernel_zoom - func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10539,6 +10568,7 @@ dispatch: CPU: foreach_tensor_div_scalar_kernel_slow_ CUDA: foreach_tensor_div_scalar_kernel_cuda_ + PrivateUse1: foreach_tensor_div_scalar_kernel_zoom_ autogen: _foreach_div.Scalar_out - func: _foreach_div.List(Tensor[] self, Tensor[] other) -> Tensor[] @@ -10547,6 +10577,7 @@ dispatch: CPU: foreach_tensor_div_list_kernel_slow CUDA: foreach_tensor_div_list_kernel_cuda + PrivateUse1: foreach_tensor_div_list_kernel_zoom - func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10554,6 +10585,7 @@ dispatch: CPU: foreach_tensor_div_list_kernel_slow_ CUDA: foreach_tensor_div_list_kernel_cuda_ + PrivateUse1: foreach_tensor_div_list_kernel_zoom_ autogen: _foreach_div.List_out - func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] @@ -10562,6 +10594,7 @@ dispatch: CPU: foreach_tensor_div_scalarlist_kernel_slow CUDA: foreach_tensor_div_scalarlist_kernel_cuda + PrivateUse1: foreach_tensor_div_scalarlist_kernel_zoom - func: _foreach_div_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10569,6 +10602,7 @@ dispatch: CPU: foreach_tensor_div_scalarlist_kernel_slow_ CUDA: foreach_tensor_div_scalarlist_kernel_cuda_ + PrivateUse1: foreach_tensor_div_scalarlist_kernel_zoom_ autogen: _foreach_div.ScalarList_out - func: _foreach_div.Tensor(Tensor[] self, Tensor other) -> Tensor[] @@ -10577,6 +10611,7 @@ dispatch: CPU: foreach_tensor_div_tensor_kernel_slow CUDA: foreach_tensor_div_tensor_kernel_cuda + PrivateUse1: foreach_tensor_div_tensor_kernel_zoom - func: _foreach_div_.Tensor(Tensor(a!)[] self, Tensor other) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10584,6 +10619,7 @@ dispatch: CPU: foreach_tensor_div_tensor_kernel_slow_ CUDA: foreach_tensor_div_tensor_kernel_cuda_ + PrivateUse1: foreach_tensor_div_tensor_kernel_zoom_ autogen: _foreach_div.Tensor_out - func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] @@ -10592,6 +10628,7 @@ dispatch: CPU: foreach_tensor_clamp_max_scalar_kernel_slow CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda + PrivateUse1: foreach_tensor_clamp_max_scalar_kernel_zoom - func: _foreach_clamp_max_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10599,6 +10636,7 @@ dispatch: CPU: foreach_tensor_clamp_max_scalar_kernel_slow_ CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_ + PrivateUse1: foreach_tensor_clamp_max_scalar_kernel_zoom_ autogen: _foreach_clamp_max.Scalar_out - func: _foreach_clamp_max.List(Tensor[] self, Tensor[] other) -> Tensor[] @@ -10607,6 +10645,7 @@ dispatch: CPU: foreach_tensor_clamp_max_list_kernel_slow CUDA: 
foreach_tensor_clamp_max_list_kernel_cuda + PrivateUse1: foreach_tensor_clamp_max_list_kernel_zoom - func: _foreach_clamp_max_.List(Tensor(a!)[] self, Tensor[] other) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10614,6 +10653,7 @@ dispatch: CPU: foreach_tensor_clamp_max_list_kernel_slow_ CUDA: foreach_tensor_clamp_max_list_kernel_cuda_ + PrivateUse1: foreach_tensor_clamp_max_list_kernel_zoom_ autogen: _foreach_clamp_max.List_out - func: _foreach_clamp_max.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] @@ -10622,6 +10662,7 @@ dispatch: CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda + PrivateUse1: foreach_tensor_clamp_max_scalarlist_kernel_zoom - func: _foreach_clamp_max_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10629,6 +10670,7 @@ dispatch: CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow_ CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_ + PrivateUse1: foreach_tensor_clamp_max_scalarlist_kernel_zoom_ autogen: _foreach_clamp_max.ScalarList_out - func: _foreach_clamp_min.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] @@ -10637,6 +10679,7 @@ dispatch: CPU: foreach_tensor_clamp_min_scalar_kernel_slow CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda + PrivateUse1: foreach_tensor_clamp_min_scalar_kernel_zoom - func: _foreach_clamp_min_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10644,6 +10687,7 @@ dispatch: CPU: foreach_tensor_clamp_min_scalar_kernel_slow_ CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_ + PrivateUse1: foreach_tensor_clamp_min_scalar_kernel_zoom_ autogen: _foreach_clamp_min.Scalar_out - func: _foreach_clamp_min.List(Tensor[] self, Tensor[] other) -> Tensor[] @@ -10652,6 +10696,7 @@ dispatch: CPU: foreach_tensor_clamp_min_list_kernel_slow CUDA: foreach_tensor_clamp_min_list_kernel_cuda + PrivateUse1: foreach_tensor_clamp_min_list_kernel_zoom - func: _foreach_clamp_min_.List(Tensor(a!)[] self, Tensor[] other) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10659,6 +10704,7 @@ dispatch: CPU: foreach_tensor_clamp_min_list_kernel_slow_ CUDA: foreach_tensor_clamp_min_list_kernel_cuda_ + PrivateUse1: foreach_tensor_clamp_min_list_kernel_zoom_ autogen: _foreach_clamp_min.List_out - func: _foreach_clamp_min.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] @@ -10667,6 +10713,7 @@ dispatch: CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda + PrivateUse1: foreach_tensor_clamp_min_scalarlist_kernel_zoom - func: _foreach_clamp_min_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10674,6 +10721,7 @@ dispatch: CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_ CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_ + PrivateUse1: foreach_tensor_clamp_min_scalarlist_kernel_zoom_ autogen: _foreach_clamp_min.ScalarList_out # foreach_minimum/maximum dispatches to clamp_max/min @@ -10683,6 +10731,7 @@ dispatch: CPU: foreach_tensor_clamp_min_scalar_kernel_slow CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda + PrivateUse1: foreach_tensor_clamp_min_scalar_kernel_zoom - func: 
_foreach_maximum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10690,6 +10739,7 @@ dispatch: CPU: foreach_tensor_clamp_min_scalar_kernel_slow_ CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_ + PrivateUse1: foreach_tensor_clamp_min_scalar_kernel_zoom_ autogen: _foreach_maximum.Scalar_out # foreach_minimum/maximum dispatches to clamp_max/min @@ -10699,6 +10749,7 @@ dispatch: CPU: foreach_tensor_clamp_min_list_kernel_slow CUDA: foreach_tensor_clamp_min_list_kernel_cuda + PrivateUse1: foreach_tensor_clamp_min_list_kernel_zoom - func: _foreach_maximum_.List(Tensor(a!)[] self, Tensor[] other) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10706,6 +10757,7 @@ dispatch: CPU: foreach_tensor_clamp_min_list_kernel_slow_ CUDA: foreach_tensor_clamp_min_list_kernel_cuda_ + PrivateUse1: foreach_tensor_clamp_min_list_kernel_zoom_ autogen: _foreach_maximum.List_out # foreach_minimum/maximum dispatches to clamp_max/min @@ -10715,6 +10767,7 @@ dispatch: CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda + PrivateUse1: foreach_tensor_clamp_min_scalarlist_kernel_zoom - func: _foreach_maximum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10722,6 +10775,7 @@ dispatch: CPU: foreach_tensor_clamp_min_scalarlist_kernel_slow_ CUDA: foreach_tensor_clamp_min_scalarlist_kernel_cuda_ + PrivateUse1: foreach_tensor_clamp_min_scalarlist_kernel_zoom_ autogen: _foreach_maximum.ScalarList_out - func: _foreach_minimum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[] @@ -10730,6 +10784,7 @@ dispatch: CPU: foreach_tensor_clamp_max_scalar_kernel_slow CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda + PrivateUse1: foreach_tensor_clamp_max_scalar_kernel_zoom - func: _foreach_minimum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10737,6 +10792,7 @@ dispatch: CPU: foreach_tensor_clamp_max_scalar_kernel_slow_ CUDA: foreach_tensor_clamp_max_scalar_kernel_cuda_ + PrivateUse1: foreach_tensor_clamp_max_scalar_kernel_zoom_ autogen: _foreach_minimum.Scalar_out - func: _foreach_minimum.List(Tensor[] self, Tensor[] other) -> Tensor[] @@ -10745,6 +10801,7 @@ dispatch: CPU: foreach_tensor_clamp_max_list_kernel_slow CUDA: foreach_tensor_clamp_max_list_kernel_cuda + PrivateUse1: foreach_tensor_clamp_max_list_kernel_zoom - func: _foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10752,6 +10809,7 @@ dispatch: CPU: foreach_tensor_clamp_max_list_kernel_slow_ CUDA: foreach_tensor_clamp_max_list_kernel_cuda_ + PrivateUse1: foreach_tensor_clamp_max_list_kernel_zoom_ autogen: _foreach_minimum.List_out - func: _foreach_minimum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[] @@ -10760,6 +10818,7 @@ dispatch: CPU: foreach_tensor_clamp_max_scalarlist_kernel_slow CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda + PrivateUse1: foreach_tensor_clamp_max_scalarlist_kernel_zoom - func: _foreach_minimum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10767,6 +10826,7 @@ dispatch: CPU: 
foreach_tensor_clamp_max_scalarlist_kernel_slow_ CUDA: foreach_tensor_clamp_max_scalarlist_kernel_cuda_ + PrivateUse1: foreach_tensor_clamp_max_scalarlist_kernel_zoom_ autogen: _foreach_minimum.ScalarList_out - func: _foreach_addcdiv.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] @@ -10775,6 +10835,7 @@ dispatch: CPU: foreach_tensor_addcdiv_scalar_slow CUDA: foreach_tensor_addcdiv_scalar_cuda + PrivateUse1: foreach_tensor_addcdiv_scalar_zoom - func: _foreach_addcdiv.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10782,6 +10843,7 @@ dispatch: CPU: foreach_tensor_addcdiv_scalarlist_slow CUDA: foreach_tensor_addcdiv_scalarlist_cuda + PrivateUse1: foreach_tensor_addcdiv_scalarlist_zoom - func: _foreach_addcdiv.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10789,6 +10851,7 @@ dispatch: CPU: foreach_tensor_addcdiv_tensor_slow CUDA: foreach_tensor_addcdiv_tensor_cuda + PrivateUse1: foreach_tensor_addcdiv_tensor_zoom - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10796,6 +10859,7 @@ dispatch: CPU: foreach_tensor_addcdiv_scalar_slow_ CUDA: foreach_tensor_addcdiv_scalar_cuda_ + PrivateUse1: foreach_tensor_addcdiv_scalar_zoom_ autogen: _foreach_addcdiv.Scalar_out - func: _foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> () @@ -10804,6 +10868,7 @@ dispatch: CPU: foreach_tensor_addcdiv_scalarlist_slow_ CUDA: foreach_tensor_addcdiv_scalarlist_cuda_ + PrivateUse1: foreach_tensor_addcdiv_scalarlist_zoom_ autogen: _foreach_addcdiv.ScalarList_out - func: _foreach_addcdiv_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> () @@ -10812,6 +10877,7 @@ dispatch: CPU: foreach_tensor_addcdiv_tensor_slow_ CUDA: foreach_tensor_addcdiv_tensor_cuda_ + PrivateUse1: foreach_tensor_addcdiv_tensor_zoom_ autogen: _foreach_addcdiv.Tensor_out - func: _foreach_addcmul.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] @@ -10820,6 +10886,7 @@ dispatch: CPU: foreach_tensor_addcmul_scalar_slow CUDA: foreach_tensor_addcmul_scalar_cuda + PrivateUse1: foreach_tensor_addcmul_scalar_zoom - func: _foreach_addcmul.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10827,6 +10894,7 @@ dispatch: CPU: foreach_tensor_addcmul_scalarlist_slow CUDA: foreach_tensor_addcmul_scalarlist_cuda + PrivateUse1: foreach_tensor_addcmul_scalarlist_zoom - func: _foreach_addcmul.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10834,6 +10902,7 @@ dispatch: CPU: foreach_tensor_addcmul_tensor_slow CUDA: foreach_tensor_addcmul_tensor_cuda + PrivateUse1: foreach_tensor_addcmul_tensor_zoom - func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor 
are on different devices @@ -10841,6 +10910,7 @@ dispatch: CPU: foreach_tensor_addcmul_scalar_slow_ CUDA: foreach_tensor_addcmul_scalar_cuda_ + PrivateUse1: foreach_tensor_addcmul_scalar_zoom_ autogen: _foreach_addcmul.Scalar_out - func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> () @@ -10849,6 +10919,7 @@ dispatch: CPU: foreach_tensor_addcmul_scalarlist_slow_ CUDA: foreach_tensor_addcmul_scalarlist_cuda_ + PrivateUse1: foreach_tensor_addcmul_scalarlist_zoom_ autogen: _foreach_addcmul.ScalarList_out - func: _foreach_addcmul_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> () @@ -10857,6 +10928,7 @@ dispatch: CPU: foreach_tensor_addcmul_tensor_slow_ CUDA: foreach_tensor_addcmul_tensor_cuda_ + PrivateUse1: foreach_tensor_addcmul_tensor_zoom_ autogen: _foreach_addcmul.Tensor_out - func: _foreach_abs(Tensor[] self) -> Tensor[] @@ -10865,6 +10937,7 @@ dispatch: CPU: foreach_tensor_abs_slow CUDA: foreach_tensor_abs_cuda + PrivateUse1: foreach_tensor_abs_zoom - func: _foreach_abs_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10872,6 +10945,7 @@ dispatch: CPU: foreach_tensor_abs_slow_ CUDA: foreach_tensor_abs_cuda_ + PrivateUse1: foreach_tensor_abs_zoom_ autogen: _foreach_abs.out - func: _foreach_acos(Tensor[] self) -> Tensor[] @@ -10880,6 +10954,7 @@ dispatch: CPU: foreach_tensor_acos_slow CUDA: foreach_tensor_acos_cuda + PrivateUse1: foreach_tensor_acos_zoom - func: _foreach_acos_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10887,6 +10962,7 @@ dispatch: CPU: foreach_tensor_acos_slow_ CUDA: foreach_tensor_acos_cuda_ + PrivateUse1: foreach_tensor_acos_zoom_ autogen: _foreach_acos.out - func: _foreach_asin(Tensor[] self) -> Tensor[] @@ -10895,6 +10971,7 @@ dispatch: CPU: foreach_tensor_asin_slow CUDA: foreach_tensor_asin_cuda + PrivateUse1: foreach_tensor_asin_zoom - func: _foreach_asin_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10902,6 +10979,7 @@ dispatch: CPU: foreach_tensor_asin_slow_ CUDA: foreach_tensor_asin_cuda_ + PrivateUse1: foreach_tensor_asin_zoom_ autogen: _foreach_asin.out - func: _foreach_atan(Tensor[] self) -> Tensor[] @@ -10910,6 +10988,7 @@ dispatch: CPU: foreach_tensor_atan_slow CUDA: foreach_tensor_atan_cuda + PrivateUse1: foreach_tensor_atan_zoom - func: _foreach_atan_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10917,6 +10996,7 @@ dispatch: CPU: foreach_tensor_atan_slow_ CUDA: foreach_tensor_atan_cuda_ + PrivateUse1: foreach_tensor_atan_zoom_ autogen: _foreach_atan.out - func: _foreach_ceil(Tensor[] self) -> Tensor[] @@ -10925,6 +11005,7 @@ dispatch: CPU: foreach_tensor_ceil_slow CUDA: foreach_tensor_ceil_cuda + PrivateUse1: foreach_tensor_ceil_zoom - func: _foreach_ceil_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10932,6 +11013,7 @@ dispatch: CPU: foreach_tensor_ceil_slow_ CUDA: foreach_tensor_ceil_cuda_ + PrivateUse1: foreach_tensor_ceil_zoom_ autogen: _foreach_ceil.out - func: _foreach_cos(Tensor[] self) -> Tensor[] @@ -10940,6 +11022,7 @@ dispatch: CPU: foreach_tensor_cos_slow CUDA: foreach_tensor_cos_cuda + PrivateUse1: foreach_tensor_cos_zoom - func: 
_foreach_cos_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10947,6 +11030,7 @@ dispatch: CPU: foreach_tensor_cos_slow_ CUDA: foreach_tensor_cos_cuda_ + PrivateUse1: foreach_tensor_cos_zoom_ autogen: _foreach_cos.out - func: _foreach_cosh(Tensor[] self) -> Tensor[] @@ -10955,6 +11039,7 @@ dispatch: CPU: foreach_tensor_cosh_slow CUDA: foreach_tensor_cosh_cuda + PrivateUse1: foreach_tensor_cosh_zoom - func: _foreach_cosh_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10962,6 +11047,7 @@ dispatch: CPU: foreach_tensor_cosh_slow_ CUDA: foreach_tensor_cosh_cuda_ + PrivateUse1: foreach_tensor_cosh_zoom_ autogen: _foreach_cosh.out - func: _foreach_erf(Tensor[] self) -> Tensor[] @@ -10970,6 +11056,7 @@ dispatch: CPU: foreach_tensor_erf_slow CUDA: foreach_tensor_erf_cuda + PrivateUse1: foreach_tensor_erf_zoom - func: _foreach_erf_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10977,6 +11064,7 @@ dispatch: CPU: foreach_tensor_erf_slow_ CUDA: foreach_tensor_erf_cuda_ + PrivateUse1: foreach_tensor_erf_zoom_ autogen: _foreach_erf.out - func: _foreach_erfc(Tensor[] self) -> Tensor[] @@ -10985,6 +11073,7 @@ dispatch: CPU: foreach_tensor_erfc_slow CUDA: foreach_tensor_erfc_cuda + PrivateUse1: foreach_tensor_erfc_zoom - func: _foreach_erfc_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -10992,6 +11081,7 @@ dispatch: CPU: foreach_tensor_erfc_slow_ CUDA: foreach_tensor_erfc_cuda_ + PrivateUse1: foreach_tensor_erfc_zoom_ autogen: _foreach_erfc.out - func: _foreach_exp(Tensor[] self) -> Tensor[] @@ -11000,6 +11090,7 @@ dispatch: CPU: foreach_tensor_exp_slow CUDA: foreach_tensor_exp_cuda + PrivateUse1: foreach_tensor_exp_zoom - func: _foreach_exp_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -11007,6 +11098,7 @@ dispatch: CPU: foreach_tensor_exp_slow_ CUDA: foreach_tensor_exp_cuda_ + PrivateUse1: foreach_tensor_exp_zoom_ autogen: _foreach_exp.out - func: _foreach_expm1(Tensor[] self) -> Tensor[] @@ -11015,6 +11107,7 @@ dispatch: CPU: foreach_tensor_expm1_slow CUDA: foreach_tensor_expm1_cuda + PrivateUse1: foreach_tensor_expm1_zoom - func: _foreach_expm1_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -11022,6 +11115,7 @@ dispatch: CPU: foreach_tensor_expm1_slow_ CUDA: foreach_tensor_expm1_cuda_ + PrivateUse1: foreach_tensor_expm1_zoom_ autogen: _foreach_expm1.out - func: _foreach_floor(Tensor[] self) -> Tensor[] @@ -11030,6 +11124,7 @@ dispatch: CPU: foreach_tensor_floor_slow CUDA: foreach_tensor_floor_cuda + PrivateUse1: foreach_tensor_floor_zoom - func: _foreach_floor_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -11037,6 +11132,7 @@ dispatch: CPU: foreach_tensor_floor_slow_ CUDA: foreach_tensor_floor_cuda_ + PrivateUse1: foreach_tensor_floor_zoom_ autogen: _foreach_floor.out - func: _foreach_frac(Tensor[] self) -> Tensor[] @@ -11045,6 +11141,7 @@ dispatch: CPU: foreach_tensor_frac_slow CUDA: foreach_tensor_frac_cuda + PrivateUse1: foreach_tensor_frac_zoom - func: _foreach_frac_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels 
fall back to slow path when tensor are on different devices @@ -11052,6 +11149,7 @@ dispatch: CPU: foreach_tensor_frac_slow_ CUDA: foreach_tensor_frac_cuda_ + PrivateUse1: foreach_tensor_frac_zoom_ autogen: _foreach_frac.out - func: _foreach_lerp.List(Tensor[] self, Tensor[] tensors1, Tensor[] weights) -> Tensor[] @@ -11060,6 +11158,7 @@ dispatch: CPU: foreach_tensor_ternary_lerp_slow CUDA: foreach_tensor_lerp_ternary_cuda + PrivateUse1: foreach_tensor_lerp_ternary_zoom autogen: _foreach_lerp.List_out - func: _foreach_lerp_.List(Tensor(a!)[] self, Tensor[] tensors1, Tensor[] weights) -> () @@ -11068,6 +11167,7 @@ dispatch: CPU: foreach_tensor_ternary_lerp_slow_ CUDA: foreach_tensor_lerp_ternary_cuda_ + PrivateUse1: foreach_tensor_lerp_ternary_zoom_ autogen: _foreach_lerp.List_out - func: _foreach_lerp.Scalar(Tensor[] self, Tensor[] tensors1, Scalar weight) -> Tensor[] @@ -11076,6 +11176,7 @@ dispatch: CPU: foreach_tensor_lerp_list_kernel_slow CUDA: foreach_tensor_lerp_list_cuda + PrivateUse1: foreach_tensor_lerp_list_zoom autogen: _foreach_lerp.Scalar_out - func: _foreach_lerp_.Scalar(Tensor(a!)[] self, Tensor[] tensors1, Scalar weight) -> () @@ -11084,6 +11185,7 @@ dispatch: CPU: foreach_tensor_lerp_list_kernel_slow_ CUDA: foreach_tensor_lerp_list_cuda_ + PrivateUse1: foreach_tensor_lerp_list_zoom_ autogen: _foreach_lerp.Scalar_out - func: _foreach_lgamma(Tensor[] self) -> Tensor[] @@ -11092,6 +11194,7 @@ dispatch: CPU: foreach_tensor_lgamma_slow CUDA: foreach_tensor_lgamma_cuda + PrivateUse1: foreach_tensor_lgamma_zoom - func: _foreach_lgamma_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -11099,6 +11202,7 @@ dispatch: CPU: foreach_tensor_lgamma_slow_ CUDA: foreach_tensor_lgamma_cuda_ + PrivateUse1: foreach_tensor_lgamma_zoom_ autogen: _foreach_lgamma.out - func: _foreach_log(Tensor[] self) -> Tensor[] @@ -11107,6 +11211,7 @@ dispatch: CPU: foreach_tensor_log_slow CUDA: foreach_tensor_log_cuda + PrivateUse1: foreach_tensor_log_zoom - func: _foreach_log_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -11114,6 +11219,7 @@ dispatch: CPU: foreach_tensor_log_slow_ CUDA: foreach_tensor_log_cuda_ + PrivateUse1: foreach_tensor_log_zoom_ autogen: _foreach_log.out - func: _foreach_log10(Tensor[] self) -> Tensor[] @@ -11122,6 +11228,7 @@ dispatch: CPU: foreach_tensor_log10_slow CUDA: foreach_tensor_log10_cuda + PrivateUse1: foreach_tensor_log10_zoom - func: _foreach_log10_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -11129,6 +11236,7 @@ dispatch: CPU: foreach_tensor_log10_slow_ CUDA: foreach_tensor_log10_cuda_ + PrivateUse1: foreach_tensor_log10_zoom_ autogen: _foreach_log10.out - func: _foreach_log1p(Tensor[] self) -> Tensor[] @@ -11137,6 +11245,7 @@ dispatch: CPU: foreach_tensor_log1p_slow CUDA: foreach_tensor_log1p_cuda + PrivateUse1: foreach_tensor_log1p_zoom - func: _foreach_log1p_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -11144,6 +11253,7 @@ dispatch: CPU: foreach_tensor_log1p_slow_ CUDA: foreach_tensor_log1p_cuda_ + PrivateUse1: foreach_tensor_log1p_zoom_ autogen: _foreach_log1p.out - func: _foreach_log2(Tensor[] self) -> Tensor[] @@ -11152,6 +11262,7 @@ dispatch: CPU: foreach_tensor_log2_slow CUDA: foreach_tensor_log2_cuda + PrivateUse1: foreach_tensor_log2_zoom 
- func: _foreach_log2_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -11159,6 +11270,7 @@ dispatch: CPU: foreach_tensor_log2_slow_ CUDA: foreach_tensor_log2_cuda_ + PrivateUse1: foreach_tensor_log2_zoom_ autogen: _foreach_log2.out - func: _foreach_neg(Tensor[] self) -> Tensor[] @@ -11167,6 +11279,7 @@ dispatch: CPU: foreach_tensor_neg_slow CUDA: foreach_tensor_neg_cuda + PrivateUse1: foreach_tensor_neg_zoom - func: _foreach_neg_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -11174,6 +11287,7 @@ dispatch: CPU: foreach_tensor_neg_slow_ CUDA: foreach_tensor_neg_cuda_ + PrivateUse1: foreach_tensor_neg_zoom_ autogen: _foreach_neg.out - func: _foreach_norm.Scalar(Tensor[] self, Scalar ord=2) -> Tensor[] @@ -11182,6 +11296,7 @@ dispatch: CPU: foreach_tensor_norm_slow CUDA: foreach_tensor_norm_cuda + PrivateUse1: foreach_tensor_norm_zoom autogen: _foreach_norm.Scalar_out - func: _foreach_pow.List(Tensor[] self, Tensor[] exponent) -> Tensor[] @@ -11190,6 +11305,7 @@ dispatch: CPU: foreach_tensor_pow_list_kernel_slow CUDA: foreach_tensor_pow_list_kernel_cuda + PrivateUse1: foreach_tensor_pow_list_kernel_zoom - func: _foreach_pow.Scalar(Tensor[] self, Scalar exponent) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -11197,6 +11313,7 @@ dispatch: CPU: foreach_tensor_pow_scalar_kernel_slow CUDA: foreach_tensor_pow_scalar_kernel_cuda + PrivateUse1: foreach_tensor_pow_scalar_kernel_zoom - func: _foreach_pow.ScalarList(Tensor[] self, Scalar[] exponent) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -11204,6 +11321,7 @@ dispatch: CPU: foreach_tensor_pow_scalarlist_kernel_slow CUDA: foreach_tensor_pow_scalarlist_kernel_cuda + PrivateUse1: foreach_tensor_pow_scalarlist_kernel_zoom - func: _foreach_pow.ScalarAndTensor(Scalar self, Tensor[] exponent) -> Tensor[] device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -11211,6 +11329,7 @@ dispatch: CPU: foreach_scalar_pow_list_kernel_slow CUDA: foreach_scalar_pow_list_kernel_cuda + PrivateUse1: foreach_scalar_pow_list_kernel_zoom - func: _foreach_pow_.List(Tensor(a!)[] self, Tensor[] exponent) -> () device_check: NoCheck @@ -11218,6 +11337,7 @@ dispatch: CPU: foreach_tensor_pow_list_kernel_slow_ CUDA: foreach_tensor_pow_list_kernel_cuda_ + PrivateUse1: foreach_tensor_pow_list_kernel_zoom_ autogen: _foreach_pow.List_out - func: _foreach_pow_.Scalar(Tensor(a!)[] self, Scalar exponent) -> () @@ -11226,6 +11346,7 @@ dispatch: CPU: foreach_tensor_pow_scalar_kernel_slow_ CUDA: foreach_tensor_pow_scalar_kernel_cuda_ + PrivateUse1: foreach_tensor_pow_scalar_kernel_zoom_ autogen: _foreach_pow.Scalar_out - func: _foreach_pow_.ScalarList(Tensor(a!)[] self, Scalar[] exponent) -> () @@ -11234,6 +11355,7 @@ dispatch: CPU: foreach_tensor_pow_scalarlist_kernel_slow_ CUDA: foreach_tensor_pow_scalarlist_kernel_cuda_ + PrivateUse1: foreach_tensor_pow_scalarlist_kernel_zoom_ autogen: _foreach_pow.ScalarList_out - func: _foreach_reciprocal(Tensor[] self) -> Tensor[] @@ -11242,6 +11364,7 @@ dispatch: CPU: foreach_tensor_reciprocal_slow CUDA: foreach_tensor_reciprocal_cuda + PrivateUse1: foreach_tensor_reciprocal_zoom - func: _foreach_reciprocal_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor 
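For context, a minimal Python smoke test of the PrivateUse1 entries registered above might look like the sketch below. It assumes the built wheel exposes the backend under the device name "zoom" and that importing torch.zoom is enough to register these kernels; both the device string and the import path are assumptions for illustration, not something this patch series guarantees.

    # Illustrative sketch only: the "zoom" device string and the explicit
    # torch.zoom import are assumptions about how the backend registers itself.
    import torch
    import torch.zoom  # provided by torch/zoom/__init__.py in this series

    x = torch.linspace(0, 1, steps=5, device="zoom")  # should hit linspace_zoom_out
    y = torch.arange(0, 10, 2, device="zoom")         # RangeFactories.cu kernels
    zs = torch._foreach_log2([x, x])                  # foreach_tensor_log2_zoom
    print(x.cpu(), y.cpu(), [z.cpu() for z in zs])
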
are on different devices @@ -11249,6 +11372,7 @@ dispatch: CPU: foreach_tensor_reciprocal_slow_ CUDA: foreach_tensor_reciprocal_cuda_ + PrivateUse1: foreach_tensor_reciprocal_zoom_ autogen: _foreach_reciprocal.out - func: _foreach_round(Tensor[] self) -> Tensor[] @@ -11257,6 +11381,7 @@ dispatch: CPU: foreach_tensor_round_slow CUDA: foreach_tensor_round_cuda + PrivateUse1: foreach_tensor_round_zoom - func: _foreach_round_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -11264,6 +11389,7 @@ dispatch: CPU: foreach_tensor_round_slow_ CUDA: foreach_tensor_round_cuda_ + PrivateUse1: foreach_tensor_round_zoom_ autogen: _foreach_round.out - func: _foreach_sigmoid(Tensor[] self) -> Tensor[] @@ -11272,6 +11398,7 @@ dispatch: CPU: foreach_tensor_sigmoid_slow CUDA: foreach_tensor_sigmoid_cuda + PrivateUse1: foreach_tensor_sigmoid_zoom - func: _foreach_sigmoid_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -11279,6 +11406,7 @@ dispatch: CPU: foreach_tensor_sigmoid_slow_ CUDA: foreach_tensor_sigmoid_cuda_ + PrivateUse1: foreach_tensor_sigmoid_zoom_ autogen: _foreach_sigmoid.out - func: _foreach_sign(Tensor[] self) -> Tensor[] @@ -11287,6 +11415,7 @@ dispatch: CPU: foreach_tensor_sign_slow CUDA: foreach_tensor_sign_cuda + PrivateUse1: foreach_tensor_sign_zoom - func: _foreach_sign_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -11294,6 +11423,7 @@ dispatch: CPU: foreach_tensor_sign_slow_ CUDA: foreach_tensor_sign_cuda_ + PrivateUse1: foreach_tensor_sign_zoom_ autogen: _foreach_sign.out - func: _foreach_sin(Tensor[] self) -> Tensor[] @@ -11302,6 +11432,7 @@ dispatch: CPU: foreach_tensor_sin_slow CUDA: foreach_tensor_sin_cuda + PrivateUse1: foreach_tensor_sin_zoom - func: _foreach_sin_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -11309,6 +11440,7 @@ dispatch: CPU: foreach_tensor_sin_slow_ CUDA: foreach_tensor_sin_cuda_ + PrivateUse1: foreach_tensor_sin_zoom_ autogen: _foreach_sin.out - func: _foreach_sinh(Tensor[] self) -> Tensor[] @@ -11317,6 +11449,7 @@ dispatch: CPU: foreach_tensor_sinh_slow CUDA: foreach_tensor_sinh_cuda + PrivateUse1: foreach_tensor_sinh_zoom - func: _foreach_sinh_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -11324,6 +11457,7 @@ dispatch: CPU: foreach_tensor_sinh_slow_ CUDA: foreach_tensor_sinh_cuda_ + PrivateUse1: foreach_tensor_sinh_zoom_ autogen: _foreach_sinh.out - func: _foreach_sqrt(Tensor[] self) -> Tensor[] @@ -11332,6 +11466,7 @@ dispatch: CPU: foreach_tensor_sqrt_slow CUDA: foreach_tensor_sqrt_cuda + PrivateUse1: foreach_tensor_sqrt_zoom - func: _foreach_sqrt_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -11339,6 +11474,7 @@ dispatch: CPU: foreach_tensor_sqrt_slow_ CUDA: foreach_tensor_sqrt_cuda_ + PrivateUse1: foreach_tensor_sqrt_zoom_ autogen: _foreach_sqrt.out - func: _foreach_tan(Tensor[] self) -> Tensor[] @@ -11347,6 +11483,7 @@ dispatch: CPU: foreach_tensor_tan_slow CUDA: foreach_tensor_tan_cuda + PrivateUse1: foreach_tensor_tan_zoom - func: _foreach_tan_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ 
-11354,6 +11491,7 @@ dispatch: CPU: foreach_tensor_tan_slow_ CUDA: foreach_tensor_tan_cuda_ + PrivateUse1: foreach_tensor_tan_zoom_ autogen: _foreach_tan.out - func: _foreach_tanh(Tensor[] self) -> Tensor[] @@ -11362,6 +11500,7 @@ dispatch: CPU: foreach_tensor_tanh_slow CUDA: foreach_tensor_tanh_cuda + PrivateUse1: foreach_tensor_tanh_zoom - func: _foreach_tanh_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -11369,6 +11508,7 @@ dispatch: CPU: foreach_tensor_tanh_slow_ CUDA: foreach_tensor_tanh_cuda_ + PrivateUse1: foreach_tensor_tanh_zoom_ autogen: _foreach_tanh.out - func: _foreach_trunc(Tensor[] self) -> Tensor[] @@ -11377,6 +11517,7 @@ dispatch: CPU: foreach_tensor_trunc_slow CUDA: foreach_tensor_trunc_cuda + PrivateUse1: foreach_tensor_trunc_zoom - func: _foreach_trunc_(Tensor(a!)[] self) -> () device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices @@ -11384,6 +11525,7 @@ dispatch: CPU: foreach_tensor_trunc_slow_ CUDA: foreach_tensor_trunc_cuda_ + PrivateUse1: foreach_tensor_trunc_zoom_ autogen: _foreach_trunc.out - func: _foreach_zero_(Tensor(a!)[] self) -> () @@ -11392,6 +11534,7 @@ dispatch: CPU: foreach_tensor_zero_slow_ CUDA: foreach_tensor_zero_cuda_ + PrivateUse1: foreach_tensor_zero_zoom_ autogen: _foreach_zero, _foreach_zero.out - func: _foreach_copy_(Tensor(a!)[] self, Tensor[] src, bool non_blocking=False) -> () @@ -11400,6 +11543,7 @@ dispatch: CPU: foreach_tensor_copy_list_kernel_slow_ CUDA: foreach_tensor_copy_list_kernel_cuda_ + PrivateUse1: foreach_tensor_copy_list_kernel_zoom_ autogen: _foreach_copy.out - func: _foreach_copy(Tensor[] self, Tensor[] src, bool non_blocking=False) -> Tensor[] self_out @@ -11573,6 +11717,7 @@ dispatch: CPU: nll_loss_forward_out_cpu CUDA: nll_loss_forward_out_cuda + PrivateUse1: nll_loss_forward_out_zoom MPS: nll_loss_forward_out_mps - func: nll_loss_forward(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index) -> (Tensor output, Tensor total_weight) @@ -11585,6 +11730,7 @@ dispatch: CPU: nll_loss_backward_out_cpu CUDA: nll_loss_backward_out_cuda + PrivateUse1: nll_loss_backward_out_zoom MPS: nll_loss_backward_out_mps - func: nll_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight) -> Tensor @@ -11604,6 +11750,7 @@ dispatch: CPU: nll_loss2d_forward_out_cpu CUDA: nll_loss2d_forward_out_cuda + PrivateUse1: nll_loss2d_forward_out_zoom MPS: nll_loss2d_forward_out_mps - func: nll_loss2d_forward(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index) -> (Tensor output, Tensor total_weight) @@ -11611,6 +11758,7 @@ dispatch: CPU: nll_loss2d_forward_cpu CUDA: nll_loss2d_forward_cuda + PrivateUse1: nll_loss2d_forward_zoom MPS: nll_loss2d_forward_mps - func: nll_loss2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!) @@ -11618,6 +11766,7 @@ dispatch: CPU: nll_loss2d_backward_out_cpu CUDA: nll_loss2d_backward_out_cuda + PrivateUse1: nll_loss2d_backward_out_zoom MPS: nll_loss2d_backward_out_mps - func: nll_loss2d_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? 
weight, int reduction, SymInt ignore_index, Tensor total_weight) -> Tensor @@ -11625,6 +11774,7 @@ dispatch: CPU: nll_loss2d_backward_cpu CUDA: nll_loss2d_backward_cuda + PrivateUse1: nll_loss2d_backward_zoom MPS: nll_loss2d_backward_mps - func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!) @@ -11702,7 +11852,7 @@ device_check: NoCheck # TensorIterator python_module: nn dispatch: - CPU, CUDA: elu_out + CPU, CUDA, PrivateUse1: elu_out MPS: elu_out_mps - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor @@ -11715,7 +11865,7 @@ structured_inherits: TensorIteratorBase python_module: nn dispatch: - CPU, CUDA: elu_backward_out + CPU, CUDA, PrivateUse1: elu_backward_out MPS: elu_backward_out_mps - func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result) -> Tensor @@ -11745,6 +11895,7 @@ dispatch: CPU: glu_backward_cpu_out CUDA: glu_backward_cuda_out + PrivateUse1: glu_backward_zoom_out MPS: glu_backward_mps_out - func: glu_backward(Tensor grad_output, Tensor self, int dim) -> Tensor @@ -11752,18 +11903,19 @@ dispatch: CPU: glu_backward_cpu CUDA: glu_backward_cuda + PrivateUse1: glu_backward_zoom MPS: glu_backward_mps - func: glu_jvp(Tensor glu, Tensor x, Tensor dx, int dim) -> Tensor python_module: nn dispatch: - CPU, CUDA: glu_jvp + CPU, CUDA, PrivateUse1: glu_jvp autogen: glu_jvp.out - func: glu_backward_jvp(Tensor grad_x, Tensor grad_glu, Tensor x, Tensor dgrad_glu, Tensor dx, int dim) -> Tensor python_module: nn dispatch: - CPU, CUDA: glu_backward_jvp + CPU, CUDA, PrivateUse1: glu_backward_jvp autogen: glu_backward_jvp.out - func: hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -11772,7 +11924,7 @@ device_check: NoCheck # TensorIterator python_module: nn dispatch: - CPU, CUDA: hardsigmoid_out + CPU, CUDA, PrivateUse1: hardsigmoid_out MPS: hardsigmoid_out_mps QuantizedCPU: hardsigmoid_out_quantized_cpu @@ -11793,7 +11945,7 @@ structured_inherits: TensorIteratorBase python_module: nn dispatch: - CPU, CUDA: hardsigmoid_backward_out + CPU, CUDA, PrivateUse1: hardsigmoid_backward_out MPS: hardsigmoid_backward_out_mps - func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor @@ -11804,61 +11956,61 @@ device_check: NoCheck # TensorIterator python_module: nn dispatch: - CPU, CUDA, MPS: hardtanh_out + CPU, CUDA, PrivateUse1, MPS: hardtanh_out QuantizedCPU: hardtanh_out_quantized_cpu - func: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor device_check: NoCheck # TensorIterator python_module: nn dispatch: - CPU, CUDA, MPS: hardtanh + CPU, CUDA, PrivateUse1, MPS: hardtanh QuantizedCPU: hardtanh_quantized_cpu tags: core - func: hardtanh_backward.grad_input(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: - CPU, CUDA: hardtanh_backward_out + CPU, CUDA, PrivateUse1: hardtanh_backward_out MPS: hardtanh_backward_out_mps - func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor python_module: nn dispatch: - CPU, CUDA: hardtanh_backward + CPU, CUDA, PrivateUse1: hardtanh_backward MPS: hardtanh_backward_mps - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) 
device_check: NoCheck # TensorIterator python_module: nn dispatch: - CPU, CUDA, MPS: hardtanh_ + CPU, CUDA, PrivateUse1, MPS: hardtanh_ QuantizedCPU: hardtanh_quantized_cpu_ - func: hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator python_module: nn dispatch: - CPU, CUDA: hardswish_out + CPU, CUDA, PrivateUse1: hardswish_out MPS: hardswish_out_mps - func: hardswish(Tensor self) -> Tensor device_check: NoCheck # TensorIterator python_module: nn dispatch: - CPU, CUDA: hardswish + CPU, CUDA, PrivateUse1: hardswish MPS: hardswish_mps - func: hardswish_(Tensor(a!) self) -> Tensor(a!) device_check: NoCheck # TensorIterator python_module: nn dispatch: - CPU, CUDA: hardswish_ + CPU, CUDA, PrivateUse1: hardswish_ MPS: hardswish_mps_ - func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor python_module: nn dispatch: - CPU, CUDA: hardswish_backward + CPU, CUDA, PrivateUse1: hardswish_backward MPS: hardswish_backward_mps autogen: hardswish_backward.out @@ -11868,7 +12020,7 @@ device_check: NoCheck # TensorIterator python_module: nn dispatch: - CPU, CUDA: leaky_relu_out + CPU, CUDA, PrivateUse1: leaky_relu_out MPS: leaky_relu_out_mps QuantizedCPU: leaky_relu_out_quantized_cpu @@ -11885,7 +12037,7 @@ structured_inherits: TensorIteratorBase python_module: nn dispatch: - CPU, CUDA: leaky_relu_backward_out + CPU, CUDA, PrivateUse1: leaky_relu_backward_out MPS: leaky_relu_backward_out_mps - func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor @@ -11913,6 +12065,7 @@ dispatch: CPU: log_sigmoid_forward_out_cpu CUDA: log_sigmoid_forward_out_cuda + PrivateUse1: log_sigmoid_forward_out_zoom MPS: log_sigmoid_forward_out_mps - func: log_sigmoid_forward(Tensor self) -> (Tensor output, Tensor buffer) @@ -11921,6 +12074,7 @@ dispatch: CPU: log_sigmoid_forward_cpu CUDA: log_sigmoid_forward_cuda + PrivateUse1: log_sigmoid_forward_zoom MPS: log_sigmoid_forward_mps - func: log_sigmoid_backward.grad_input(Tensor grad_output, Tensor self, Tensor buffer, *, Tensor(a!) grad_input) -> Tensor(a!) @@ -11928,6 +12082,7 @@ dispatch: CPU: log_sigmoid_backward_cpu_out CUDA: log_sigmoid_backward_cuda_out + PrivateUse1: log_sigmoid_backward_zoom_out MPS: log_sigmoid_backward_mps_out - func: log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor @@ -11935,6 +12090,7 @@ dispatch: CPU: log_sigmoid_backward_cpu CUDA: log_sigmoid_backward_cuda + PrivateUse1: log_sigmoid_backward_zoom MPS: log_sigmoid_backward_mps - func: rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!) @@ -11943,12 +12099,14 @@ dispatch: CPU: rrelu_with_noise_out_cpu CUDA: rrelu_with_noise_out_cuda + PrivateUse1: rrelu_with_noise_out_zoom - func: rrelu_with_noise(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? 
generator=None) -> Tensor python_module: nn dispatch: CPU: rrelu_with_noise_cpu CUDA: rrelu_with_noise_cuda + PrivateUse1: rrelu_with_noise_zoom tags: nondeterministic_seeded - func: rrelu_with_noise_backward(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result) -> Tensor @@ -11963,6 +12121,7 @@ dispatch: CPU: rrelu_with_noise_cpu_ CUDA: rrelu_with_noise_cuda_ + PrivateUse1: rrelu_with_noise_zoom_ - func: softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!) structured: True @@ -11970,7 +12129,7 @@ device_check: NoCheck # TensorIterator python_module: nn dispatch: - CPU, CUDA: softplus_out + CPU, CUDA, PrivateUse1: softplus_out MPS: softplus_out_mps - func: softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor @@ -11983,7 +12142,7 @@ structured_inherits: TensorIteratorBase python_module: nn dispatch: - CPU, CUDA: softplus_backward_out + CPU, CUDA, PrivateUse1: softplus_backward_out MPS: softplus_backward_out_mps - func: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold) -> Tensor @@ -12009,7 +12168,7 @@ structured_inherits: TensorIteratorBase python_module: nn dispatch: - CPU, CUDA: softshrink_backward_out + CPU, CUDA, PrivateUse1: softshrink_backward_out MPS: softshrink_backward_out_mps - func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor @@ -13066,6 +13225,7 @@ variants: method dispatch: CUDA: record_stream_cuda + PrivateUse1: record_stream_zoom - func: isposinf(Tensor self) -> Tensor variants: function, method diff --git a/aten/src/ATen/native/zoom/Activation.cpp b/aten/src/ATen/native/zoom/Activation.cpp new file mode 100644 index 00000000000000..039585b1e71605 --- /dev/null +++ b/aten/src/ATen/native/zoom/Activation.cpp @@ -0,0 +1,108 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#endif + +namespace at::native { + +// ----------------------------------- +// glu backward +// ----------------------------------- + +Tensor& glu_backward_zoom_out(const Tensor& grad_output, const Tensor& input, + int64_t dim, Tensor& grad_input) { + TORCH_CHECK(input.dim() > 0, "glu does not support 0-dimensional tensors"); + auto wrap_dim = maybe_wrap_dim(dim, input.dim()); + auto input_sizes = input.sizes(); + const int64_t nIn = input_sizes[wrap_dim]; + TORCH_CHECK(nIn % 2 == 0, "Halving dimension must be even, but dimension ", + wrap_dim, " is size ", nIn); + + resize_output(grad_input, input_sizes); + + DimVector iter_shape(input_sizes); + const auto dim_size = nIn / 2; + iter_shape[wrap_dim] = dim_size; + TORCH_CHECK(grad_output.sizes() == IntArrayRef{iter_shape}); + + const auto iter = at::TensorIteratorConfig() + .add_output(grad_input) + .add_const_input(input) + .add_const_input(grad_output) + .resize_outputs(false) + .declare_static_shape(iter_shape) + .build(); + + if (iter.numel() == 0) { + return grad_input; + } + + const auto I_stride = input.strides()[wrap_dim] * dim_size; + const auto gI_stride = grad_input.strides()[wrap_dim] * dim_size; + + if (iter.can_use_32bit_indexing()) { + launch_glu_backward_kernel(iter, gI_stride, I_stride); + } else { + for (const auto& sub_iter: iter.with_32bit_indexing()) { + launch_glu_backward_kernel(sub_iter, gI_stride, I_stride); + } + } + return grad_input; +} + +Tensor 
glu_backward_zoom(const Tensor& grad_output, const Tensor& input, int64_t dim) { + auto grad_input = at::empty({0}, input.options()); + return glu_backward_zoom_out(grad_output, input, dim, grad_input); +} + +// ----------------------------------- +// log_sigmoid forward +// ----------------------------------- + +std::tuple log_sigmoid_forward_out_zoom(const Tensor& input, Tensor& result, Tensor& buffer) { + // NOTE: buffer is only used by CPU dispatch, we just ignore it here + auto iter = TensorIteratorConfig() + .add_output(result) + .add_const_input(input) + .build(); + launch_log_sigmoid_forward_kernel(iter); + return std::forward_as_tuple(result, buffer); +} + +std::tuple log_sigmoid_forward_zoom(const Tensor& input) { + auto result = at::empty_like(input); + auto buffer = at::empty({0}, input.options()); + log_sigmoid_forward_out_zoom(input, result, buffer); + return std::forward_as_tuple(result, buffer); +} + +TORCH_IMPL_FUNC(gelu_out_zoom) ( + const Tensor& /*self*/, c10::string_view approximate, const Tensor& /*result*/ +) { + GeluZoomKernelImpl(*this, get_gelutype_enum(approximate)); +} + +TORCH_IMPL_FUNC(gelu_backward_out_zoom) ( + const Tensor& /*grad*/, const Tensor& /*self*/, c10::string_view approximate, const Tensor& /*grad_input*/ +) { + GeluBackwardZoomKernelImpl(*this, get_gelutype_enum(approximate)); +} + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/Activation.h b/aten/src/ATen/native/zoom/Activation.h new file mode 100644 index 00000000000000..309d316bd5fd7d --- /dev/null +++ b/aten/src/ATen/native/zoom/Activation.h @@ -0,0 +1,20 @@ +#pragma once +#include +#include + +namespace at { +struct TensorIteratorBase; +class TensorBase; +} + +namespace at { namespace native { + +void launch_glu_backward_kernel(const TensorIteratorBase& iter, + int64_t gI_stride, int64_t I_stride); + +void launch_log_sigmoid_forward_kernel(TensorIteratorBase& iter); + +void GeluZoomKernelImpl(TensorIteratorBase& it, GeluType approximate); +void GeluBackwardZoomKernelImpl(TensorIteratorBase& it, GeluType approximate); + +}} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ActivationEluKernel.cu b/aten/src/ATen/native/zoom/ActivationEluKernel.cu new file mode 100644 index 00000000000000..e3f296a2a0ed89 --- /dev/null +++ b/aten/src/ATen/native/zoom/ActivationEluKernel.cu @@ -0,0 +1,86 @@ +#define TORCH_ASSERT_NO_OPERATORS +#define _USE_MATH_DEFINES + +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { +namespace { + +void elu_kernel( + TensorIteratorBase& iter, + const Scalar& alpha, + const Scalar& scale, + const Scalar& input_scale) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.dtype(), + "elu_zoom", + [&]() { + using opmath_t = at::opmath_type; + auto negcoef = alpha.to() * scale.to(); + auto poscoef = scale.to(); + auto negiptcoef = input_scale.to(); + gpu_kernel( + iter, + [negcoef, poscoef, negiptcoef] GPU_LAMBDA(scalar_t a) -> scalar_t { + opmath_t aop = static_cast(a); + return aop > 0 ? 
aop * poscoef + : std::expm1(aop * negiptcoef) * negcoef; + }); + }); +} + +void elu_backward_kernel( + TensorIteratorBase& iter, + const Scalar& alpha, + const Scalar& scale, + const Scalar& input_scale, + bool is_result) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.dtype(), + "elu_backward_zoom", + [&]() { + using opmath_t = at::opmath_type; + auto negcoef = alpha.to() * scale.to(); + auto poscoef = scale.to(); + auto negiptcoef = input_scale.to(); + gpu_kernel( + iter, + [negcoef, poscoef, negiptcoef, is_result] GPU_LAMBDA( + scalar_t a, scalar_t b) -> scalar_t { + opmath_t aop = static_cast(a); + opmath_t bop = static_cast(b); + + if (is_result) { + return bop <= 0 ? aop * negiptcoef * (bop + negcoef) + : aop * poscoef; + } else { + return bop <= 0 + ? aop * negiptcoef * negcoef * std::exp(bop * negiptcoef) + : aop * poscoef; + } + }); + }); +} +} // namespace + +REGISTER_PRIVATEUSE1_DISPATCH(elu_stub, &elu_kernel); +REGISTER_PRIVATEUSE1_DISPATCH(elu_backward_stub, &elu_backward_kernel); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ActivationGeluKernel.cu b/aten/src/ATen/native/zoom/ActivationGeluKernel.cu new file mode 100644 index 00000000000000..7da8acc5b7ab17 --- /dev/null +++ b/aten/src/ATen/native/zoom/ActivationGeluKernel.cu @@ -0,0 +1,88 @@ +#define TORCH_ASSERT_NO_OPERATORS +#define _USE_MATH_DEFINES + +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { + +void GeluZoomKernelImpl(TensorIteratorBase& it, GeluType approximate) { + if (approximate == GeluType::Tanh) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, it.dtype(), "GeluZoomKernelImpl", [&]() { + gpu_kernel(it, [] GPU_LAMBDA(scalar_t x) -> scalar_t { + using opmath_t = at::opmath_type; + constexpr opmath_t kBeta = M_SQRT2 * M_2_SQRTPI * opmath_t(0.5); + constexpr opmath_t kKappa = 0.044715; + auto x_cube = static_cast(x) * static_cast(x) * static_cast(x); + auto inner = kBeta * (static_cast(x) + kKappa * x_cube); + return opmath_t(0.5) * static_cast(x) * (opmath_t(1) + c10::hip::compat::tanh(inner)); + }); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, it.dtype(), "GeluZoomKernelImpl", [&]() { + gpu_kernel(it, [] GPU_LAMBDA(scalar_t x) -> scalar_t { + using opmath_t = at::opmath_type; + constexpr opmath_t kAlpha = M_SQRT1_2; + return static_cast(x) * opmath_t(0.5) * (opmath_t(1) + ::erf(static_cast(x) * kAlpha)); + }); + }); + } +} + +void GeluBackwardZoomKernelImpl(TensorIteratorBase& it, GeluType approximate) { + if (approximate == GeluType::Tanh) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, + it.dtype(), "GeluBackwardZoomKernelImpl", [&]() { + gpu_kernel(it, [] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { + using opmath_t = at::opmath_type; + constexpr opmath_t kBeta = M_SQRT2 * M_2_SQRTPI * opmath_t(0.5); + constexpr opmath_t kKappa = 0.044715; + auto x_sq = static_cast(x) * static_cast(x); + auto x_cube = x_sq * static_cast(x); + auto inner = kBeta * (static_cast(x) + kKappa * x_cube); + auto tanh_inner = c10::hip::compat::tanh(inner); + + auto left = opmath_t(0.5) * static_cast(x); + auto right = opmath_t(1) + tanh_inner; + + auto left_derivative = opmath_t(0.5) * right; + + auto tanh_derivative = opmath_t(1) - tanh_inner * tanh_inner; + auto inner_derivative = kBeta * (opmath_t(1) + opmath_t(3) * kKappa * x_sq); + auto 
right_derivative = left * tanh_derivative * inner_derivative; + + return static_cast(dy) * (left_derivative + right_derivative); + }); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, + it.dtype(), "GeluBackwardZoomKernelImpl", [&]() { + gpu_kernel(it, [] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { + using opmath_t = at::opmath_type; + constexpr opmath_t kBeta = M_2_SQRTPI * M_SQRT1_2 * opmath_t(0.5); + constexpr opmath_t kAlpha = M_SQRT1_2; + const opmath_t cdf = + opmath_t(0.5) * (opmath_t(1) + ::erf(static_cast(x) * kAlpha)); + const opmath_t pdf = + c10::hip::compat::exp( + opmath_t(-0.5) * static_cast(x) * static_cast(x)) * + kBeta; + return static_cast(dy) * (cdf + static_cast(x) * pdf); + }); + }); + } +} + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ActivationGluKernel.cu b/aten/src/ATen/native/zoom/ActivationGluKernel.cu new file mode 100644 index 00000000000000..c98794cf016a03 --- /dev/null +++ b/aten/src/ATen/native/zoom/ActivationGluKernel.cu @@ -0,0 +1,141 @@ +#define TORCH_ASSERT_NO_OPERATORS +#define _USE_MATH_DEFINES + +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { + +// ----------------------------------- +// glu forward +// ----------------------------------- +void glu_kernel(TensorIteratorBase& iter) { + AT_DISPATCH_FLOATING_TYPES_AND2( + kHalf, kBFloat16, iter.dtype(), "glu_zoom", [&]() { + using opmath_t = at::opmath_type; + gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a_, scalar_t b_) -> scalar_t { + const opmath_t a = a_; + const opmath_t b = b_; + const opmath_t one = opmath_t(1); + const opmath_t sigmoid = one / (one + std::exp(-b)); + return a * sigmoid; + }); + }); +} + +// ----------------------------------- +// glu forward ad +// ----------------------------------- +void glu_jvp_kernel(TensorIteratorBase& iter) { + AT_DISPATCH_FLOATING_TYPES_AND2( + kHalf, kBFloat16, iter.dtype(), "glu_zoom", [&]() { + using opmath_t = at::opmath_type; + gpu_kernel( + iter, + [] GPU_LAMBDA( + scalar_t res_, scalar_t b_, scalar_t da_, scalar_t db_) + -> scalar_t { + const opmath_t res = res_; + const opmath_t b = b_; + const opmath_t da = da_; + const opmath_t db = db_; + const opmath_t one = opmath_t(1); + + const opmath_t sig_b = one / (one + std::exp(-b)); + return (da * sig_b + res * (db - sig_b * db)); + }); + }); +} + +// ----------------------------------- +// glu backward +// ----------------------------------- + +// Byte offsets don't require multiplication by sizeof(T), so are slightly +// cheaper. For fixed offsets, this removes all penalty from 64-bit indexing. +template +__device__ T* byte_offset(T* ptr, int64_t offset) { + using byte_ptr_t = typename std:: + conditional::value, const char*, char*>::type; + return reinterpret_cast(reinterpret_cast(ptr) + offset); +} + +template +__global__ void glu_backward_kernel( + int numel, + scalar_t* gI, + const scalar_t* I, + const scalar_t* gO, + OffsetCalc offset_calculator, + int64_t gI_byte_offset, + int64_t I_byte_offset) { + using opmath_t = at::opmath_type; + + const uint32_t linear_index = blockIdx.x * blockDim.x + threadIdx.x; + if (linear_index >= numel) { + return; + } + const auto offsets = offset_calculator.get(linear_index); + + // We explicitly iterate over the first half of the input tensor, and + // gI_byte_offset and I_byte_offset are the offsets to access the + // corresponding index in the second half of the tensor. 
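+  // For reference: the forward pass computes out = a * sigmoid(b), where a is
+  // the first half of the halving dimension and b the second half, so
+  //   d(out)/da = sigmoid(b)
+  //   d(out)/db = a * sigmoid(b) * (1 - sigmoid(b))
+  // which is exactly what the two stores into gA and gB below write out,
+  // scaled by the incoming gradient gO.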
+  const opmath_t a = I[offsets[1]];
+  const opmath_t b = *byte_offset(I + offsets[1], I_byte_offset);
+  const opmath_t gO_val = gO[offsets[2]];
+
+  const auto one = opmath_t(1);
+  const opmath_t sigmoid = one / (one + std::exp(-b));
+
+  auto* gA = gI + offsets[0];
+  *gA = sigmoid * gO_val;
+
+  auto* gB = byte_offset(gA, gI_byte_offset);
+  *gB = (one - sigmoid) * sigmoid * gO_val * a;
+}
+
+void launch_glu_backward_kernel(
+    const TensorIteratorBase& iter,
+    int64_t gI_stride,
+    int64_t I_stride) {
+  const auto N = iter.numel();
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      N > 0 && N <= std::numeric_limits<int32_t>::max());
+  const auto offset_calculator = make_element_offset_calculator<3>(iter);
+  constexpr int64_t block_size = 256;
+  const int64_t grid = (N + block_size - 1) / block_size;
+  const auto stream = c10::zoom::getCurrentZoomStream();
+
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      kHalf, kBFloat16, iter.common_dtype(), "glu_backward_zoom", [&] {
+        auto gI = static_cast<scalar_t*>(iter.data_ptr(0));
+        auto I = static_cast<const scalar_t*>(iter.data_ptr(1));
+        auto gO = static_cast<const scalar_t*>(iter.data_ptr(2));
+        glu_backward_kernel<<<grid, block_size, 0, stream>>>(
+            N,
+            gI,
+            I,
+            gO,
+            offset_calculator,
+            gI_stride * sizeof(scalar_t),
+            I_stride * sizeof(scalar_t));
+        C10_ZOOM_KERNEL_LAUNCH_CHECK();
+      });
+}
+
+REGISTER_PRIVATEUSE1_DISPATCH(glu_stub, &glu_kernel);
+REGISTER_PRIVATEUSE1_DISPATCH(glu_jvp_stub, &glu_jvp_kernel);
+
+} // namespace at::native
diff --git a/aten/src/ATen/native/zoom/ActivationHardshrinkKernel.cu b/aten/src/ATen/native/zoom/ActivationHardshrinkKernel.cu
new file mode 100644
index 00000000000000..cb581dbc9d661a
--- /dev/null
+++ b/aten/src/ATen/native/zoom/ActivationHardshrinkKernel.cu
@@ -0,0 +1,39 @@
+#define TORCH_ASSERT_NO_OPERATORS
+#define _USE_MATH_DEFINES
+
+#include
+
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace at::native {
+namespace {
+
+void hardshrink_kernel(TensorIteratorBase& iter, const Scalar& value) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      iter.dtype(),
+      "hardshrink_zoom",
+      [&]() {
+        auto lambd = value.to<scalar_t>();
+        gpu_kernel(iter, [lambd] GPU_LAMBDA(scalar_t a) -> scalar_t {
+          return (a >= -lambd && a <= lambd) ? scalar_t(0) : a;
+        });
+      });
+}
+} // namespace
+
+REGISTER_PRIVATEUSE1_DISPATCH(hardshrink_stub, &hardshrink_kernel);
+
+} // namespace at::native
diff --git a/aten/src/ATen/native/zoom/ActivationHardsigmoidKernel.cu b/aten/src/ATen/native/zoom/ActivationHardsigmoidKernel.cu
new file mode 100644
index 00000000000000..3af90e876b6e81
--- /dev/null
+++ b/aten/src/ATen/native/zoom/ActivationHardsigmoidKernel.cu
@@ -0,0 +1,74 @@
+#define TORCH_ASSERT_NO_OPERATORS
+#define _USE_MATH_DEFINES
+
+#include
+
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace at::native {
+namespace {
+
+void hardsigmoid_kernel(TensorIteratorBase& iter) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      iter.dtype(),
+      "hardsigmoid_zoom",
+      [&]() {
+        using opmath_t = at::opmath_type<scalar_t>;
+        const opmath_t zero(0.0f);
+        const opmath_t one_sixth(1.0f / 6.0f);
+        const opmath_t three(3.0f);
+        const opmath_t six(6.0f);
+        gpu_kernel(
+            iter,
+            [zero, one_sixth, three, six] GPU_LAMBDA(
+                scalar_t self_val) -> scalar_t {
+              opmath_t x = static_cast<opmath_t>(self_val);
+              return std::min(std::max(x + three, zero), six) * one_sixth;
+            });
+      });
+}
+
+void hardsigmoid_backward_kernel(TensorIteratorBase& iter) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      iter.dtype(),
+      "hardsigmoid_backward_zoom",
+      [&]() {
+        using opmath_t = at::opmath_type<scalar_t>;
+        const opmath_t zero(0.0f);
+        const opmath_t three(3.0f);
+        const opmath_t neg_three(-3.0f);
+        const opmath_t one_sixth(1.0f / 6.0f);
+        gpu_kernel(
+            iter,
+            [zero, three, neg_three, one_sixth] GPU_LAMBDA(
+                scalar_t grad_val_, scalar_t self_val_) -> scalar_t {
+              opmath_t grad_val = static_cast<opmath_t>(grad_val_);
+              opmath_t self_val = static_cast<opmath_t>(self_val_);
+              return (self_val > neg_three && self_val < three)
+                  ?
grad_val * one_sixth + : zero; + }); + }); +} + +} // namespace + +REGISTER_PRIVATEUSE1_DISPATCH(hardsigmoid_stub, &hardsigmoid_kernel); +REGISTER_PRIVATEUSE1_DISPATCH(hardsigmoid_backward_stub, &hardsigmoid_backward_kernel); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ActivationHardswishKernel.cu b/aten/src/ATen/native/zoom/ActivationHardswishKernel.cu new file mode 100644 index 00000000000000..5b4704cbf85ab8 --- /dev/null +++ b/aten/src/ATen/native/zoom/ActivationHardswishKernel.cu @@ -0,0 +1,63 @@ +#define TORCH_ASSERT_NO_OPERATORS +#define _USE_MATH_DEFINES + +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { +namespace { + +void hardswish_kernel(TensorIterator& iter) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "hardswish_zoom", [&]() { + using opmath_t = at::opmath_type; + const opmath_t zero(0.0f); + const opmath_t one_sixth(1.0f / 6.0f); + const opmath_t three(3.0f); + const opmath_t six(6.0f); + gpu_kernel(iter, [zero, one_sixth, three, six]GPU_LAMBDA(scalar_t self_val) -> scalar_t { + opmath_t x = static_cast(self_val); + return x * std::min(std::max(x + three, zero), six) * one_sixth; + }); + }); +} + +void hardswish_backward_kernel(TensorIterator& iter) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "hardswish_backward_zoom", [&]() { + using opmath_t = at::opmath_type; + const opmath_t zero(0.0f); + const opmath_t three(3.0f); + const opmath_t neg_three(-3.0f); + const opmath_t one_half(0.5f); + gpu_kernel( + iter, + [zero, three, neg_three, one_half]GPU_LAMBDA(scalar_t grad_val_, scalar_t self_val_) -> scalar_t { + opmath_t grad_val = static_cast(grad_val_); + opmath_t self_val = static_cast(self_val_); + if (self_val < neg_three) { + return zero; + } else if (self_val <= three) { + return grad_val * ((self_val / three) + one_half); + } else { + return grad_val; + } + }); + }); +} +} // namespace + +REGISTER_PRIVATEUSE1_DISPATCH(hardswish_stub, &hardswish_kernel); +REGISTER_PRIVATEUSE1_DISPATCH(hardswish_backward_stub, &hardswish_backward_kernel); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ActivationHardtanhKernel.cu b/aten/src/ATen/native/zoom/ActivationHardtanhKernel.cu new file mode 100644 index 00000000000000..ecd11f23e87fa3 --- /dev/null +++ b/aten/src/ATen/native/zoom/ActivationHardtanhKernel.cu @@ -0,0 +1,45 @@ +#define TORCH_ASSERT_NO_OPERATORS +#define _USE_MATH_DEFINES + +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { +namespace { + +void hardtanh_backward_kernel( + TensorIterator& iter, + const Scalar& min, + const Scalar& max) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, at::ScalarType::BFloat16, + iter.dtype(), "hardtanh_backward_zoom", [&]() { + using opmath_t = at::opmath_type; + auto min_val = min.to(); + auto max_val = max.to(); + gpu_kernel( + iter, + [min_val, max_val] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + opmath_t aop = static_cast(a); + opmath_t bop = static_cast(b); + return (bop <= min_val) || (bop >= max_val) ? 
opmath_t(0) : aop; + }); + }); +} +} // namespace + +REGISTER_PRIVATEUSE1_DISPATCH(hardtanh_backward_stub, &hardtanh_backward_kernel); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ActivationLeakyReluKernel.cu b/aten/src/ATen/native/zoom/ActivationLeakyReluKernel.cu new file mode 100644 index 00000000000000..94a9a8168c2b02 --- /dev/null +++ b/aten/src/ATen/native/zoom/ActivationLeakyReluKernel.cu @@ -0,0 +1,62 @@ +#define TORCH_ASSERT_NO_OPERATORS +#define _USE_MATH_DEFINES + +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { +namespace { + +void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.dtype(), + "leaky_relu_zoom", + [&]() { + using opmath_t = at::opmath_type; + auto negval = negval_.to(); + gpu_kernel(iter, [negval] GPU_LAMBDA(scalar_t a) -> scalar_t { + opmath_t aop = static_cast(a); + return aop > opmath_t(0) ? aop : aop * negval; + }); + }); +} + +void leaky_relu_backward_kernel( + TensorIteratorBase& iter, + const Scalar& negval_) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.dtype(), + "leaky_relu_backward_zoom", + [&]() { + using opmath_t = at::opmath_type; + auto negval = negval_.to(); + gpu_kernel( + iter, [negval] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + opmath_t aop = static_cast(a); + opmath_t bop = static_cast(b); + return aop > opmath_t(0) ? bop : bop * negval; + }); + }); +} +} // namespace + +REGISTER_PRIVATEUSE1_DISPATCH(leaky_relu_stub, &leaky_relu_kernel); +REGISTER_PRIVATEUSE1_DISPATCH(leaky_relu_backward_stub, &leaky_relu_backward_kernel); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ActivationLogSigmoidKernel.cu b/aten/src/ATen/native/zoom/ActivationLogSigmoidKernel.cu new file mode 100644 index 00000000000000..79bad5edc99db3 --- /dev/null +++ b/aten/src/ATen/native/zoom/ActivationLogSigmoidKernel.cu @@ -0,0 +1,64 @@ +#define TORCH_ASSERT_NO_OPERATORS +#define _USE_MATH_DEFINES + +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { + +// ----------------------------------- +// log_sigmoid forward +// ----------------------------------- + +void launch_log_sigmoid_forward_kernel(TensorIteratorBase& iter) { + AT_DISPATCH_FLOATING_TYPES_AND2( + kHalf, kBFloat16, iter.common_dtype(), "log_sigmoid_forward_zoom", [&] { + using opmath_t = at::opmath_type; + + gpu_kernel(iter, [] GPU_LAMBDA(scalar_t in_) -> scalar_t { + const opmath_t in = in_; + const auto min = std::min(opmath_t(0), in); + const auto z = std::exp(-std::abs(in)); + return min - std::log1p(z); + }); + }); +} + +namespace { +// ----------------------------------- +// log_sigmoid backward +// ----------------------------------- +void log_sigmoid_backward_kernel(TensorIterator& iter) { + AT_DISPATCH_FLOATING_TYPES_AND2( + kHalf, kBFloat16, iter.common_dtype(), "log_sigmoid_backward_zoom", [&] { + using opmath_t = at::opmath_type; + gpu_kernel( + iter, [] GPU_LAMBDA(scalar_t in_, scalar_t grad_out_) -> scalar_t { + const opmath_t in = in_; + const opmath_t grad_out = grad_out_; + + auto in_negative = in < opmath_t(0); + auto max_deriv = in_negative ? opmath_t(1) : opmath_t(0); + auto sign = in_negative ? 
opmath_t(1) : -opmath_t(1); + const auto z = std::exp(-std::abs(in)); + return grad_out * (max_deriv - sign * (z / (opmath_t(1) + z))); + }); + }); +} +} // namespace + +REGISTER_PRIVATEUSE1_DISPATCH(log_sigmoid_backward_stub, &log_sigmoid_backward_kernel); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ActivationMishKernel.cu b/aten/src/ATen/native/zoom/ActivationMishKernel.cu new file mode 100644 index 00000000000000..75d69dd119185c --- /dev/null +++ b/aten/src/ATen/native/zoom/ActivationMishKernel.cu @@ -0,0 +1,64 @@ +#define TORCH_ASSERT_NO_OPERATORS +#define _USE_MATH_DEFINES + +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { +namespace { + +void mish_kernel(TensorIteratorBase& iter) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.dtype(), + "mish_zoom", + [&]() { + gpu_kernel(iter, [] GPU_LAMBDA(scalar_t x) -> scalar_t { + using opmath_t = at::opmath_type; + const opmath_t x_acc = static_cast(x); + return x_acc * + c10::hip::compat::tanh( + c10::hip::compat::log1p(c10::hip::compat::exp(x_acc))); + }); + }); +} + +void mish_backward_kernel(TensorIterator& iter) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.dtype(), + "mish_backward_zoom", + [&]() { + gpu_kernel(iter, [] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { + using opmath_t = at::opmath_type; + const opmath_t dy_acc = static_cast(dy); + const opmath_t x_acc = static_cast(x); + const opmath_t s_acc = + opmath_t(1) / (opmath_t(1) + c10::hip::compat::exp(-x_acc)); + const opmath_t t_acc = c10::hip::compat::tanh( + c10::hip::compat::log1p(c10::hip::compat::exp(x_acc))); + return dy_acc * + (t_acc + x_acc * s_acc * (opmath_t(1) - t_acc * t_acc)); + }); + }); +} +} // namespace + +REGISTER_PRIVATEUSE1_DISPATCH(mish_stub, &mish_kernel); +REGISTER_PRIVATEUSE1_DISPATCH(mish_backward_stub, &mish_backward_kernel); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ActivationPreluKernel.cu b/aten/src/ATen/native/zoom/ActivationPreluKernel.cu new file mode 100644 index 00000000000000..512cc7224c5c85 --- /dev/null +++ b/aten/src/ATen/native/zoom/ActivationPreluKernel.cu @@ -0,0 +1,48 @@ +#define TORCH_ASSERT_NO_OPERATORS +#define _USE_MATH_DEFINES + +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { + +// ----------------------------------- +// prelu +// ----------------------------------- +void prelu_kernel(TensorIterator &iter) { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), "prelu_zoom", [&] { + gpu_kernel(iter, + [] GPU_LAMBDA (scalar_t input, scalar_t weight) -> scalar_t { + return (input > 0) ? input : weight * input; + }); + }); +} + +void prelu_backward_kernel(TensorIterator &iter) { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), "prelu_backward_zoom", [&] { + gpu_kernel_multiple_outputs(iter, + [] GPU_LAMBDA (scalar_t input, scalar_t weight, scalar_t grad) -> thrust::tuple { + auto mask = input > 0; + auto grad_input = mask ? grad : weight * grad; + auto grad_weight = mask ? 
scalar_t{0} : input * grad; + return {grad_input, grad_weight}; + }); + }); +} + +REGISTER_PRIVATEUSE1_DISPATCH(prelu_stub, &prelu_kernel); +REGISTER_PRIVATEUSE1_DISPATCH(prelu_backward_stub, &prelu_backward_kernel); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ActivationSiluKernel.cu b/aten/src/ATen/native/zoom/ActivationSiluKernel.cu new file mode 100644 index 00000000000000..04f7d204a3a97b --- /dev/null +++ b/aten/src/ATen/native/zoom/ActivationSiluKernel.cu @@ -0,0 +1,60 @@ +#define TORCH_ASSERT_NO_OPERATORS +#define _USE_MATH_DEFINES + +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { +namespace { + +void silu_kernel(TensorIteratorBase& iter) { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.dtype(), + "silu_zoom", + [&]() { + gpu_kernel(iter, [] GPU_LAMBDA(scalar_t x) -> scalar_t { + using opmath_t = at::opmath_type; + const opmath_t x_acc = static_cast(x); + return x_acc / (opmath_t(1) + ::exp(-x_acc)); + }); + }); +} + +void silu_backward_kernel(TensorIteratorBase& iter) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.dtype(), + "silu_backward_zoom", + [&]() { + gpu_kernel(iter, [] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { + using opmath_t = at::opmath_type; + const opmath_t dy_acc = static_cast(dy); + const opmath_t x_acc = static_cast(x); + const opmath_t s_acc = + opmath_t(1) / (opmath_t(1) + c10::hip::compat::exp(-x_acc)); + return dy_acc * s_acc * (opmath_t(1) + x_acc * (opmath_t(1) - s_acc)); + }); + }); +} +} // namespace + +REGISTER_PRIVATEUSE1_DISPATCH(silu_stub, &silu_kernel); +REGISTER_PRIVATEUSE1_DISPATCH(silu_backward_stub, &silu_backward_kernel); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ActivationSoftplusKernel.cu b/aten/src/ATen/native/zoom/ActivationSoftplusKernel.cu new file mode 100644 index 00000000000000..ed3358d225af7f --- /dev/null +++ b/aten/src/ATen/native/zoom/ActivationSoftplusKernel.cu @@ -0,0 +1,74 @@ +#define TORCH_ASSERT_NO_OPERATORS +#define _USE_MATH_DEFINES + +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { +namespace { + +void softplus_kernel( + TensorIteratorBase& iter, + const Scalar& beta_, + const Scalar& threshold_) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.dtype(), + "softplus_zoom", + [&]() { + using opmath_t = at::opmath_type; + auto beta = beta_.to(); + auto threshold = threshold_.to(); + gpu_kernel(iter, [beta, threshold] GPU_LAMBDA(scalar_t a) -> scalar_t { + opmath_t aop = static_cast(a); + return (aop * beta) > threshold + ? aop + : (::log1p(std::exp(aop * beta))) / beta; + }); + }); +} + +void softplus_backward_kernel( + TensorIteratorBase& iter, + const Scalar& beta_, + const Scalar& threshold_) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.dtype(), + "softplus_backward_zoom", + [&]() { + using opmath_t = at::opmath_type; + auto beta = beta_.to(); + auto threshold = threshold_.to(); + gpu_kernel( + iter, + [beta, threshold] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + opmath_t aop = static_cast(a); + opmath_t bop = static_cast(b); + opmath_t z = std::exp(bop * beta); + return (bop * beta) > threshold ? 
aop + : aop * z / (z + opmath_t(1.)); + }); + }); +} + +} // namespace + +REGISTER_PRIVATEUSE1_DISPATCH(softplus_stub, &softplus_kernel); +REGISTER_PRIVATEUSE1_DISPATCH(softplus_backward_stub, &softplus_backward_kernel); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ActivationSoftshrinkKernel.cu b/aten/src/ATen/native/zoom/ActivationSoftshrinkKernel.cu new file mode 100644 index 00000000000000..69e27e22b477fb --- /dev/null +++ b/aten/src/ATen/native/zoom/ActivationSoftshrinkKernel.cu @@ -0,0 +1,58 @@ +#define TORCH_ASSERT_NO_OPERATORS +#define _USE_MATH_DEFINES + +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { +namespace { + +void softshrink_kernel(TensorIteratorBase& iter, const Scalar& value) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.dtype(), + "softshrink_zoom", + [&]() { + auto lambd = value.to(); + gpu_kernel(iter, [lambd] GPU_LAMBDA(scalar_t a) -> scalar_t { + return a > lambd ? a - lambd : (a < -lambd ? a + lambd : scalar_t(0)); + }); + }); +} + +void shrink_backward_kernel(TensorIteratorBase& iter, const Scalar& value) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + iter.dtype(), + "shrink_backward_zoom", + [&]() { + auto lambd = value.to(); + gpu_kernel( + iter, + [lambd] GPU_LAMBDA( + scalar_t grad_val, scalar_t self_val) -> scalar_t { + return (self_val >= -lambd && self_val <= lambd) ? scalar_t(0) + : grad_val; + }); + }); +} +} // namespace + +REGISTER_PRIVATEUSE1_DISPATCH(softshrink_stub, &softshrink_kernel); +REGISTER_PRIVATEUSE1_DISPATCH(shrink_backward_stub, &shrink_backward_kernel); + +} // namespace at::native diff --git a/aten/src/ATen/native/zoom/ActivationThresholdKernel.cu b/aten/src/ATen/native/zoom/ActivationThresholdKernel.cu new file mode 100644 index 00000000000000..0d6a1c7e15f80a --- /dev/null +++ b/aten/src/ATen/native/zoom/ActivationThresholdKernel.cu @@ -0,0 +1,52 @@ +#define TORCH_ASSERT_NO_OPERATORS +#define _USE_MATH_DEFINES + +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { +namespace { + +template +void threshold_kernel_impl( + TensorIteratorBase& iter, + scalar_t threshold, + scalar_t value) { + gpu_kernel_with_scalars( + iter, [=] GPU_LAMBDA(scalar_t x, scalar_t other) -> scalar_t { + return x <= threshold ? 
value : other;
+      });
+}
+
+static void threshold_kernel_zoom(
+    TensorIteratorBase& iter,
+    const Scalar& threshold,
+    const Scalar& value) {
+  AT_DISPATCH_ALL_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      iter.dtype(),
+      "threshold_zoom",
+      [&] {
+        threshold_kernel_impl<scalar_t>(
+            iter, threshold.to<scalar_t>(), value.to<scalar_t>());
+      });
+}
+
+} // namespace
+
+REGISTER_PRIVATEUSE1_DISPATCH(threshold_stub, &threshold_kernel_zoom);
+
+} // namespace at::native
diff --git a/aten/src/ATen/native/zoom/ForeachBinaryOpList.cu b/aten/src/ATen/native/zoom/ForeachBinaryOpList.cu
new file mode 100644
index 00000000000000..02e2c4d4fe942c
--- /dev/null
+++ b/aten/src/ATen/native/zoom/ForeachBinaryOpList.cu
@@ -0,0 +1,295 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include
+#include
+#include
+#include
+#include
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include
+#else
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#endif
+
+namespace at::native {
+
+template <typename T, template <class> class Op>
+std::vector<Tensor> foreach_tensor_list_op(
+    TensorList tensors1,
+    TensorList tensors2,
+    const Scalar& alpha = 1) {
+  std::vector<std::vector<at::Tensor>> tensor_lists;
+  std::vector<at::Tensor> vec_res;
+  vec_res.reserve(tensors1.size());
+  for (const auto& t : tensors1) {
+    vec_res.emplace_back(at::native::empty_like(t));
+  }
+
+  tensor_lists.emplace_back(tensors1.vec());
+  tensor_lists.emplace_back(tensors2.vec());
+  tensor_lists.emplace_back(std::move(vec_res));
+
+  using opmath_t = at::opmath_type<T>;
+  multi_tensor_apply<3>(
+      tensor_lists,
+      BinaryOpListAlphaFunctor<
+          T,
+          /* depth */ 3,
+          /* r_args_depth */ 2,
+          /* res_arg_index */ 2>(),
+      Op<opmath_t>(),
+      alpha.to<opmath_t>());
+
+  return tensor_lists[2];
+}
+
+template <typename T, template <class> class Op>
+void foreach_tensor_list_op_(
+    TensorList tensors1,
+    TensorList tensors2,
+    const Scalar& alpha = 1) {
+  std::vector<std::vector<at::Tensor>> tensor_lists;
+  tensor_lists.emplace_back(tensors1.vec());
+  tensor_lists.emplace_back(tensors2.vec());
+
+  using opmath_t = at::opmath_type<T>;
+  multi_tensor_apply<2>(
+      tensor_lists,
+      BinaryOpListAlphaFunctor<
+          T,
+          /* depth */ 2,
+          /* r_args_depth */ 2,
+          /* res_arg_index */ 0>(),
+      Op<opmath_t>(),
+      alpha.to<opmath_t>());
+  increment_version(tensors1);
+}
+
+template