
Commit c131985

[AMD][ROCm] Improve support of AMD
This patch delivers several fixes for build issues in the CUDA part of the DeepSpeed library. The percentage of passing unit tests improved (tested on RDNA hardware, gfx110x and gfx12x).

Before: collected 5298 items / 15 skipped; 2773 failed, 862 passed, 1665 skipped, 13 errors
After: collected 5851 items / 11 skipped; 4187 failed, 1373 passed, 292 skipped, 10 errors

Signed-off-by: Artem Kuzmitckii <[email protected]>
1 parent 3292e07 commit c131985

File tree

9 files changed (+35, -14 lines)


csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h

Lines changed: 1 addition & 1 deletion
@@ -233,7 +233,7 @@ struct call_conditional<false, TA, TB> {

 CUTLASS_DEVICE int32_t warp_uniform(int32_t value)
 {
-    return (int32_t)__shfl_sync(0xffffffff, (unsigned)value, 0);
+    return (int32_t)__shfl_sync(static_cast<uint64_t>(0xffffffff), (unsigned)value, 0);
 }

 template <typename T>
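
For context on this change: HIP declares the active-lane mask of __shfl_sync as a 64-bit integer (AMD wavefronts are up to 64 lanes wide), so the 32-bit literal 0xffffffff is widened explicitly. A minimal sketch of the difference, under that assumption; lane_mask_t and warp_uniform_sketch are illustrative names, not part of the patch:

    // Illustrative sketch only; the mask-width difference is the point.
    #if defined(__HIP_PLATFORM_AMD__)
    using lane_mask_t = unsigned long long;  // ROCm: 64-bit active-lane mask
    #else
    using lane_mask_t = unsigned;            // CUDA: 32-bit active-lane mask
    #endif

    __device__ __forceinline__ int warp_uniform_sketch(int value)
    {
        // All-ones mask: every lane participates; broadcast lane 0's value.
        const lane_mask_t full_mask = static_cast<lane_mask_t>(-1);
        return (int)__shfl_sync(full_mask, (unsigned)value, 0);
    }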

csrc/fp_quantizer/fp_quantize.cpp

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@
 #include "fp_quantize.h"

 #include <c10/cuda/CUDAStream.h>
+#include <hip/hip_fp16.h>
 #include <torch/extension.h>
 #include <vector>
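
The extra include makes the __half/__half2 types visible when this file is built for ROCm. A common way to keep such an include portable is to guard it on the platform macro; this is only a sketch of that pattern, not what the patch does (the patch adds the include unconditionally):

    // Sketch of a platform-guarded half-precision include; illustrative only.
    #if defined(__HIP_PLATFORM_AMD__)
    #include <hip/hip_fp16.h>   // __half / __half2 on ROCm
    #else
    #include <cuda_fp16.h>      // __half / __half2 on CUDA
    #endif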

csrc/fp_quantizer/fp_quantize.cu

Lines changed: 2 additions & 1 deletion
@@ -4,7 +4,7 @@
 // DeepSpeed Team

 #include <stdexcept>
-#include "context.h"
+#include "fp_context.h"
 #include "fp_quantize.h"
 #include "memory_access_utils.h"
 #include "reduction_utils.h"
@@ -14,6 +14,7 @@

 #include <cuda_fp16.h>
 #include <curand_kernel.h>
+#include <hip/hip_fp16.h>

 #ifdef BF16_AVAILABLE
 #include <cuda_bf16.h>

File renamed without changes.

csrc/includes/reduction_utils.h

Lines changed: 26 additions & 10 deletions
@@ -526,12 +526,28 @@ here (fold is C++17 only and I don't think helps and recursion feels like
 huge overkill that harms readability) that would be wonderful.
 */

+template <typename T>
+DS_D_INLINE T shfl_xor_helper(cg::thread_block_tile<hw_warp_size>& warp, const T& value, int i)
+{
+    return warp.shfl_xor(value, i);
+}
+
+#if defined(__HIP_PLATFORM_AMD__)
+template <>
+DS_D_INLINE __half shfl_xor_helper<__half>(cg::thread_block_tile<hw_warp_size>& warp,
+                                           const __half& value,
+                                           int i)
+{
+    return __half(warp.shfl_xor(float(value), i));
+}
+#endif
+
 template <typename T, ROpType Op, int reduce_width = hw_warp_size>
 DS_D_INLINE void _warp(cg::thread_block_tile<hw_warp_size>& warp, T* data)
 {
 #pragma unroll
     for (int i = 1; i < reduce_width; i *= 2) {
-        data[0] = element<Op>(data[0], warp.shfl_xor(data[0], i));
+        data[0] = element<Op>(data[0], shfl_xor_helper(warp, data[0], i));
     }
 }

@@ -540,8 +556,8 @@ DS_D_INLINE void _warp(cg::thread_block_tile<hw_warp_size>& warp, T* data)
 {
 #pragma unroll
     for (int i = 1; i < reduce_width; i *= 2) {
-        data[0] = element<Op1>(data[0], warp.shfl_xor(data[0], i));
-        data[1] = element<Op2>(data[1], warp.shfl_xor(data[1], i));
+        data[0] = element<Op1>(data[0], shfl_xor_helper(warp, data[0], i));
+        data[1] = element<Op2>(data[1], shfl_xor_helper(warp, data[1], i));
     }
 }

@@ -550,9 +566,9 @@ DS_D_INLINE void _warp(cg::thread_block_tile<hw_warp_size>& warp, T* data)
 {
 #pragma unroll
     for (int i = 1; i < reduce_width; i *= 2) {
-        data[0] = element<Op1>(data[0], warp.shfl_xor(data[0], i));
-        data[1] = element<Op2>(data[1], warp.shfl_xor(data[1], i));
-        data[2] = element<Op3>(data[2], warp.shfl_xor(data[2], i));
+        data[0] = element<Op1>(data[0], shfl_xor_helper(warp, data[0], i));
+        data[1] = element<Op2>(data[1], shfl_xor_helper(warp, data[1], i));
+        data[2] = element<Op3>(data[2], shfl_xor_helper(warp, data[2], i));
     }
 }

@@ -566,10 +582,10 @@ DS_D_INLINE void _warp(cg::thread_block_tile<hw_warp_size>& warp, T* data)
 {
 #pragma unroll
     for (int i = 1; i < reduce_width; i *= 2) {
-        data[0] = element<Op1>(data[0], warp.shfl_xor(data[0], i));
-        data[1] = element<Op2>(data[1], warp.shfl_xor(data[1], i));
-        data[2] = element<Op3>(data[2], warp.shfl_xor(data[2], i));
-        data[3] = element<Op4>(data[3], warp.shfl_xor(data[3], i));
+        data[0] = element<Op1>(data[0], shfl_xor_helper(warp, data[0], i));
+        data[1] = element<Op2>(data[1], shfl_xor_helper(warp, data[1], i));
+        data[2] = element<Op3>(data[2], shfl_xor_helper(warp, data[2], i));
+        data[3] = element<Op4>(data[3], shfl_xor_helper(warp, data[3], i));
     }
 }
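
The shfl_xor_helper indirection exists because, on ROCm, the cooperative-groups shuffle apparently has no native __half path, so the specialization round-trips the value through float. A small sketch of how the helper slots into a butterfly (XOR) warp reduction; warp_sum_sketch is a hypothetical name, and DS_D_INLINE plus hw_warp_size are the aliases already defined in this header:

    // Sketch: butterfly sum reduction built on shfl_xor_helper.
    #include <cooperative_groups.h>
    namespace cg = cooperative_groups;

    template <typename T>
    DS_D_INLINE T warp_sum_sketch(cg::thread_block_tile<hw_warp_size>& warp, T value)
    {
    #pragma unroll
        for (int i = 1; i < hw_warp_size; i *= 2) {
            // Each step pairs lane L with lane L ^ i; on ROCm the __half
            // overload above converts through float before shuffling.
            value = value + shfl_xor_helper(warp, value, i);
        }
        return value;  // every lane now holds the full warp sum
    }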

deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_paralleldequant.cuh

Lines changed: 1 addition & 1 deletion
@@ -120,7 +120,7 @@ __device__ __forceinline__ void ExtractFromSharedToReg_Scales(uint32_t* Scales,
 #pragma unroll
     for (int i = 0; i < 4; i++) {
         // T __shfl_sync(unsigned mask, T var, int srcLane, int width=warpSize);
-        Scales[i] = __shfl_sync(0xffffffff, tmpReg, i, 4);
+        Scales[i] = __shfl_sync(static_cast<uint64_t>(0xffffffff), tmpReg, i, 4);
     }
 }
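
Same 64-bit mask cast as in gemm_kernel_utils.h; here the shuffle also uses the width argument, which splits the warp into independent groups of 4 lanes so each group broadcasts from its own source lane. A short illustrative sketch of that sub-group broadcast (group_broadcast_sketch is a made-up name; the cast mirrors the 64-bit ROCm mask used in the patch):

    // Sketch: broadcast within groups of 4 lanes via the width argument.
    __device__ __forceinline__ unsigned group_broadcast_sketch(unsigned v, int src_lane)
    {
        // With width = 4, src_lane is interpreted relative to each 4-lane group,
        // so lanes 0-3 read from their group's lane src_lane, lanes 4-7 from theirs, etc.
        return __shfl_sync(static_cast<unsigned long long>(0xffffffff), v, src_lane, 4);
    }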

deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels_cuda.cu

Lines changed: 2 additions & 1 deletion
@@ -45,7 +45,8 @@ static void Kernel_Ex(cudaStream_t stream,
     static size_t SHMEM_SZ =
         max(TilingConfig::SMEM_SIZE_B_TILE + SMEM_SIZE_A1_TILE + SMEM_SIZE_A2_TILE,
             TilingConfig::SMEM_SIZE_C_TILE);
-    cudaFuncSetAttribute(QUANT_GEMM_Kernel<TilingConfig, OutputDataType>,
+    auto kernel = QUANT_GEMM_Kernel<TilingConfig, OutputDataType>;
+    cudaFuncSetAttribute(reinterpret_cast<const void*>(kernel),
                          cudaFuncAttributeMaxDynamicSharedMemorySize,
                          SHMEM_SZ);
     size_t dimN = (N_Global - 1) / TilingConfig::TILE_N + 1;
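
The temporary plus reinterpret_cast is there because hipFuncSetAttribute (which this call becomes after hipification) takes the kernel as a const void* rather than as a typed kernel pointer, so the template instantiation is bound to a variable first and then cast. A minimal sketch of the pattern under those assumptions; dummy_kernel and the 64 KiB value are made up for illustration:

    // Sketch: raising a kernel's dynamic shared-memory limit in a hipify-friendly way.
    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void dummy_kernel(float* out)
    {
        extern __shared__ float smem[];          // dynamic shared memory
        if (threadIdx.x == 0) out[0] = smem[0];
    }

    void configure_dynamic_smem()
    {
        // Bind the kernel to a variable, then pass it as const void*; this form
        // compiles with nvcc and after hipify, where the setter expects an
        // untyped pointer instead of a typed kernel pointer.
        auto kernel = dummy_kernel;
        cudaError_t err = cudaFuncSetAttribute(reinterpret_cast<const void*>(kernel),
                                               cudaFuncAttributeMaxDynamicSharedMemorySize,
                                               64 * 1024);  // e.g. 64 KiB
        if (err != cudaSuccess) {
            std::printf("cudaFuncSetAttribute failed: %d\n", static_cast<int>(err));
        }
    }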

deepspeed/inference/v2/kernels/cutlass_ops/mixed_gemm/mixed_gemm.cu

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@
 // DeepSpeed Team

 #include <c10/cuda/CUDAStream.h>
+#include <hip/hip_bf16.h>
 #include "mixed_gemm.h"
 #include "mixed_gemm_api.h"
 #include "weight_variant.h"

deepspeed/inference/v2/kernels/cutlass_ops/moe_gemm/moe_gemm.cu

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@
 // DeepSpeed Team

 #include <c10/cuda/CUDAStream.h>
+#include <hip/hip_bf16.h>
 #include "moe_gemm.h"
 #include "moe_gemm_api.h"
 #include "weight_variant.h"
