
Commit a9a715c

jiawenliu64 authored and facebook-github-bot committed
Enable FP8 Triton dequantized block-wise kernel (#3788)
Summary:
Pull Request resolved: #3788

X-link: facebookresearch/FBGEMM#875

Enable FP8 Triton dequantized block-wise kernel, which is required to upcast with block-wise quantized all2all.

Reviewed By: sunfish2010

Differential Revision: D70872110

fbshipit-source-id: fa842baa49c72b67e6c12c375f469dae3219827a
1 parent ba25044 commit a9a715c
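
For context, a minimal sketch of the round trip this kernel enables, using the quantize_fp8_block / dequantize_fp8_block APIs exercised by the new test below. The tensor shape, block sizes, and the omitted collective exchange are illustrative assumptions, not part of this change.

# Sketch only: shapes, block sizes, and the omitted collective call are illustrative.
import torch

from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import (
    dequantize_fp8_block,
    quantize_fp8_block,
)

# Block-wise quantize a BF16 tensor to FP8 plus one scale per (1 x 256) block.
x = torch.randn(128, 1024, dtype=torch.bfloat16, device="cuda")
x_fp8, x_scale = quantize_fp8_block(x, block_m=1, block_k=256)

# In the all2all use case, x_fp8 and x_scale would be exchanged across ranks here.

# Upcast the FP8 blocks back to BF16 with the new Triton kernel.
x_bf16 = dequantize_fp8_block(x_fp8, x_scale, block_m=1, block_k=256)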

File tree

2 files changed: +101 −0 lines changed

fbgemm_gpu/experimental/gemm/test/fp8_gemm_test.py (+29)
fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py (+72)

fbgemm_gpu/experimental/gemm/test/fp8_gemm_test.py (+29)
@@ -14,6 +14,7 @@
 
 if torch.cuda.is_available():
     from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import (
+        dequantize_fp8_block,
         matmul_fp8_block,
         matmul_fp8_row,
         quantize_fp8_block,
@@ -274,6 +275,34 @@ def _test_quantize_fp8_block(
         _test_quantize_fp8_block((3, 6), (2, 8))
         _test_quantize_fp8_block((3, 6), (2, 8), use_scale_ub=True)
 
+    def test_dequantize_fp8_block(self) -> None:
+        def _test_dequantize_fp8_block(
+            shape: Tuple[int, int],
+            block_shape: Tuple[int, int],
+            use_scale_ub: bool = False,
+        ) -> None:
+            M, K = shape
+            BLOCK_M, BLOCK_K = block_shape
+            a = torch.randn(M, K, dtype=torch.bfloat16, device="cuda")
+
+            scale_ub = (
+                torch.tensor([1200], dtype=torch.float, device="cuda")
+                if use_scale_ub
+                else None
+            )
+
+            a_fp8, a_scale = quantize_fp8_block(
+                a, block_m=BLOCK_M, block_k=BLOCK_K, scale_ub=scale_ub
+            )
+            a_dequant = dequantize_fp8_block(
+                a_fp8, a_scale, block_m=BLOCK_M, block_k=BLOCK_K
+            )
+            self.assertTrue(torch.allclose(a, a_dequant, atol=2e-1, rtol=5e-2))
+
+        _test_dequantize_fp8_block((3, 1024), (1, 256))
+        _test_dequantize_fp8_block((11, 128), (1, 128))
+        _test_dequantize_fp8_block((11, 256), (1, 256), use_scale_ub=True)
+
     def test_matmul_fp8_block(self) -> None:
         def _test_matmul_fp8_block(
             shape: Tuple[int, int, int],

fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py (+72)
@@ -3097,3 +3097,75 @@ def _kernel_matmul_fp8_row_non_persistent(
         tl.store(C, acc, mask=mask)
     else:
         tl.atomic_add(C, acc, mask=mask)
+
+
+@triton.jit
+def _kernel_dequantize_fp8_block(
+    xq_ptr,
+    x_scale_ptr,
+    x_dequant_ptr,
+    M,
+    K,
+    BLOCK_M: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    """
+    Kernel to dequantize FP8 tensor to BF16 tensor.
+    Args:
+        xq_ptr (tl.constexpr): Pointer to FP8 tensor.
+        x_scale_ptr (tl.constexpr): Pointer to FP8 scale tensor.
+        x_dequant_ptr (tl.constexpr): Pointer to BF16 tensor.
+        M (tl.constexpr): M dimension of input tensor.
+        K (tl.constexpr): K dimension of input tensor.
+        BLOCK_M (tl.constexpr): Block size for the M dimension.
+        BLOCK_K (tl.constexpr): Block size for the K dimension.
+    """
+    pid_m = tl.program_id(axis=0)
+    pid_k = tl.program_id(axis=1)
+    k = tl.cdiv(K, BLOCK_K)
+    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_k = pid_k * BLOCK_K + tl.arange(0, BLOCK_K)
+    offs = offs_m[:, None] * K + offs_k[None, :]
+    mask = (offs_m[:, None] < M) & (offs_k[None, :] < K)
+    xq = tl.load(xq_ptr + offs, mask=mask).to(tl.bfloat16)
+    x_scale = tl.load(x_scale_ptr + pid_m * k + pid_k)
+    x_dequant = xq * x_scale
+    tl.store(x_dequant_ptr + offs, x_dequant, mask=mask)
+
+
+def dequantize_fp8_block(
+    xq: torch.Tensor,
+    x_scale: torch.Tensor,
+    block_m: int = 256,
+    block_k: int = 256,
+) -> torch.Tensor:
+    """
+    Dequantize FP8 tensor to BF16 tensor.
+
+    Args:
+        xq (torch.Tensor): FP8 tensor to be dequantized.
+        x_scale (torch.Tensor): FP8 scale tensor.
+        block_m (int): Block size for the M dimension.
+        block_k (int): Block size for the K dimension.
+
+    Returns:
+        torch.Tensor: Dequantized BF16 tensor.
+    """
+
+    assert (
+        xq.is_contiguous() and x_scale.is_contiguous()
+    ), "Input tensors must be contiguous"
+    assert xq.dim() == 2 and x_scale.dim() == 2, "Input tensors must have 2 dimensions"
+    M, K = xq.size()
+    x_dequant = torch.empty_like(xq, dtype=torch.bfloat16)
+
+    def grid(meta):
+        return (
+            triton.cdiv(M, meta["BLOCK_M"]),
+            triton.cdiv(K, meta["BLOCK_K"]),
+        )
+
+    _kernel_dequantize_fp8_block[grid](
+        xq, x_scale, x_dequant, M, K, BLOCK_M=block_m, BLOCK_K=block_k  # pyre-ignore[6]
+    )
+    return x_dequant
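
A note on the scale layout the launcher above assumes: the kernel reads x_scale_ptr + pid_m * cdiv(K, BLOCK_K) + pid_k, i.e. a row-major 2D tensor with one scale per (block_m x block_k) tile, which is the layout exercised by the new test. A small shape-check sketch; the helper name is illustrative, not part of this change.

import math


def expected_scale_shape(M: int, K: int, block_m: int, block_k: int) -> tuple[int, int]:
    # One scale per (block_m x block_k) tile, laid out row-major over tiles,
    # matching the kernel's x_scale_ptr + pid_m * cdiv(K, BLOCK_K) + pid_k indexing.
    return (math.ceil(M / block_m), math.ceil(K / block_k))


# For the (11, 256) case in the new test with (1, 256) blocks:
assert expected_scale_shape(11, 256, 1, 256) == (11, 1)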
