Commit 43e9a62

Initial commit

11 files changed: +910, -0 lines

README.md (+10)

````markdown
# Usage

```bash
python setup.py develop
python test.py
python benchmark.py
```

Optionally, do `denoise-gpu.sh python test.py` (or `benchmark.py`) for less
noisy (but slower) results.
````

TARGETS (+75)

```bzl
load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library")
load("@fbcode_macros//build_defs:python_binary.bzl", "python_binary")
load("@fbcode_macros//build_defs:python_library.bzl", "python_library")
load("//tools/build/buck:nvcc_flags.bzl", "get_nvcc_arch_args")

cpp_library(
    name = "cutlass_kernel",
    srcs = [
        "cutlass_kernel.cu",
    ],
    headers = [
        "cutlass_kernel.h",
    ],
    nvcc_flags = get_nvcc_arch_args(),
    deps = [
        "fbsource//third-party/cutlass-3:cutlass-3",
    ],
)

cpp_library(
    name = "cutlass",
    srcs = [
        "cutlass.cpp",
    ],
    supports_python_dlopen = True,
    deps = [
        ":cutlass_kernel",
        "//caffe2:torch-cpp",  # @manual
        "//caffe2:torch_extension",  # @manual
    ],
)

python_library(
    name = "triton_kernel",
    srcs = [
        "triton_kernel.py",
    ],
    deps = [
        "//caffe2:torch",
    ],
)

python_binary(
    name = "test",
    srcs = [
        "test.py",
    ],
    cpp_deps = [
        ":cutlass",
    ],
    main_function = "scripts.bertrand.tf32_gemm.test.main",
    par_style = "xar",
    deps = [
        ":triton_kernel",
        "//caffe2:torch",
    ],
)

python_binary(
    name = "benchmark",
    srcs = [
        "benchmark.py",
    ],
    cpp_deps = [
        ":cutlass",
    ],
    main_function = "scripts.bertrand.tf32_gemm.benchmark.main",
    par_style = "xar",
    deps = [
        "fbsource//third-party/pypi/matplotlib:matplotlib",  # @manual
        "fbsource//third-party/pypi/pandas:pandas",  # @manual
        ":triton_kernel",
        "//caffe2:torch",
    ],
)
```

benchmark.py (+68)

```python
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

import torch
import triton  # @manual

from .triton_kernel import matmul as triton_matmul

try:
    torch.ops.load_library("cutlass.so")
except Exception:
    torch.ops.load_library("//scripts/bertrand/tf32_gemm:cutlass")

torch.set_float32_matmul_precision("high")

configs = []
for fp8_inputs in [False]:
    configs.append(
        triton.testing.Benchmark(
            x_names=["M", "N", "K"],  # Argument names to use as an x-axis for the plot
            x_vals=[128 * i for i in range(2, 33)],
            line_arg="provider",  # Argument name whose value corresponds to a different line in the plot
            # Possible values for `line_arg`.
            # Don't compare to cublas for fp8 cases, as torch.matmul doesn't support fp8 at the moment.
            line_vals=["cublas", "triton", "cutlass", "precompiled"],
            line_names=["cublas", "triton", "cutlass", "precompiled"],
            ylabel="TFLOPS",  # Label name for the y-axis
            plot_name="matmul-performance-fp32",
            args={"fp8_inputs": fp8_inputs},
        )
    )


@triton.testing.perf_report(configs)
def benchmark(M, N, K, provider, fp8_inputs):
    a = torch.zeros((M, K), device="cuda", dtype=torch.float32)
    b = torch.zeros((K, N), device="cuda", dtype=torch.float32)
    quantiles = [0.5, 0.2, 0.8]
    if provider == "cublas":
        ms, min_ms, max_ms = triton.testing.do_bench(
            lambda: torch.matmul(a, b), quantiles=quantiles
        )
    if provider == "triton":
        ms, min_ms, max_ms = triton.testing.do_bench(
            lambda: triton_matmul(a, b), quantiles=quantiles
        )
        # print(f"{N}: {matmul_kernel.best_config}")
    if provider == "precompiled":
        ms, min_ms, max_ms = triton.testing.do_bench(
            lambda: triton_matmul(a, b, precompiled=True), quantiles=quantiles
        )
        # print(f"{N}: {matmul_kernel.best_config}")
    if provider == "cutlass":
        ms, min_ms, max_ms = triton.testing.do_bench(
            lambda: torch.ops.cutlass.gemm(a, b), quantiles=quantiles
        )

    def perf(ms):
        return 2 * M * N * K * 1e-12 / (ms * 1e-3)

    return perf(ms), perf(max_ms), perf(min_ms)


def main():
    benchmark.run(show_plots=True, print_data=True, save_path=".")


if __name__ == "__main__":
    main()
```
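Note on the TFLOPS math: `perf` uses the standard 2*M*N*K flop count for a GEMM, scales to teraflops, and converts milliseconds to seconds. A quick worked example (illustrative numbers, not part of the commit):

```python
# Worked example of the perf() conversion above: a GEMM does 2*M*N*K
# floating-point operations; 1e-12 scales FLOPS to TFLOPS and 1e-3
# converts milliseconds to seconds.
M = N = K = 4096
ms = 10.0  # hypothetical measured median latency, in milliseconds
tflops = 2 * M * N * K * 1e-12 / (ms * 1e-3)
print(f"{tflops:.1f} TFLOPS")  # ~13.7 TFLOPS for these illustrative numbers
```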

cutlass.cpp (+22)

```cpp
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#include "cutlass_kernel.h"

#include "ATen/ATen.h" // @manual
#include "torch/extension.h" // @manual

at::Tensor gemm(at::Tensor a, at::Tensor b) {
  auto c = a.new_empty({a.size(0), b.size(1)});
  gemm_kernel(
      a.data_ptr<float>(),
      b.data_ptr<float>(),
      c.data_ptr<float>(),
      a.size(0),
      b.size(1),
      a.size(1));
  return c;
}

TORCH_LIBRARY(cutlass, m) {
  m.def("gemm", &gemm);
}
```
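Note: `TORCH_LIBRARY(cutlass, m)` registers the op under the `cutlass` namespace, which is what lets `benchmark.py` call `torch.ops.cutlass.gemm` after `torch.ops.load_library`. A minimal sanity check from Python might look like the sketch below (it assumes the extension was built as `cutlass.so` and that a CUDA device is available; the tolerances are heuristic):

```python
import torch

torch.ops.load_library("cutlass.so")  # build-output path assumed, as in benchmark.py

a = torch.randn(256, 128, device="cuda", dtype=torch.float32)
b = torch.randn(128, 64, device="cuda", dtype=torch.float32)

c = torch.ops.cutlass.gemm(a, b)
assert c.shape == (256, 64)

# The kernel accumulates with TF32 tensor cores, so compare against
# torch.matmul with loose, heuristic tolerances rather than bitwise equality.
torch.testing.assert_close(c, a @ b, rtol=1e-2, atol=1e-1)
```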

cutlass_kernel.cu (+139)

```cpp
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/gemm/device/gemm_universal.h"

#include <cstdlib>
#include <iostream>

/**
 * Panic wrapper for unwinding CUTLASS errors
 */
#define CUTLASS_CHECK(status)                                             \
  {                                                                       \
    cutlass::Status error = status;                                       \
    if (error != cutlass::Status::kSuccess) {                             \
      std::cerr << "Got cutlass error: " << cutlassGetStatusString(error) \
                << " at: " << __LINE__ << std::endl;                      \
      exit(EXIT_FAILURE);                                                 \
    }                                                                     \
  }

///////////////////////////////////////////////////////////////////////////////

// The section below describes the data types of the input and output matrices
// and of the computation between elements of the input matrices.
using ElementAccumulator = float; // <- data type of accumulator
using ElementComputeEpilogue =
    ElementAccumulator; // <- data type of epilogue operations
using ElementInputA = float; // <- data type of elements in input matrix A
using ElementInputB = float; // <- data type of elements in input matrix B
using ElementOutput = float; // <- data type of elements in output matrix D

// Matrix layout of the input and output matrices:
// Row Major for Matrix A, Matrix B, and Matrix C.
using LayoutInputA = cutlass::layout::RowMajor;
using LayoutInputB = cutlass::layout::RowMajor;
using LayoutOutput = cutlass::layout::RowMajor;

// Whether to use tensor cores or regular SIMT cores on the GPU SM
using MMAOp = cutlass::arch::OpClassTensorOp;

// CUDA SM architecture number
using SmArch = cutlass::arch::Sm80;

// Tile size a thread block will compute
using ShapeMMAThreadBlock =
    cutlass::gemm::GemmShape<128, 256, 16>; // <- threadblock tile M = 128,
                                            // N = 256, K = 16
// Tile size a warp will compute
using ShapeMMAWarp =
    cutlass::gemm::GemmShape<64, 64, 16>; // <- warp tile M = 64, N = 64, K = 16
// Size of the MMA op
using ShapeMMAOp =
    cutlass::gemm::GemmShape<16, 8, 8>; // <- MMA op tile M = 16, N = 8, K = 8

// How threadblocks are scheduled on the GPU
using SwizzleThreadBlock =
    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- default
                                                                  // identity
                                                                  // swizzle

// The epilogue part of the kernel
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
    ElementOutput, // <- data type of output matrix
    128 /
        cutlass::sizeof_bits<
            ElementOutput>::value, // <- number of elements per vectorized
                                   // memory access (4 for 32-bit floats; 16
                                   // for 8-bit elements). This also becomes
                                   // the vector width of math instructions
                                   // in the epilogue.
    ElementAccumulator, // <- data type of accumulator
    ElementComputeEpilogue>; // <- data type for alpha/beta in the linear
                             // combination function

// Number of pipeline stages to use
constexpr int NumStages = 3;

using Gemm = cutlass::gemm::device::Gemm<
    ElementInputA,
    LayoutInputA,
    ElementInputB,
    LayoutInputB,
    ElementOutput,
    LayoutOutput,
    ElementAccumulator,
    MMAOp,
    SmArch,
    ShapeMMAThreadBlock,
    ShapeMMAWarp,
    ShapeMMAOp,
    EpilogueOp,
    SwizzleThreadBlock,
    NumStages>;

void gemm_kernel(float* a, float* b, float* c, int m, int n, int k) {
  cutlass::gemm::GemmCoord problem_size{m, n, k};
  cutlass::TensorRef tensor_a{a, LayoutInputA{k}};
  cutlass::TensorRef tensor_b{b, LayoutInputB{n}};
  cutlass::TensorRef tensor_c{c, LayoutOutput{n}};
  cutlass::TensorRef tensor_d{c, LayoutOutput{n}};

  // Initialize alpha and beta for the linear combination (D = alpha*A*B + beta*C)
  ElementComputeEpilogue alpha = ElementComputeEpilogue(1.0f);
  ElementComputeEpilogue beta = ElementComputeEpilogue(0.0f);

  // Split the K dimension into 1 partition (i.e., no split-K)
  int split_k_slices = 1;

  // Create a tuple of gemm kernel arguments. This is later passed to the
  // instantiated CUTLASS kernel at launch.
  typename Gemm::Arguments arguments{
      problem_size, // <- problem size of matrix multiplication
      tensor_a, // <- reference to matrix A on device
      tensor_b, // <- reference to matrix B on device
      tensor_c, // <- reference to matrix C on device
      tensor_d, // <- reference to matrix D on device
      {alpha, beta}, // <- tuple of alpha and beta
      split_k_slices}; // <- k-dimension split factor

  // Using the arguments, query for any extra workspace required for the
  // matrix multiplication computation
  size_t workspace_size = Gemm::get_workspace_size(arguments);

  // printf("workspace size: %d\n", workspace_size);
  if (workspace_size != 0) {
    exit(EXIT_FAILURE);
  }
  // Allocate workspace memory
  // cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

  Gemm gemm_op;

  // Check that the problem is implementable with the instantiated templates
  cutlass::Status status = gemm_op.can_implement(arguments);
  CUTLASS_CHECK(status);

  status = gemm_op.initialize(arguments, nullptr); // workspace.get());
  CUTLASS_CHECK(status);

  status = gemm_op();
  CUTLASS_CHECK(status);
}
```
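Note: with `float` elements, `OpClassTensorOp`, and `Sm80`, CUTLASS selects the TF32 tensor-core path, which is what this `tf32_gemm` directory is about. The accuracy trade-off can be eyeballed from PyTorch by toggling matmul precision (a sketch; exact error depends on shape and data):

```python
import torch

# Hypothetical shapes; any reasonably large GEMM shows the effect.
a = torch.randn(1024, 1024, device="cuda")
b = torch.randn(1024, 1024, device="cuda")

torch.set_float32_matmul_precision("highest")  # full-FP32 reference
ref = a @ b

torch.set_float32_matmul_precision("high")  # allow TF32, as benchmark.py does
tf32 = a @ b

# TF32 keeps ~10 mantissa bits, so a relative error on the order of
# 1e-4..1e-3 is expected here, versus ~1e-7 for a full-FP32 comparison.
print((tf32 - ref).norm() / ref.norm())
```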

cutlass_kernel.h (+5)

```cpp
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#pragma once

void gemm_kernel(float* a, float* b, float* c, int m, int n, int k);
```

denoise-gpu.sh (+36)

```bash
#!/bin/bash
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

# There's a whole presentation about stable benchmarking here:
# https://developer.download.nvidia.com/video/gputechconf/gtc/2019/presentation/s9956-best-practices-when-benchmarking-cuda-applications_V2.pdf

# Lock GPU clocks
sudo nvidia-smi -i 6 -pm 1 >& /dev/null             # persistence mode
sudo nvidia-smi --power-limit=330 -i 6 >& /dev/null # lock to 330 W
sudo nvidia-smi -lgc 1140 -i 6 >& /dev/null         # lock the graphics clock to 1140 MHz (the max on A100 is 1410 MHz)

# TODO: On my devgpu, device 6 is apparently attached to NUMA node 3. How did
# I discover this?
#
# `nvidia-smi -i 6 -pm 1` prints the PCI bus ID (00000000:C6:00.0)
#
# You can also get this from `nvidia-smi -x -q` by looking for minor_number
# and pci_bus_id.
#
# Then, `cat /sys/bus/pci/devices/0000:c6:00.0/numa_node` prints 3.
# Is it always the case that device N is on NUMA node N/2? :shrug:
#
# Maybe automate this process, or figure out whether it always holds?
#
# ... Or you can just run `nvidia-smi topo -mp` and it will print out exactly
# what you want, like this:
#
#       GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 mlx5_1 mlx5_2 mlx5_3 CPU Affinity   NUMA Affinity
# GPU0  X    PXB  SYS  SYS  SYS  SYS  SYS  SYS  NODE   SYS    SYS    SYS    0-23,96-119    0
# GPU6  SYS  SYS  SYS  SYS  SYS  SYS  X    PXB  SYS    SYS    SYS    NODE   72-95,168-191  3

export CUDA_VISIBLE_DEVICES=6
numactl -m 3 -c 3 "$@"

# Unlock GPU clocks
sudo nvidia-smi -rgc -i 6 >& /dev/null
```
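Note: the TODO above about mapping a GPU to its NUMA node could plausibly be automated as sketched below. The helper is hypothetical (not part of the commit) and assumes `nvidia-smi --query-gpu=pci.bus_id` output like `00000000:C6:00.0` plus the sysfs layout described in the comments:

```python
import subprocess

def numa_node_of_gpu(index: int) -> int:
    # Query the PCI bus ID for the given GPU,
    # e.g. "00000000:C6:00.0" for device 6 on the devgpu described above.
    bus_id = subprocess.check_output(
        ["nvidia-smi", "-i", str(index), "--query-gpu=pci.bus_id",
         "--format=csv,noheader"],
        text=True,
    ).strip()
    # sysfs uses a lowercase 4-hex-digit PCI domain, e.g. "0000:c6:00.0",
    # so drop the first four digits of the 8-digit domain and lowercase.
    sysfs_id = bus_id.lower()[4:]
    with open(f"/sys/bus/pci/devices/{sysfs_id}/numa_node") as f:
        return int(f.read())

print(numa_node_of_gpu(6))  # expected to print 3 on the machine above
```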
