
Commit f81c164

Author: Yuming Lou

[Major] More efficient ViT with W8A8 GEMM and kernel fusion (#254)

* [Minor] Fused some kernels
* Add act-quant fusion
* [Fix] W8A8 GEMM NaN
* [Minor] Delete unnecessary files
* [Minor] Reset torch version
1 parent 99174c5 commit f81c164

File tree

14 files changed (+375, -655 lines)


awq/kernels/csrc/fused_layernorm/utils.cuh

Lines changed: 0 additions & 469 deletions
This file was deleted.

awq/kernels/csrc/pybind.cpp

Lines changed: 7 additions & 4 deletions
@@ -10,7 +10,8 @@
 #include "rope_new/fused_rope_with_pos.h"
 #include "w8a8/w8a8_gemm_cuda.h"
 #include "w8a8/quantization.h"
-// #include "fused_layernorm/layernorm.h"
+#include "w8a8/layernorm.h"
+#include "w8a8/act.h"
 
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
@@ -29,7 +30,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     m.def("w8a8_gemm_forward_cuda", &w8a8_gemm_forward_cuda, "our w8a8 gemm kernel");
     m.def("w8a8_gemm_fuse_bias_forward_cuda", &w8a8_gemm_fuse_bias_forward_cuda, "our w8a8 gemm fused bias kernel");
     m.def("invoke_quant", &invoke_quant, "fp16->int8 quantization");
-    // m.def("rms_norm_general", &rms_norm_general, py::arg("out"), py::arg("input"),
-    //       py::arg("weight"), py::arg("scaling"), py::arg("epsilon"), py::arg("use_per_token_quant") = false,
-    //       "Apply Root Mean Square (RMS) Normalization to the input tensor (TRTLLM kernel).");
+    m.def("rms_norm_general", &rms_norm_general, py::arg("out"), py::arg("input"),
+          py::arg("weight"), py::arg("bias"), py::arg("scaling"), py::arg("epsilon"), py::arg("use_per_token_quant") = true,
+          "Apply Root Mean Square (RMS) Normalization to the input tensor (TRTLLM kernel).");
+    m.def("silu_and_mul", &silu_and_mul, "Activation function.");
+    m.def("gelu_and_quant", &gelu_and_quant, "Apply GELU activation and quantize the output.");
 }
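For orientation, here is a minimal Python-side calling sketch for the newly exposed bindings. It is an illustration only: it assumes the extension is importable as awq_inference_engine (substitute whatever TORCH_EXTENSION_NAME resolves to in your build), a CUDA device is available, and the shapes/dtypes follow the comments in the kernels (fp16 inputs, int8 outputs, fp16 per-token scales).

# Minimal calling sketch (assumed module name: awq_inference_engine).
import torch
import awq_inference_engine as ext  # assumption: adjust to your build's extension name

tokens, hidden = 197, 768
x = torch.randn(tokens, hidden, dtype=torch.float16, device="cuda")
w = torch.ones(hidden, dtype=torch.float16, device="cuda")
b = torch.zeros(hidden, dtype=torch.float16, device="cuda")

# Fused LayerNorm -> per-token int8 quantization (one fp16 scale per token).
x_q = torch.empty(tokens, hidden, dtype=torch.int8, device="cuda")
scales = torch.empty(tokens, dtype=torch.float16, device="cuda")
ext.rms_norm_general(x_q, x, w, b, scales, 1e-6, True)

# Fused GELU -> per-token int8 quantization; tmp buffers the fp16 activations.
h = torch.randn(tokens, 4 * hidden, dtype=torch.float16, device="cuda")
h_q = torch.empty_like(h, dtype=torch.int8)
h_scales = torch.empty(tokens, dtype=torch.float16, device="cuda")
tmp = torch.empty_like(h)
ext.gelu_and_quant(h_q, h, h_scales, tmp)

# SiLU-and-multiply over a [tokens, 2*d] gate/up projection; returns [tokens, d].
gate_up = torch.randn(tokens, 2 * hidden, dtype=torch.float16, device="cuda")
y = ext.silu_and_mul(gate_up)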

awq/kernels/csrc/w8a8/act.cu

Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/extension.h>
+#include <cuda_fp16.h>
+
+#include "dispatch_utils.h"
+#include "utils.cuh"
+#include "reduction_utils.cuh"
+
+namespace vllm {
+
+template <typename T> __device__ __forceinline__ T silu(const T &x) {
+  // x * sigmoid(x)
+  return (T)(((float)x) / (1.0f + expf((float)-x)));
+}
+
+template <typename T> __device__ __forceinline__ T gelu_new(const T &x) {
+  const half x3 = (half)(x * x * x);
+  const T t = (T)tanhf((T)((T)0.79788456f * (half)(x + (T)((T)0.044715f * x3))));
+  return ((T)0.5) * x * (((T)1.0) + t);
+}
+
+template <typename T>
+__device__ __forceinline__ T gelu_fast(const T &x) {
+  const half f = (half)x;
+  const T t =
+      (T)tanhf(((T)(f * (T)0.79788456f)) * (((T)1.0) + (T)((T)0.044715f * f) * x));
+  return ((T)0.5) * x * (((T)1.0) + t);
+}
+
+
+
+// Apply fast GELU to the fp16 input, then quantize to int8 with per-token scales.
+template <typename scale_type, bool use_per_token_quant>
+__global__ void gelu_and_quant_kernel(
+    int8_t *__restrict__ out,        // [..., d]
+    half *__restrict__ input,        // [..., d]
+    const int d,
+    scale_type *scale_out,           // [num_tokens]
+    half *__restrict__ tmp = nullptr // [num_tokens, d]
+) {
+  const int token_idx = blockIdx.x;
+  const float max_value = 127.0f;
+  if constexpr (use_per_token_quant) {
+    float amax_val = 0.0f;
+    const half zero = 0.0001f;
+
+    for (int idx = threadIdx.x; idx < d; idx += blockDim.x) {
+      const half x =
+          (half)__ldg(&input[token_idx * d + idx]);
+      half t = gelu_fast(x);
+      tmp[token_idx * d + idx] = t;
+      t = t > zero ? t : -t;
+      if ((float)t > amax_val)
+        amax_val = (float)t;
+    }
+
+    __shared__ float s_amax;
+    const float block_amax_val = blockReduceMax(amax_val);
+    if (threadIdx.x == 0) {
+      s_amax = block_amax_val;
+      scale_out[token_idx] = half(block_amax_val / max_value);
+    }
+    __syncthreads();
+
+    float tmp_scale = max_value / s_amax;
+    for (int idx = threadIdx.x; idx < d; idx += blockDim.x) {
+      out[token_idx * d + idx] =
+          float_to_int8_rn((half)tmp_scale * tmp[token_idx * d + idx]);
+    }
+  } else {
+    for (int idx = threadIdx.x; idx < d; idx += blockDim.x) {
+      const float x =
+          (float)__ldg(&input[token_idx * d + idx]);
+      out[token_idx * d + idx] = float_to_int8_rn((half)gelu_fast(x) / scale_out[0]);
+    }
+  }
+}
+} // namespace vllm
+
+
+
+void gelu_and_quant(
+    torch::Tensor &out,       // [..., d]
+    torch::Tensor &input,     // [..., d]
+    torch::Tensor &scale_out, // [...]
+    torch::Tensor &tmp        // [num_tokens, d]
+) {
+  int64_t num_tokens = input.numel() / input.size(-1);
+  int d = input.size(-1);
+  dim3 grid(num_tokens);
+  dim3 block(std::min(d, 128));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  vllm::gelu_and_quant_kernel<half, true><<<grid, block, 0, stream>>>(
+      out.data_ptr<int8_t>(), reinterpret_cast<half *>(input.data_ptr<at::Half>()), d, reinterpret_cast<half *>(scale_out.data_ptr<at::Half>()), reinterpret_cast<half *>(tmp.data_ptr<at::Half>()));
+}
+
+
+
+namespace vllm {
+
+template <typename scalar_t>
+__global__ void silu_and_mul_kernel(
+    scalar_t *__restrict__ out,         // [..., d]
+    const scalar_t *__restrict__ input, // [..., 2 * d]
+    const int d) {
+
+  const int token_idx = blockIdx.x;
+  const int64_t token_idx_d = token_idx * int64_t(d);
+  const int64_t token_idx_2d = token_idx_d * 2;
+  for (int idx = threadIdx.x; idx < d; idx += blockDim.x) {
+    const scalar_t x = __ldg(&input[token_idx_2d + idx]);
+    const scalar_t y = __ldg(&input[token_idx_2d + d + idx]);
+    out[token_idx_d + idx] = silu(x) * y;
+  }
+}
+} // namespace vllm
+
+
+
+torch::Tensor silu_and_mul(
+    torch::Tensor &input) // [..., 2 * d]
+{
+  int64_t num_tokens = input.numel() / input.size(-1);
+  int d = input.size(-1) / 2;
+
+  std::vector<int64_t> output_shape = input.sizes().vec();
+  output_shape[output_shape.size() - 1] = d;
+  auto options =
+      torch::TensorOptions().dtype(input.dtype()).device(input.device());
+  at::Tensor output = torch::empty(output_shape, options);
+
+
+  dim3 grid(num_tokens);
+  dim3 block(std::min(d, 256));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "silu_and_mul_kernel", [&] {
+    vllm::silu_and_mul_kernel<scalar_t><<<grid, block, 0, stream>>>(
+        output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), d);
+  });
+  return output;
+}
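As a sanity check, the per-token quantization that gelu_and_quant performs can be mirrored in a few lines of PyTorch. This is a reference sketch only: it uses the tanh-approximate GELU in fp32, floors the per-row amax (mirroring the kernel's small 1e-4 guard), and matches the kernel only up to half-precision rounding and the exact saturation behavior of float_to_int8_rn (assumed round-to-nearest with saturation to [-128, 127]).

# Reference sketch of gelu_and_quant's per-token quantization (fp32 approximation).
import torch

def gelu_and_quant_ref(x: torch.Tensor):
    # x: [num_tokens, d]
    g = torch.nn.functional.gelu(x.float(), approximate="tanh")   # ~ gelu_fast
    amax = g.abs().amax(dim=-1, keepdim=True).clamp_min(1e-4)     # per-token amax
    scale = amax / 127.0                                          # dequant scale, one per token
    q = torch.clamp(torch.round(g / scale), -128, 127).to(torch.int8)
    return q, scale.squeeze(-1).half()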

awq/kernels/csrc/w8a8/act.h

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+// Inspired by TRT-LLM.
+// Modified by Shang Yang and Haotian Tang.
+// @article{lin2024awq,
+//   title={AWQ: Activation-aware Weight Quantization for On-Device LLM Compression and Acceleration},
+//   author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Chen, Wei-Ming and Wang, Wei-Chen and Xiao, Guangxuan and Dang, Xingyu and Gan, Chuang and Han, Song},
+//   journal={Proceedings of Machine Learning and Systems},
+//   volume={6},
+//   pages={87--100},
+//   year={2024}
+// }
+
+#include <torch/extension.h>
+#include <cuda_fp16.h>
+// Inspired by vLLM-SmoothQuant: https://github.com/vllm-project/vllm/pull/1112.
+#include <torch/extension.h>
+
+
+void gelu_and_quant(torch::Tensor &out,       // [..., d]
+                    torch::Tensor &input,     // [..., d]
+                    torch::Tensor &scale_out, // [num_tokens]
+                    torch::Tensor &tmp        // [num_tokens, d]
+);
+
+torch::Tensor silu_and_mul(torch::Tensor &input // [..., 2 * d]
+);
+
+
+

awq/kernels/csrc/fused_layernorm/layernorm_kernels.cu renamed to awq/kernels/csrc/w8a8/layernorm.cu

Lines changed: 17 additions & 46 deletions
@@ -1,5 +1,5 @@
-// Inspired by TRT-LLM.
-// Modified by Shang Yang and Haotian Tang.
+// Inspired by QServe https://github.com/mit-han-lab/qserve/tree/main.
+// Modified by Yuming Lou.
 // @article{lin2024awq,
 //   title={AWQ: Activation-aware Weight Quantization for On-Device LLM Compression and Acceleration},
 //   author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Chen, Wei-Ming and Wang, Wei-Chen and Xiao, Guangxuan and Dang, Xingyu and Gan, Chuang and Han, Song},
@@ -10,7 +10,6 @@
 // }
 #include <ATen/cuda/CUDAContext.h>
 #include <torch/extension.h>
-
 #include "dispatch_utils.h"
 #include "utils.cuh"
 #include "reduction_utils.cuh"
@@ -41,18 +40,19 @@ __inline__ __device__ Tf compute_layernorm(Tf val, float s_mean, float s_varianc
  * First pass (loop) computes the mean.
  * Second computes the variance via Var[x] = E[(x - E[x])²].
  * Third pass computes and writes normed_output
- *
- * with USE_DIFF_OF_SQUARES set to true (may be faster but less accurate):
+ * For better speedup, we set USE_DIFF_OF_SQUARES to true (may be faster but less accurate):
+ * it turns out the accuracy doesn't drop.
  * First pass (loop) computes the mean and variance via Var[x] = E[x²] - E[x]²
 * Second pass computes and writes normed_output
+ *
 *
 * use_shmem controls if we cache input values into shared memory
 *
 * Optional: with dynamic scaling, the last pass doesn't write immediately but finds the
 * amax per row. A final pass scales to int8 accordingly, and writes output to
 * normed_output_quant.
 */
-template <typename T, typename scale_type, bool USE_DIFF_OF_SQUARES = false>
+template <typename T, typename scale_type, bool USE_DIFF_OF_SQUARES = true>
 __global__ void generalLayerNorm(const T* input, const T* gamma, const T* beta, T* normed_output, const float eps,
     int tokens, int hidden_dim, const scale_type* scale_orig_quant_per_tensor, scale_type* scale_orig_quant_per_token,
     int8_t* normed_output_quant, bool use_shmem)
@@ -74,7 +74,6 @@ __global__ void generalLayerNorm(const T* input, const T* gamma, const T* beta,
     float variance = 0.0f;
     float local_sum = 0.0f;
     float local_var_sum = 0.0f;
-
     const int n_elems = hidden_dim / num_elems_T;
     for (int i = tidx; i < n_elems; i += blockDim.x)
     {
@@ -83,15 +82,14 @@ __global__ void generalLayerNorm(const T* input, const T* gamma, const T* beta,
         {
             shmem[i] = val;
         }
-
         const float_packed_t val_f = cuda_cast<float_packed_t>(val);
         local_sum += cuda_sum<float>(val_f);
         if (USE_DIFF_OF_SQUARES)
         {
            local_var_sum += cuda_sum<float>(val_f * val_f);
        }
    }
-
+    // Compute mean
    if (USE_DIFF_OF_SQUARES)
    {
        float packed[2] = {local_sum, local_var_sum};
@@ -116,12 +114,13 @@ __global__ void generalLayerNorm(const T* input, const T* gamma, const T* beta,
    }
    __syncthreads();
 
+
    if (!USE_DIFF_OF_SQUARES)
    {
        for (int i = tidx; i < n_elems; i += blockDim.x)
        {
            const T val = use_shmem ? shmem[i] : input[bidx * n_elems + i];
-            float_packed_t diff = cuda_cast<float_packed_t>(val) - s_mean;
+            float_packed_t diff = cuda_cast<float_packed_t>(val); // - s_mean;
            local_var_sum += cuda_sum<float>(diff * diff);
        }
        variance = blockReduceSum(local_var_sum);
@@ -133,6 +132,7 @@ __global__ void generalLayerNorm(const T* input, const T* gamma, const T* beta,
        __syncthreads();
    }
 
+    // Compute LN and quantize
    const bool with_per_token_scaling = scale_orig_quant_per_token != nullptr;
    const bool with_per_tensor_scaling = scale_orig_quant_per_tensor != nullptr;
    const float_packed_t scale_orig_quant
@@ -186,51 +186,21 @@ __global__ void generalLayerNorm(const T* input, const T* gamma, const T* beta,
            }
        }
    }
-}
 
-// TODO(woosuk): Further optimize this kernel.
-template <typename scalar_t, typename out_type, bool use_quant>
-__global__ void
-rms_norm_kernel(out_type *__restrict__ out, // [..., hidden_size]
-                const scalar_t *__restrict__ input, // [..., hidden_size]
-                const scalar_t *__restrict__ weight, // [hidden_size]
-                const float epsilon, const int num_tokens,
-                const int hidden_size) {
-  __shared__ float s_variance;
-  float variance = 0.0f;
 
-  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    const float x = (float)input[blockIdx.x * hidden_size + idx];
-    variance += x * x;
-  }
-  variance = blockReduceSum<float>(variance);
-  if (threadIdx.x == 0) {
-    s_variance = rsqrtf(variance / hidden_size + epsilon);
-  }
-  __syncthreads();
-
-  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    float x = (float)input[blockIdx.x * hidden_size + idx];
-    if constexpr (use_quant) {
-      out[blockIdx.x * hidden_size + idx] = float_to_int8_rn(
-          ((float)(x * s_variance)) * (float)(weight[idx]));
-    } else {
-      out[blockIdx.x * hidden_size + idx] =
-          ((scalar_t)(x * s_variance)) * weight[idx];
-    }
-  }
-}
+} // namespace vllm
 
 void rms_norm_general(torch::Tensor &out, // [..., hidden_size]
                      torch::Tensor &input, // [..., hidden_size]
                      torch::Tensor &weight, // [hidden_size]
+                      torch::Tensor &bias, // [hidden_size]
                      torch::Tensor &scaling, // [tokens] or [1]
                      float epsilon,
-                      bool use_per_token_quant) {
+                      bool use_per_token_quant = true) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;
  dim3 grid(num_tokens);
-  dim3 block(std::min(hidden_size, 1024));
+  dim3 block(std::min(hidden_size, 128)); // Reduce the probability of idle threads
  block.x = 32 * ((block.x + 31) / 32);
 
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@@ -240,7 +210,8 @@ void rms_norm_general(torch::Tensor &out, // [..., hidden_size]
    // per-token
    vllm::generalLayerNorm<T, at::Half><<<grid, block, 0, stream>>>(
        reinterpret_cast<T*>(input.data_ptr<scalar_t>()),
-        reinterpret_cast<T*>(weight.data_ptr<scalar_t>()), nullptr,
+        reinterpret_cast<T*>(weight.data_ptr<scalar_t>()),
+        reinterpret_cast<T*>(bias.data_ptr<scalar_t>()),
        nullptr, epsilon, num_tokens, hidden_size, nullptr, scaling.data_ptr<at::Half>(),
        out.data_ptr<int8_t>(), false
    );
@@ -258,4 +229,4 @@ void rms_norm_general(torch::Tensor &out, // [..., hidden_size]
        );
    }
  });
-}
+}
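Despite its name, rms_norm_general now drives generalLayerNorm: a full LayerNorm with gamma and the newly added beta, fused with per-token int8 quantization, and with USE_DIFF_OF_SQUARES defaulting to true so mean and variance come out of a single pass via Var[x] = E[x²] - E[x]². The following is a rough fp32 reference of the per-token branch only (an approximation; the kernel's fp16 arithmetic and float_to_int8_rn saturation will differ slightly, and the eps value is up to the caller).

# Reference sketch of fused LayerNorm + per-token int8 quantization
# (fp32 approximation of rms_norm_general's per-token branch).
import torch

def layernorm_quant_ref(x, gamma, beta, eps=1e-6):
    # x: [num_tokens, hidden]; gamma, beta: [hidden]
    y = torch.nn.functional.layer_norm(
        x.float(), (x.shape[-1],), gamma.float(), beta.float(), eps)
    amax = y.abs().amax(dim=-1, keepdim=True)   # per-token amax
    scale = amax / 127.0                        # written to the `scaling` tensor
    q = torch.clamp(torch.round(y / scale), -128, 127).to(torch.int8)
    return q, scale.squeeze(-1).half()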

awq/kernels/csrc/fused_layernorm/layernorm.h renamed to awq/kernels/csrc/w8a8/layernorm.h

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@
 void rms_norm_general(torch::Tensor &out, // [..., hidden_size]
                       torch::Tensor &input, // [..., hidden_size]
                       torch::Tensor &weight, // [hidden_size]
+                      torch::Tensor &bias, // [hidden_size]
                       torch::Tensor &scaling, // [tokens] or [1]
                       float epsilon,
                       bool use_per_token_quant);
